daft.functions.jaro_similarity#

jaro_similarity #

jaro_similarity(left: Expression, right: Expression) -> Expression

Compute the Jaro similarity between two strings.

The Jaro similarity is a measure of similarity between two strings, based on matching characters and transpositions. Returns a value between 0.0 (no similarity) and 1.0 (identical strings).

Parameters:

Name	Type	Description	Default
`left`	`Expression`	The left string expression to compare.	required
`right`	`Expression`	The right string expression to compare against.	required

Returns:

Type	Description
`Expression`	The Jaro similarity (0.0 to 1.0) for each pair of strings. Returns null when either input is null.

Examples:

>>> import daft
>>> from daft.functions import jaro_similarity
>>> df = daft.from_pydict({"x": ["martha", "dwayne", "dixon"], "y": ["marhta", "duane", "dicksonx"]})
>>> df = df.with_column("similarity", jaro_similarity(df["x"], df["y"]))
>>> df.collect()

╭────────┬──────────┬────────────────────╮
│ x      ┆ y        ┆ similarity         │
│ ---    ┆ ---      ┆ ---                │
│ String ┆ String   ┆ Float64            │
╞════════╪══════════╪════════════════════╡
│ martha ┆ marhta   ┆ 0.9444444444444445 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ dwayne ┆ duane    ┆ 0.8222222222222223 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ dixon  ┆ dicksonx ┆ 0.7666666666666666 │
╰────────┴──────────┴────────────────────╯
(Showing first 3 of 3 rows)

Source code in daft/functions/str.py

def jaro_similarity(left: Expression, right: Expression) -> Expression:
    """Score how alike two string expressions are using the Jaro metric.

    Jaro similarity rates a pair of strings by the characters they share and
    by how many of those shared characters are transposed. Scores run from 0.0
    (no similarity) up to 1.0 (identical strings).

    Args:
        left: String expression on the left-hand side of the comparison.
        right: String expression that ``left`` is compared against.

    Returns:
        An expression yielding the Jaro similarity (0.0 to 1.0) of each pair of
        strings. The result is null when either input is null.

    Examples:
        >>> import daft
        >>> from daft.functions import jaro_similarity
        >>> df = daft.from_pydict({"x": ["martha", "dwayne", "dixon"], "y": ["marhta", "duane", "dicksonx"]})
        >>> df = df.with_column("similarity", jaro_similarity(df["x"], df["y"]))
        >>> df.collect()
        ╭────────┬──────────┬────────────────────╮
        │ x      ┆ y        ┆ similarity         │
        │ ---    ┆ ---      ┆ ---                │
        │ String ┆ String   ┆ Float64            │
        ╞════════╪══════════╪════════════════════╡
        │ martha ┆ marhta   ┆ 0.9444444444444445 │
        ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ dwayne ┆ duane    ┆ 0.8222222222222223 │
        ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ dixon  ┆ dicksonx ┆ 0.7666666666666666 │
        ╰────────┴──────────┴────────────────────╯
        <BLANKLINE>
        (Showing first 3 of 3 rows)
    """
    # No computation happens here: the call is forwarded to the builtin scalar
    # function with this name, passing the operands in (left, right) order.
    builtin_name = "jaro_similarity"
    return Expression._call_builtin_scalar_fn(builtin_name, left, right)