daft.functions.jaro_winkler_similarity

jaro_winkler_similarity

jaro_winkler_similarity(left: Expression, right: Expression) -> Expression

Compute the Jaro-Winkler similarity between two strings.

This is the Jaro similarity with a prefix bonus for strings sharing a common prefix (up to 4 characters). Returns a value between 0.0 (no similarity) and 1.0 (identical strings).

Parameters:

Name	Type	Description	Default
`left`	`Expression`	The left string expression to compare.	required
`right`	`Expression`	The right string expression to compare against.	required

Returns:

Type	Description
`Expression`	The Jaro-Winkler similarity (0.0 to 1.0) for each pair of strings. Returns null when either input is null.

Examples:

>>> import daft
>>> from daft.functions import jaro_winkler_similarity
>>> df = daft.from_pydict({"x": ["martha", "dwayne", "dixon"], "y": ["marhta", "duane", "dicksonx"]})
>>> df = df.with_column("similarity", jaro_winkler_similarity(df["x"], df["y"]))
>>> df.collect()

╭────────┬──────────┬────────────────────╮
│ x      ┆ y        ┆ similarity         │
│ ---    ┆ ---      ┆ ---                │
│ String ┆ String   ┆ Float64            │
╞════════╪══════════╪════════════════════╡
│ martha ┆ marhta   ┆ 0.9611111111111111 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ dwayne ┆ duane    ┆ 0.8400000000000001 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ dixon  ┆ dicksonx ┆ 0.8133333333333332 │
╰────────┴──────────┴────────────────────╯
(Showing first 3 of 3 rows)

Source code in daft/functions/str.py

def jaro_winkler_similarity(left: Expression, right: Expression) -> Expression:
    """Score how alike two string expressions are using the Jaro-Winkler metric.

    Jaro-Winkler starts from the Jaro similarity and adds a bonus when both
    strings begin with the same characters (the shared prefix counts for up to
    4 characters). Scores range from 0.0 (no similarity) to 1.0 (identical
    strings).

    Args:
        left: The left string expression to compare.
        right: The right string expression to compare against.

    Returns:
        An expression holding the Jaro-Winkler similarity (0.0 to 1.0) of each
        pair of strings. The result is null when either input is null.

    Examples:
        >>> import daft
        >>> from daft.functions import jaro_winkler_similarity
        >>> df = daft.from_pydict({"x": ["martha", "dwayne", "dixon"], "y": ["marhta", "duane", "dicksonx"]})
        >>> df = df.with_column("similarity", jaro_winkler_similarity(df["x"], df["y"]))
        >>> df.collect()
        ╭────────┬──────────┬────────────────────╮
        │ x      ┆ y        ┆ similarity         │
        │ ---    ┆ ---      ┆ ---                │
        │ String ┆ String   ┆ Float64            │
        ╞════════╪══════════╪════════════════════╡
        │ martha ┆ marhta   ┆ 0.9611111111111111 │
        ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ dwayne ┆ duane    ┆ 0.8400000000000001 │
        ├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ dixon  ┆ dicksonx ┆ 0.8133333333333332 │
        ╰────────┴──────────┴────────────────────╯
        <BLANKLINE>
        (Showing first 3 of 3 rows)
    """
    # Forward both operands, in order, to the builtin scalar function of the same name.
    builtin_name = "jaro_winkler_similarity"
    operands = (left, right)
    return Expression._call_builtin_scalar_fn(builtin_name, *operands)