Skip to content

daft.functions.levenshtein_distance#

levenshtein_distance #

levenshtein_distance(left: Expression, right: Expression) -> Expression

Compute the Levenshtein edit distance between two strings.

The Levenshtein distance is the minimum number of single-character insertions, deletions, or substitutions required to transform one string into the other.

Parameters:

Name Type Description Default
left Expression

The left string expression to compare.

required
right Expression

The right string expression to compare against.

required

Returns:

Type Description
Expression

The Levenshtein distance for each pair of strings. Returns null when either input is null.

Examples:

1
2
3
4
5
>>> import daft
>>> from daft.functions import levenshtein_distance
>>> df = daft.from_pydict({"x": ["kitten", "saturday", ""], "y": ["sitting", "sunday", "abc"]})
>>> df = df.with_column("distance", levenshtein_distance(df["x"], df["y"]))
>>> df.collect()
╭──────────┬─────────┬──────────╮
│ x        ┆ y       ┆ distance │
│ ---      ┆ ---     ┆ ---      │
│ String   ┆ String  ┆ Int64    │
╞══════════╪═════════╪══════════╡
│ kitten   ┆ sitting ┆ 3        │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ saturday ┆ sunday  ┆ 3        │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│          ┆ abc     ┆ 3        │
╰──────────┴─────────┴──────────╯
(Showing first 3 of 3 rows)
Source code in daft/functions/str.py
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
def levenshtein_distance(left: Expression, right: Expression) -> Expression:
    """Compute the Levenshtein edit distance between two strings.

    The Levenshtein distance is the minimum number of single-character insertions,
    deletions, or substitutions required to transform one string into the other.

    Args:
        left: The left string expression to compare.
        right: The right string expression to compare against.

    Returns:
        The Levenshtein distance for each pair of strings. Returns null when either
            input is null.

    Examples:
        >>> import daft
        >>> from daft.functions import levenshtein_distance
        >>> df = daft.from_pydict({"x": ["kitten", "saturday", ""], "y": ["sitting", "sunday", "abc"]})
        >>> df = df.with_column("distance", levenshtein_distance(df["x"], df["y"]))
        >>> df.collect()
        ╭──────────┬─────────┬──────────╮
        │ x        ┆ y       ┆ distance │
        │ ---      ┆ ---     ┆ ---      │
        │ String   ┆ String  ┆ Int64    │
        ╞══════════╪═════════╪══════════╡
        │ kitten   ┆ sitting ┆ 3        │
        ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
        │ saturday ┆ sunday  ┆ 3        │
        ├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
        │          ┆ abc     ┆ 3        │
        ╰──────────┴─────────┴──────────╯
        <BLANKLINE>
        (Showing first 3 of 3 rows)
    """
    # NOTE: the continuation line in ``Returns:`` above is deliberately indented one
    # extra level; at the same indent the docs generator renders it as a second,
    # spurious return entry.
    # Dispatch to the builtin scalar function named "levenshtein_distance".
    return Expression._call_builtin_scalar_fn("levenshtein_distance", left, right)