Skip to content

daft.functions.approx_count_distinct#

approx_count_distinct #

approx_count_distinct(expr: Expression) -> Expression

Calculates the approximate number of non-NULL distinct values in the expression.

Approximation is performed using the HyperLogLog algorithm.

Examples:

A global calculation of the approximate number of distinct non-NULL values in a column that also contains a NULL:

1
2
3
4
5
6
7
8
>>> import daft
>>> from daft.functions import approx_count_distinct
>>>
>>> df = daft.from_pydict({"values": [1, 2, 3, None]})
>>> df = df.agg(
...     approx_count_distinct(df["values"]).alias("distinct_values"),
... )
>>> df.show()
╭─────────────────╮
│ distinct_values │
│ ---             │
│ UInt64          │
╞═════════════════╡
│ 3               │
╰─────────────────╯
(Showing first 1 of 1 rows)
Source code in daft/functions/agg.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
def approx_count_distinct(expr: Expression) -> Expression:
    """Estimate how many distinct non-`NULL` values the expression contains.

    The estimate is produced with the
    [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog) algorithm, so the
    result is approximate rather than exact.

    Args:
        expr: The expression whose distinct values should be counted.

    Returns:
        A new `Expression` built from the underlying expression's
        `approx_count_distinct()` result.

    Examples:
        Approximate distinct count over a whole column that also holds a NULL
        (the NULL is not counted):

        >>> import daft
        >>> from daft.functions import approx_count_distinct
        >>>
        >>> df = daft.from_pydict({"values": [1, 2, 3, None]})
        >>> df = df.agg(
        ...     approx_count_distinct(df["values"]).alias("distinct_values"),
        ... )
        >>> df.show()
        ╭─────────────────╮
        │ distinct_values │
        │ ---             │
        │ UInt64          │
        ╞═════════════════╡
        │ 3               │
        ╰─────────────────╯
        <BLANKLINE>
        (Showing first 1 of 1 rows)
    """
    # Delegate to the wrapped native expression, then re-wrap the result so
    # callers keep working with the public Expression type.
    distinct_count_pyexpr = expr._expr.approx_count_distinct()
    return Expression._from_pyexpr(distinct_count_pyexpr)