Calculates the approximate number of non-NULL distinct values in the expression.
Approximation is performed using the HyperLogLog algorithm.
Examples:
A global calculation of the approximate number of distinct values in a column (the NULL entry is not counted):
>>> import daft
>>> from daft.functions import approx_count_distinct
>>>
>>> df = daft.from_pydict({"values": [1, 2, 3, None]})
>>> df = df.agg(
... approx_count_distinct(df["values"]).alias("distinct_values"),
... )
>>> df.show()
|
╭─────────────────╮
│ distinct_values │
│ --- │
│ UInt64 │
╞═════════════════╡
│ 3 │
╰─────────────────╯
(Showing first 1 of 1 rows)
Source code in daft/functions/agg.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
def approx_count_distinct(expr: Expression) -> Expression:
    """Estimate how many distinct, non-`NULL` values the expression contains.

    The estimate is computed with the
    [HyperLogLog](https://en.wikipedia.org/wiki/HyperLogLog) algorithm, so the
    result is approximate rather than exact.

    Args:
        expr: The expression whose distinct values are counted.

    Returns:
        An expression built from the underlying `approx_count_distinct` call.

    Examples:
        A global approximate distinct count over a column; the `None` entry
        is not counted:

        >>> import daft
        >>> from daft.functions import approx_count_distinct
        >>>
        >>> df = daft.from_pydict({"values": [1, 2, 3, None]})
        >>> df = df.agg(
        ...     approx_count_distinct(df["values"]).alias("distinct_values"),
        ... )
        >>> df.show()
        ╭─────────────────╮
        │ distinct_values │
        │ ---             │
        │ UInt64          │
        ╞═════════════════╡
        │ 3               │
        ╰─────────────────╯
        <BLANKLINE>
        (Showing first 1 of 1 rows)
    """
    # Build the aggregation on the wrapped native expression first, then lift
    # it back into the public Expression type.
    counted = expr._expr.approx_count_distinct()
    return Expression._from_pyexpr(counted)
|