Skip to content

Series#

Each column in a Table is a Series. Series expose methods which invoke high-performance kernels for manipulation of a column of data.

Series #

Series()

A Daft Series is an array of data of a single type, and is usually a column in a DataFrame.

Methods:

Name Description
from_arrow

Construct a Series from a pyarrow array or chunked array.

from_numpy

Construct a Series from a NumPy ndarray.

from_pandas

Construct a Series from a pandas Series.

from_pylist

Construct a Series from a Python list.

to_arrow

Convert this Series to a pyarrow array.

to_pylist

Convert this Series to a Python list.

Source code in daft/series.py
25
26
def __init__(self) -> None:
    """Refuse direct construction of a Series.

    Calling the constructor is not supported; it unconditionally raises.

    Raises:
        NotImplementedError: Always.
    """
    message = "We do not support creating a Series via __init__ "
    raise NotImplementedError(message)

from_arrow #

from_arrow(array: Array | ChunkedArray, name: str = 'arrow_series', dtype: DataType | None = None) -> Series

Construct a Series from a pyarrow array or chunked array.

Parameters:

Name Type Description Default
array Array | ChunkedArray

The pyarrow (chunked) array whose data we wish to put in the Series.

required
name str

The name associated with the Series; this is usually the column name.

'arrow_series'
dtype DataType | None

The DataType to use for the Series. If not provided, Daft will infer the DataType from the data.

None
Source code in daft/series.py
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
@staticmethod
def from_arrow(
    array: pa.Array | pa.ChunkedArray, name: str = "arrow_series", dtype: DataType | None = None
) -> Series:
    """Construct a Series from a pyarrow array or chunked array.

    If the Arrow type of ``array`` is not natively supported, the data is converted to a
    Python list and stored as Python objects. Chunked arrays are combined into a single
    array before conversion.

    Args:
        array: The pyarrow (chunked) array whose data we wish to put in the Series.
        name: The name associated with the Series; this is usually the column name.
        dtype: The DataType to use for the Series. If not provided, Daft will infer the
            DataType from the data.

    Returns:
        A Series holding the data of ``array``.

    Raises:
        TypeError: If ``array`` is neither a pyarrow Array nor a pyarrow ChunkedArray.
    """
    _ensure_registered_super_ext_type()
    # Validate the input before touching `array.type`: probing that attribute on an
    # arbitrary object would surface as an AttributeError instead of the intended TypeError.
    if not isinstance(array, (pa.Array, pa.ChunkedArray)):
        raise TypeError(f"expected either PyArrow Array or Chunked Array, got {type(array)}")
    try:
        # Only used as a probe; the resulting DataType is discarded.
        DataType.from_arrow_type(array.type, python_fallback=False)
    except TypeError:
        # If the Arrow type is not natively supported, go through the Python list path.
        return Series.from_pylist(array.to_pylist(), name=name, pyobj="force")
    if isinstance(array, pa.Array):
        if isinstance(array.type, pa.FixedShapeTensorType):
            # Build the Series from the tensor's storage array, then cast it to the
            # requested dtype (or to the dtype derived from the tensor type).
            series = Series.from_arrow(array.storage, name=name)
            return series.cast(dtype or DataType.from_arrow_type(array.type))
        pys = PySeries.from_arrow(name, array, dtype=dtype._dtype if dtype else None)
        return Series._from_pyseries(pys)
    # Remaining case: a ChunkedArray. Collapse it into one contiguous array and recurse.
    arr_type = array.type
    if isinstance(arr_type, pa.BaseExtensionType):
        # Extension-typed chunks are cast to their storage type, combined, and then
        # re-wrapped in the extension type.
        combined_storage_array = array.cast(arr_type.storage_type).combine_chunks()
        combined_array = arr_type.wrap_array(combined_storage_array)
    else:
        combined_array = array.combine_chunks()
    return Series.from_arrow(combined_array, name=name, dtype=dtype)

from_numpy #

from_numpy(data: ndarray[Any, Any], name: str = 'numpy_series', dtype: DataType | None = None) -> Series

Construct a Series from a NumPy ndarray.

If the provided NumPy ndarray is 1-dimensional, Daft will attempt to store the ndarray in a pyarrow Array. If the ndarray has more than 1 dimension OR storing the 1D array in Arrow failed, Daft will store the ndarray data as a Python list of NumPy ndarrays.

Parameters:

Name Type Description Default
data ndarray[Any, Any]

The NumPy ndarray whose data we wish to put in the Series.

required
name str

The name associated with the Series; this is usually the column name.

'numpy_series'
dtype DataType | None

The DataType to use for the Series. If not provided, Daft will infer the DataType from the data.

None
Source code in daft/series.py
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
@classmethod
def from_numpy(
    cls, data: np.ndarray[Any, Any], name: str = "numpy_series", dtype: DataType | None = None
) -> Series:
    """Build a Series out of a NumPy ndarray.

    Arrays with at most one dimension are first offered to pyarrow. When pyarrow rejects
    the array with ``ArrowInvalid``, or when the array has more than one dimension, the
    data is stored as a Python list obtained by iterating over the ndarray.

    Args:
        data: The NumPy ndarray whose data we wish to put in the Series.
        name: The name associated with the Series; this is usually the column name.
        dtype: The DataType to use for the Series. If not provided, Daft will infer the
            DataType from the data.

    Raises:
        TypeError: If ``data`` is not a NumPy ndarray.
    """
    if not isinstance(data, np.ndarray):
        raise TypeError(f"Expected a NumPy ndarray, got {type(data)}")

    if data.ndim <= 1:
        as_arrow = None
        try:
            as_arrow = pa.array(data)
        except pa.ArrowInvalid:
            # Arrow could not represent this array; use the list path below.
            pass
        if as_arrow is not None:
            return cls.from_arrow(as_arrow, name=name, dtype=dtype)

    # TODO(Clark): Represent the tensor series with an Arrow extension type in order
    # to keep the series data contiguous.
    elements = list(data)
    return cls.from_pylist(elements, name=name, dtype=dtype)

from_pandas #

from_pandas(data: Series[Any], name: str = 'pd_series', dtype: DataType | None = None) -> Series

Construct a Series from a pandas Series.

This will first try to convert the series into a pyarrow array, then will fall back to converting the series to a NumPy ndarray and going through that construction path, and will finally fall back to converting the series to a Python list and going through that path.

Parameters:

Name Type Description Default
data Series[Any]

The pandas Series whose data we wish to put in the Daft Series.

required
name str

The name associated with the Series; this is usually the column name.

'pd_series'
dtype DataType | None

The DataType to use for the Series. If not provided, Daft will infer the DataType from the data.

None
Source code in daft/series.py
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
@classmethod
def from_pandas(cls, data: pd.Series[Any], name: str = "pd_series", dtype: DataType | None = None) -> Series:
    """Build a Daft Series out of a pandas Series.

    Three conversion paths are attempted in order: pyarrow, then NumPy, then a plain
    Python list. Each later path is used only when the previous one fails.

    Args:
        data: The pandas Series whose data we wish to put in the Daft Series.
        name: The name associated with the Series; this is usually the column name.
        dtype: The DataType to use for the Series. If not provided, Daft will infer the
            DataType from the data (the final list path defaults to the Python type).

    Raises:
        TypeError: If ``data`` is not a pandas Series.
    """
    if not isinstance(data, pd.Series):
        raise TypeError(f"expected a pandas Series, got {type(data)}")

    # Path 1: Arrow. Only the pandas -> Arrow conversion is guarded; errors raised while
    # building the Daft Series from the Arrow array propagate to the caller.
    as_arrow = None
    try:
        as_arrow = pa.Array.from_pandas(data)
    except pa.ArrowInvalid:
        pass
    if as_arrow is not None:
        return cls.from_arrow(as_arrow, name=name, dtype=dtype)

    # Path 2: NumPy. Any failure here is deliberately swallowed so that we can still try
    # the list path, since pd.Series.to_list() preserves more type information for types
    # that are not natively representable in Python.
    try:
        return cls.from_numpy(data.to_numpy(), name=name, dtype=dtype)
    except Exception:
        pass

    # Path 3: Python list.
    # NOTE: For element types that don't have a native Python representation,
    # a Pandas scalar object will be returned.
    values = data.to_list()
    list_dtype = dtype if dtype else DataType.python()
    return cls.from_pylist(values, name=name, dtype=list_dtype)

from_pylist #

from_pylist(data: list[Any], name: str = 'list_series', dtype: DataType | None = None, pyobj: Literal['allow', 'disallow', 'force'] = 'allow') -> Series

Construct a Series from a Python list.

If dtype is not defined, then the resulting type depends on the setting of pyobj: - "allow": Daft-native types if possible, else PyObject; - "disallow": Daft-native types only, raising error if not convertible; - "force": Always store as PyObject types. Equivalent to dtype=daft.DataType.python().

Parameters:

Name Type Description Default
data list[Any]

The Python list whose data we wish to put in the Series.

required
name str

The name associated with the Series; this is usually the column name.

'list_series'
dtype DataType | None

The DataType to use for the Series. If not provided, Daft will infer the DataType from the data.

None
pyobj Literal['allow', 'disallow', 'force']

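How to treat data that may not have a Daft-native representation: "allow" uses Daft-native types when possible and otherwise falls back to Python objects, "disallow" raises an error instead of falling back, and "force" always stores the data as Python objects. Default is "allow".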

'allow'
Source code in daft/series.py
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
@staticmethod
def from_pylist(
    data: list[Any],
    name: str = "list_series",
    dtype: DataType | None = None,
    pyobj: Literal["allow", "disallow", "force"] = "allow",
) -> Series:
    """Construct a Series from a Python list.

    If `dtype` is not defined, then the resulting type depends on the setting of `pyobj`:
        - ``"allow"``: Daft-native types if possible, else PyObject;
        - ``"disallow"``: Daft-native types only, raising error if not convertible;
        - ``"force"``: Always store as PyObject types. Equivalent to `dtype=daft.DataType.python()`.

    Note that ``pyobj="force"`` takes precedence over an explicitly passed `dtype`.

    Args:
        data: The Python list whose data we wish to put in the Series.
        name: The name associated with the Series; this is usually the column name.
        dtype: The DataType to use for the Series. If not provided, Daft will infer the
            DataType from the data.
        pyobj: How to handle the Python-object representation: ``"allow"`` falling back
            to it, ``"disallow"`` falling back to it, or ``"force"`` it unconditionally.
            Default is ``"allow"``.

    Returns:
        A Series holding the elements of ``data``.

    Raises:
        TypeError: If ``data`` is not a list, or if ``pyobj="disallow"`` and the resulting
            Series has the Python datatype.
        ValueError: If ``pyobj`` is not one of ``"allow"``, ``"disallow"`` or ``"force"``.
    """
    if not isinstance(data, list):
        raise TypeError(f"expected a python list, got {type(data)}")

    if pyobj not in {"allow", "disallow", "force"}:
        raise ValueError(f"pyobj: expected either 'allow', 'disallow', or 'force', but got {pyobj}")

    if pyobj == "force":
        # Forcing Python objects overrides whatever dtype the caller supplied.
        dtype = DataType.python()

    pys = PySeries.from_pylist(data, name, None if dtype is None else dtype._dtype)
    series = Series._from_pyseries(pys)

    # The check happens after construction because the resulting datatype is only known
    # once the Series has been built.
    if pyobj == "disallow" and series.datatype().is_python():
        raise TypeError("Could not convert Python list to a Daft-native type, and pyobj='disallow' was set.")

    return series

to_arrow #

to_arrow() -> Array

Convert this Series to a pyarrow array.

Source code in daft/series.py
216
217
218
219
def to_arrow(self) -> pa.Array:
    """Return the contents of this Series as a pyarrow array."""
    # Presumably ensures Daft's Arrow extension type is registered before the
    # conversion; the helper is defined elsewhere in this file, so verify there.
    _ensure_registered_super_ext_type()
    inner = self._series
    return inner.to_arrow()

to_pylist #

to_pylist(maps_as_pydicts: Literal['lossy', 'strict'] | None = None) -> list[Any]

Convert this Series to a Python list.

Parameters:

Name Type Description Default
maps_as_pydicts Literal['lossy', 'strict'] | None

If None (default), Map values are converted to association lists (list[tuple[key, value]]) preserving order and duplicates. If "lossy" or "strict", Map values are converted to Python dicts. "lossy" keeps the last value for duplicate keys and warns. "strict" raises on duplicate keys.

None
Source code in daft/series.py
227
228
229
230
231
232
233
234
235
236
237
def to_pylist(self, maps_as_pydicts: Literal["lossy", "strict"] | None = None) -> list[Any]:
    """Return the contents of this Series as a Python list.

    The conversion is delegated to the underlying series object.

    Args:
        maps_as_pydicts: Controls how Map values are converted.
            With None (default) they become association lists
            (`list[tuple[key, value]]`), preserving order and duplicates.
            With `"lossy"` or `"strict"` they become Python dicts:
            `"lossy"` keeps the last value for duplicate keys and warns, while
            `"strict"` raises on duplicate keys.

    Returns:
        A Python list with the values of this Series.
    """
    underlying = self._series
    return underlying.to_pylist(maps_as_pydicts)