Datasets#

Daft provides simple, performant, and responsible ways to access useful datasets like Common Crawl and DROID.

Common Crawl#

Check out our Common Crawl dataset guide for more examples!

common_crawl #

common_crawl(crawl: str, segment: str | None = None, content: Literal['raw', 'text', 'metadata', 'warc', 'wet', 'wat'] = 'raw', num_files: int | None = None, io_config: IOConfig | None = None, *, in_aws: bool = False, source: Literal['s3', 'hf', 'http'] | None = None) -> DataFrame

Load Common Crawl data as a DataFrame.

This function automatically resolves the specified crawl and segment into the appropriate Common Crawl files and loads them as a DataFrame, handling the WARC reading process internally.

Parameters:

Name	Type	Description	Default
`crawl`	`str`	The crawl identifier, e.g. "CC-MAIN-2025-33".	required
`segment`	`str \| None`	Specific segment to fetch within the crawl. If not provided, defaults to all segments in the crawl.	`None`
`content`	`Literal['raw', 'text', 'metadata', 'warc', 'wet', 'wat']`	Specifies the type of content to load. Options are: + "raw" or "warc": Raw WARC files containing full HTTP responses + "text" or "wet": Extracted plain text content + "metadata" or "wat": Metadata about crawled pages	`'raw'`
`num_files`	`int \| None`	Limit the number of files to process. If not provided, processes all matching files.	`None`
`io_config`	`IOConfig \| None`	IO configuration for accessing storage.	`None`
`in_aws`	`bool`	DEPRECATED - please use `source="s3"` instead. Fetch from AWS S3 (default: `s3://commoncrawl/...`). If running in AWS, set to `True` for optimal performance. Set to `False` when running outside AWS to avoid S3 egress fees. If running in AWS, make sure you're in the "us-east-1" region.	`False`
`source`	`Literal['s3', 'hf', 'http'] \| None`	Source of the Common Crawl data. Options are: + "s3": AWS S3 + "hf": HuggingFace + "http": HTTP + None: Automatically chooses HuggingFace if the crawl is available, otherwise uses HTTP. S3 is never chosen automatically because of S3 egress fees; it must be requested explicitly.	`None`

Returns:

Type	Description
`DataFrame`	A DataFrame containing the requested Common Crawl data.

Examples:

>>> # Create a dataframe from raw WARC data from a specific crawl
>>> daft.datasets.common_crawl("CC-MAIN-2025-33")

╭────────────────┬─────────────────┬───────────┬────────────────────┬────────────┬────────────────────┬──────────────┬──────────────╮
│ WARC-Record-ID ┆ WARC-Target-URI ┆ WARC-Type ┆ WARC-Date          ┆      …     ┆ WARC-Identified-Pa ┆ warc_content ┆ warc_headers │
│ ---            ┆ ---             ┆ ---       ┆ ---                ┆            ┆ yload-Type         ┆ ---          ┆ ---          │
│ String         ┆ String          ┆ String    ┆ Timestamp[ns,      ┆ (1 hidden) ┆ ---                ┆ Binary       ┆ String       │
│                ┆                 ┆           ┆ "Etc/UTC"]         ┆            ┆ String             ┆              ┆              │
╰────────────────┴─────────────────┴───────────┴────────────────────┴────────────┴────────────────────┴──────────────┴──────────────╯
(No data to display: Dataframe not materialized, use .collect() to materialize)

>>> # Show a sample of extracted text content
>>> daft.datasets.common_crawl("CC-MAIN-2025-33", content="text").limit(2).show()

╭─────────────────┬─────────────────┬────────────┬─────────────────┬────────────┬─────────────────┬────────────────┬────────────────╮
│ WARC-Record-ID  ┆ WARC-Target-URI ┆ WARC-Type  ┆ WARC-Date       ┆      …     ┆ WARC-Identified ┆ warc_content   ┆ warc_headers   │
│ ---             ┆ ---             ┆ ---        ┆ ---             ┆            ┆ -Payload-Type   ┆ ---            ┆ ---            │
│ String          ┆ String          ┆ String     ┆ Timestamp[ns    ┆ (1 hidden) ┆ ---             ┆ Binary         ┆ String         │
│                 ┆                 ┆            ┆ "Etc/UTC"]      ┆            ┆ String          ┆                ┆                │
╞═════════════════╪═════════════════╪════════════╪═════════════════╪════════════╪═════════════════╪════════════════╪════════════════╡
│ 0cb039e8-d357-4 ┆ None            ┆ warcinfo   ┆ 2025-08-16      ┆ …          ┆ None            ┆ b"Software-Inf ┆ {"Content-Type │
│ 85f-95dd-cdfdb… ┆                 ┆            ┆ 01:03:20 UTC    ┆            ┆                 ┆ o:             ┆ ":"application │
│                 ┆                 ┆            ┆                 ┆            ┆                 ┆ ia-web-commo…  ┆ /…             │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ af55e6ef-eeda-4 ┆ http://010ganji ┆ conversion ┆ 2025-08-02      ┆ …          ┆ None            ┆ b"ETF\xe9\x80\ ┆ {"Content-Type │
│ bf7-a599-581bc… ┆ .com/html/ying… ┆            ┆ 23:06:24 UTC    ┆            ┆                 ┆ x89\xe6\x8b\xa ┆ ":"text/plain" │
│                 ┆                 ┆            ┆                 ┆            ┆                 ┆ 9…             ┆ ,…             │
╰─────────────────┴─────────────────┴────────────┴─────────────────┴────────────┴─────────────────┴────────────────┴────────────────╯
(Showing first 2 of 2 rows)

>>> # Sample a single file from a specific segment in a crawl for testing
>>> (
...     daft.datasets.common_crawl("CC-MAIN-2025-33", segment="1754151579063.98", num_files=1).limit(2).show()
... )

╭─────────────────┬─────────────────┬───────────┬─────────────────┬────────────┬─────────────────┬─────────────────┬────────────────╮
│ WARC-Record-ID  ┆ WARC-Target-URI ┆ WARC-Type ┆ WARC-Date       ┆      …     ┆ WARC-Identified ┆ warc_content    ┆ warc_headers   │
│ ---             ┆ ---             ┆ ---       ┆ ---             ┆            ┆ -Payload-Type   ┆ ---             ┆ ---            │
│ String          ┆ String          ┆ String    ┆ Timestamp[ns    ┆ (1 hidden) ┆ ---             ┆ Binary          ┆ String         │
│                 ┆                 ┆           ┆ "Etc/UTC"]      ┆            ┆ String          ┆                 ┆                │
╞═════════════════╪═════════════════╪═══════════╪═════════════════╪════════════╪═════════════════╪═════════════════╪════════════════╡
│ b6238b9c-8db0-4 ┆ None            ┆ warcinfo  ┆ 2025-08-15      ┆ …          ┆ None            ┆ b"isPartOf: CC- ┆ {"Content-Type │
│ 5ac-a6ef-c3cb0… ┆                 ┆           ┆ 20:42:38 UTC    ┆            ┆                 ┆ MAIN-2025-33\r… ┆ ":"application │
│                 ┆                 ┆           ┆                 ┆            ┆                 ┆                 ┆ /…             │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b29da11b-5976-4 ┆ http://0.woxav. ┆ request   ┆ 2025-08-15      ┆ …          ┆ None            ┆ b"GET /forum-12 ┆ {"Content-Type │
│ f3b-82c4-71fdd… ┆ com/forum-120-… ┆           ┆ 22:33:40 UTC    ┆            ┆                 ┆ 0-1.html HTTP/… ┆ ":"application │
│                 ┆                 ┆           ┆                 ┆            ┆                 ┆                 ┆ /…             │
╰─────────────────┴─────────────────┴───────────┴─────────────────┴────────────┴─────────────────┴─────────────────┴────────────────╯
(Showing first 2 of 2 rows)

Source code in daft/datasets/common_crawl.py

def common_crawl(
    crawl: str,
    segment: str | None = None,
    content: Literal["raw", "text", "metadata", "warc", "wet", "wat"] = "raw",
    num_files: int | None = None,
    io_config: IOConfig | None = None,
    *,
    in_aws: bool = False,
    source: Literal["s3", "hf", "http"] | None = None,
) -> DataFrame:
    r"""Load Common Crawl data as a DataFrame.

    This function automatically resolves the specified crawl and segment into the appropriate Common Crawl files
    and loads them as a DataFrame, handling the WARC reading process internally.

    Args:
        crawl: The crawl identifier, e.g. "CC-MAIN-2025-33".
        segment: Specific segment to fetch within the crawl. If not provided, defaults to all segments in the crawl.
        content: Specifies the type of content to load. Options are:
            + "raw" or "warc": Raw WARC files containing full HTTP responses
            + "text" or "wet": Extracted plain text content
            + "metadata" or "wat": Metadata about crawled pages
        num_files: Limit the number of files to process. If not provided, processes all matching files.
        io_config: IO configuration for accessing storage.
        in_aws: DEPRECATED - please use ``source="s3"`` instead.
            Fetch from AWS S3 (default: ``s3://commoncrawl/...``). If running in AWS, set to ``True`` for optimal
            performance. Set to ``False`` when running outside AWS to avoid S3 egress fees.
            If running in AWS, make sure you're in the "us-east-1" region.
            When ``True`` it takes precedence over ``source`` and the data is read from S3.
        source: Source of the Common Crawl data. Options are:
            + "s3": AWS S3
            + "hf": HuggingFace
            + "http": HTTP
            + None: Automatically chooses HuggingFace if the crawl is available, otherwise uses HTTP. S3 is never
            chosen automatically because of S3 egress fees; it must be requested explicitly.

    Returns:
        A DataFrame containing the requested Common Crawl data.

    Raises:
        ValueError: If ``num_files`` is given but is not a positive integer, or if ``content`` is not one of the
            supported content types.

    Examples:
        >>> # Create a dataframe from raw WARC data from a specific crawl
        >>> daft.datasets.common_crawl("CC-MAIN-2025-33")  # doctest: +SKIP
        ╭────────────────┬─────────────────┬───────────┬────────────────────┬────────────┬────────────────────┬──────────────┬──────────────╮
        │ WARC-Record-ID ┆ WARC-Target-URI ┆ WARC-Type ┆ WARC-Date          ┆      …     ┆ WARC-Identified-Pa ┆ warc_content ┆ warc_headers │
        │ ---            ┆ ---             ┆ ---       ┆ ---                ┆            ┆ yload-Type         ┆ ---          ┆ ---          │
        │ String         ┆ String          ┆ String    ┆ Timestamp[ns,      ┆ (1 hidden) ┆ ---                ┆ Binary       ┆ String       │
        │                ┆                 ┆           ┆ "Etc/UTC"]         ┆            ┆ String             ┆              ┆              │
        ╰────────────────┴─────────────────┴───────────┴────────────────────┴────────────┴────────────────────┴──────────────┴──────────────╯
        <BLANKLINE>
        (No data to display: Dataframe not materialized, use .collect() to materialize)

        >>> # Show a sample of extracted text content
        >>> daft.datasets.common_crawl("CC-MAIN-2025-33", content="text").limit(2).show()  # doctest: +SKIP
        ╭─────────────────┬─────────────────┬────────────┬─────────────────┬────────────┬─────────────────┬────────────────┬────────────────╮
        │ WARC-Record-ID  ┆ WARC-Target-URI ┆ WARC-Type  ┆ WARC-Date       ┆      …     ┆ WARC-Identified ┆ warc_content   ┆ warc_headers   │
        │ ---             ┆ ---             ┆ ---        ┆ ---             ┆            ┆ -Payload-Type   ┆ ---            ┆ ---            │
        │ String          ┆ String          ┆ String     ┆ Timestamp[ns    ┆ (1 hidden) ┆ ---             ┆ Binary         ┆ String         │
        │                 ┆                 ┆            ┆ "Etc/UTC"]      ┆            ┆ String          ┆                ┆                │
        ╞═════════════════╪═════════════════╪════════════╪═════════════════╪════════════╪═════════════════╪════════════════╪════════════════╡
        │ 0cb039e8-d357-4 ┆ None            ┆ warcinfo   ┆ 2025-08-16      ┆ …          ┆ None            ┆ b"Software-Inf ┆ {"Content-Type │
        │ 85f-95dd-cdfdb… ┆                 ┆            ┆ 01:03:20 UTC    ┆            ┆                 ┆ o:             ┆ ":"application │
        │                 ┆                 ┆            ┆                 ┆            ┆                 ┆ ia-web-commo…  ┆ /…             │
        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ af55e6ef-eeda-4 ┆ http://010ganji ┆ conversion ┆ 2025-08-02      ┆ …          ┆ None            ┆ b"ETF\xe9\x80\ ┆ {"Content-Type │
        │ bf7-a599-581bc… ┆ .com/html/ying… ┆            ┆ 23:06:24 UTC    ┆            ┆                 ┆ x89\xe6\x8b\xa ┆ ":"text/plain" │
        │                 ┆                 ┆            ┆                 ┆            ┆                 ┆ 9…             ┆ ,…             │
        ╰─────────────────┴─────────────────┴────────────┴─────────────────┴────────────┴─────────────────┴────────────────┴────────────────╯
        <BLANKLINE>
        (Showing first 2 of 2 rows)

        >>> # Sample a single file from a specific segment in a crawl for testing
        >>> (
        ...     daft.datasets.common_crawl("CC-MAIN-2025-33", segment="1754151579063.98", num_files=1).limit(2).show()
        ... )  # doctest: +SKIP
        ╭─────────────────┬─────────────────┬───────────┬─────────────────┬────────────┬─────────────────┬─────────────────┬────────────────╮
        │ WARC-Record-ID  ┆ WARC-Target-URI ┆ WARC-Type ┆ WARC-Date       ┆      …     ┆ WARC-Identified ┆ warc_content    ┆ warc_headers   │
        │ ---             ┆ ---             ┆ ---       ┆ ---             ┆            ┆ -Payload-Type   ┆ ---             ┆ ---            │
        │ String          ┆ String          ┆ String    ┆ Timestamp[ns    ┆ (1 hidden) ┆ ---             ┆ Binary          ┆ String         │
        │                 ┆                 ┆           ┆ "Etc/UTC"]      ┆            ┆ String          ┆                 ┆                │
        ╞═════════════════╪═════════════════╪═══════════╪═════════════════╪════════════╪═════════════════╪═════════════════╪════════════════╡
        │ b6238b9c-8db0-4 ┆ None            ┆ warcinfo  ┆ 2025-08-15      ┆ …          ┆ None            ┆ b"isPartOf: CC- ┆ {"Content-Type │
        │ 5ac-a6ef-c3cb0… ┆                 ┆           ┆ 20:42:38 UTC    ┆            ┆                 ┆ MAIN-2025-33\r… ┆ ":"application │
        │                 ┆                 ┆           ┆                 ┆            ┆                 ┆                 ┆ /…             │
        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ b29da11b-5976-4 ┆ http://0.woxav. ┆ request   ┆ 2025-08-15      ┆ …          ┆ None            ┆ b"GET /forum-12 ┆ {"Content-Type │
        │ f3b-82c4-71fdd… ┆ com/forum-120-… ┆           ┆ 22:33:40 UTC    ┆            ┆                 ┆ 0-1.html HTTP/… ┆ ":"application │
        │                 ┆                 ┆           ┆                 ┆            ┆                 ┆                 ┆ /…             │
        ╰─────────────────┴─────────────────┴───────────┴─────────────────┴────────────┴─────────────────┴─────────────────┴────────────────╯
        <BLANKLINE>
        (Showing first 2 of 2 rows)
    """
    # Validate cheap arguments first so bad input fails before any path resolution happens.
    if num_files is not None and num_files <= 0:
        raise ValueError("num_files must be a positive integer")

    # User-facing content names and the file-type names are both accepted; each maps to a file type.
    content_type_map: dict[str, Literal["warc", "wet", "wat"]] = {
        "raw": "warc",
        "text": "wet",
        "metadata": "wat",
        "warc": "warc",
        "wet": "wet",
        "wat": "wat",
    }
    if content not in content_type_map:
        raise ValueError(f"Invalid content type for daft.datasets.common_crawl: {content}")
    file_type = content_type_map[content]

    if in_aws:
        # `stacklevel=2` attributes the warning to the caller's line instead of this module, so users can
        # find the call site that still passes the deprecated argument. The warning category is unchanged.
        if source is not None:
            warnings.warn(
                "`daft.datasets.common_crawl`: Both keyword arguments `in_aws` and `source` were set. Currently, `in_aws` takes precedence, so `daft.datasets.common_crawl` will read from S3. `in_aws` is deprecated and will be removed in v0.9.0, so please set argument `source='s3'` instead.",
                stacklevel=2,
            )
        else:
            warnings.warn(
                "`daft.datasets.common_crawl`: Keyword argument `in_aws` is deprecated and will be removed in v0.9.0. Please set argument `source='s3'` instead.",
                stacklevel=2,
            )
        # The deprecated flag wins over whatever `source` was passed.
        source = "s3"

    warc_paths = _get_common_crawl_paths(
        crawl=crawl,
        segment=segment,
        file_type=file_type,
        num_files=num_files,
        io_config=io_config,
        source=source,
    )

    return read_warc(warc_paths, io_config=io_config)

LeRobot v3#

See the LeRobot v3 dataset guide for episode vs frame workflows and Hub/local paths.

read #

read(dataset_uri: str, io_config: IOConfig | None = None, include_stats: bool = False, load_video_frames: str | list[str] | bool = False) -> DataFrame

Read a LeRobot v3 dataset as a lazy DataFrame with one row per frame.

Reads the per-episode metadata under meta/episodes and the per-frame sensor data under data, joins them on episode_index, and broadcasts each episode's metadata across its frames. Optionally decodes the matching video frame for one or more camera keys into an image column.

Parameters:

Name	Type	Description	Default
`dataset_uri`	`str`	Huggingface repo id (`org/name`), or a local / remote directory (`s3://...`, `hf://datasets/...`).	required
`io_config`	`IOConfig \| None`	Optional IO configuration for remote reads.	`None`
`include_stats`	`bool`	If True, keep the per-episode `stats/*` columns (per-feature min/max/mean/std/quantiles). Defaults to False.	`False`
`load_video_frames`	`str \| list[str] \| bool`	Which camera keys to decode into image columns, aligned to each frame's timestamp. Defaults to False (decode nothing). Pass True to decode every video feature, a single key (`"observation.image"`), or a list of keys. Decoding requires the optional `av` (PyAV) and `Pillow` dependencies.	`False`

Returns:

Type	Description
`DataFrame`	Lazy DataFrame with one row per frame: the frame's sensor columns, the broadcast episode metadata, and one image column per decoded video key.

Source code in daft/datasets/lerobot.py

@PublicAPI
def read(
    dataset_uri: str,
    io_config: IOConfig | None = None,
    include_stats: bool = False,
    load_video_frames: str | list[str] | bool = False,
) -> DataFrame:
    """Read a LeRobot v3 dataset as a lazy, frame-level DataFrame.

    Episode metadata (``meta/episodes``) is joined to the per-frame sensor data
    (``data``) on ``episode_index``, so every frame row carries its episode's
    metadata. For each requested camera key, the video frame matching the row's
    timestamp can additionally be decoded into an image column.

    Args:
        dataset_uri: Huggingface repo id (``org/name``), or a local / remote
            directory (``s3://...``, ``hf://datasets/...``).
        io_config: Optional IO configuration for remote reads.
        include_stats: If True, keep the per-episode ``stats/*`` columns
            (per-feature min/max/mean/std/quantiles). Defaults to False.
        load_video_frames: Camera keys to decode into image columns. ``False``
            (the default) decodes nothing, ``True`` decodes every feature whose
            dtype is ``"video"``, a string selects a single key
            (``"observation.image"``), and a list of strings selects several.
            Decoding requires the optional ``av`` (PyAV) and ``Pillow``
            dependencies.

    Returns:
        Lazy DataFrame with one row per frame: the frame's sensor columns, the
        broadcast episode metadata, and one image column per decoded video key.

    Raises:
        ValueError: If ``load_video_frames`` is not a bool, a string, or a list
            of strings.
    """
    dataset_root = _normalize_dataset_root(dataset_uri)
    dataset_info = _read_info(dataset_root, io_config=io_config)

    # Ask for the per-episode video metadata as well: `videos/{key}/from_timestamp`
    # says where an episode's footage starts inside its shard, which decoding needs
    # to turn episode-local timestamps into shard timestamps. These helper columns
    # are removed again at the end of this function.
    episodes = read_episodes(
        dataset_uri,
        io_config=io_config,
        include_stats=include_stats,
        include_video_metadata=True,
    )
    frames = load_episode_frames(episodes, dataset_uri, io_config=io_config)

    if load_video_frames is not False:
        # Normalise the argument into a list of camera keys.
        if load_video_frames is True:
            camera_keys = [
                feature_name
                for feature_name, feature in dataset_info["features"].items()
                if feature["dtype"] == "video"
            ]
        elif isinstance(load_video_frames, str):
            camera_keys = [load_video_frames]
        elif isinstance(load_video_frames, list) and all(isinstance(key, str) for key in load_video_frames):
            camera_keys = load_video_frames
        else:
            raise ValueError(f"Invalid value provided for argument load_video_frames=`{load_video_frames}`")

        # Several episodes share one MP4 shard, while the parquet `frame_index`
        # restarts at 0 for every episode, so it only lines up with the shard's
        # own frame numbering for the first episode in the shard. Seeking by
        # `from_timestamp` + the frame's episode-local `timestamp` avoids that
        # mismatch by staying in one time base throughout.
        frame_rate = float(dataset_info["fps"])
        # Half a frame period: a frame within this distance is unambiguously the match.
        half_frame_period_s = 1.0 / frame_rate / 2.0

        frames = frames.into_batches(_DECODE_BATCH_SIZE)
        for camera_key in camera_keys:
            decoded_image = _decode_lerobot_video_timestamp(
                col(f"videos/{camera_key}/video"),
                col(f"videos/{camera_key}/from_timestamp"),
                col("timestamp"),
                half_frame_period_s,
                0,  # image_width: 0 means no resize (native resolution)
                0,  # image_height: 0 means no resize
            )
            # The decoded image takes the camera key as its column name; the
            # file-handle column it was decoded from is dropped.
            frames = frames.with_column(camera_key, decoded_image).exclude(f"videos/{camera_key}/video")

    # Remove the video bookkeeping columns requested above (everything under
    # `videos/` except the `.../video` columns), matching read_episodes' default.
    internal_columns = [
        name for name in frames.column_names if name.startswith("videos/") and not name.endswith("/video")
    ]
    return frames.exclude(*internal_columns)

read_episodes #

read_episodes(dataset_uri: str, io_config: IOConfig | None = None, include_meta: bool = False, include_stats: bool = False, include_video_metadata: bool = False) -> DataFrame

Read LeRobot v3 episode metadata as a lazy DataFrame (one row per episode).

This reads the meta/episodes/**/*.parquet path under the dataset root.

Parameters:

Name	Type	Description	Default
`dataset_uri`	`str`	Huggingface repo id (`org/name`), or a local / remote directory (`s3://...`, `hf://datasets/...`)	required
`io_config`	`IOConfig \| None`	Optional IO configuration for remote reads.	`None`
`include_meta`	`bool`	If True, keep the internal `meta/episodes/*` columns (the chunk/file indices locating each episode's own metadata shard). These are bookkeeping for random access into the sharded metadata and carry no analytical value once the rows are loaded. Defaults to False.	`False`
`include_stats`	`bool`	If True, keep the per-episode `stats/*` columns (per-feature min/max/mean/std/quantiles). Defaults to False.	`False`
`include_video_metadata`	`bool`	If True, keep the per-episode `videos/{key}/*` columns (the chunk/file indices and from/to timestamps locating each episode's footage within its video shard). Defaults to False.	`False`

Returns:

Type	Description
`DataFrame`	Lazy DataFrame of episode metadata, one row per episode. Always includes a `videos/{key}/video` file-handle column per video feature; the `include_*` flags control which additional column families are kept.

Source code in daft/datasets/lerobot.py

@PublicAPI
def read_episodes(
    dataset_uri: str,
    io_config: IOConfig | None = None,
    include_meta: bool = False,
    include_stats: bool = False,
    include_video_metadata: bool = False,
) -> DataFrame:
    """Read LeRobot v3 episode metadata as a lazy DataFrame (one row per episode).

    This reads the ``meta/episodes/**/*.parquet`` path under the dataset root.

    Args:
        dataset_uri: Huggingface repo id (``org/name``),
            or a local / remote directory (``s3://...``, ``hf://datasets/...``)
        io_config: Optional IO configuration for remote reads.
        include_meta: If True, keep the internal ``meta/episodes/*`` columns
            (the chunk/file indices locating each episode's own metadata shard).
            These are bookkeeping for random access into the sharded metadata
            and carry no analytical value once the rows are loaded. Defaults to
            False.
        include_stats: If True, keep the per-episode ``stats/*`` columns
            (per-feature min/max/mean/std/quantiles). Defaults to False.
        include_video_metadata: If True, keep the per-episode ``videos/{key}/*``
            columns (the chunk/file indices and from/to timestamps locating each
            episode's footage within its video shard). Defaults to False.

    Returns:
        Lazy DataFrame of episode metadata, one row per episode. Always includes
        a ``videos/{key}/video`` file-handle column per video feature, added in
        the order the features are declared in the dataset info; the
        ``include_*`` flags control which additional column families are kept.
    """
    root = _normalize_dataset_root(dataset_uri)
    info = _read_info(root, io_config=io_config)
    df = daft.read_parquet(f"{root}/meta/episodes/**/*.parquet", io_config=io_config)
    if not include_meta:
        df = df.exclude(*(c for c in df.column_names if c.startswith("meta/")))
    if not include_stats:
        df = df.exclude(*(c for c in df.column_names if c.startswith("stats/")))

    # Collect the video feature names as a list, in declaration order. Iterating a
    # set of strings here would add the `videos/{key}/video` columns in hash order,
    # which can differ between interpreter runs. The names are dict keys, so they
    # are already unique.
    video_keys = [name for name, feat_info in info["features"].items() if feat_info["dtype"] == "video"]

    for key in video_keys:
        # Build `{root}/videos/{key}/chunk-XXX/file-XXX.mp4`, left-padding both
        # indices with zeros to three characters.
        file_name_expr = (
            lit(f"{root}/videos/{key}/chunk-")
            + lpad(col(f"videos/{key}/chunk_index").cast(DataType.string), 3, "0")
            + lit("/file-")
            + lpad(col(f"videos/{key}/file_index").cast(DataType.string), 3, "0")
            + lit(".mp4")
        )

        df = df.with_column(f"videos/{key}/video", video_file(file_name_expr, verify=False, io_config=io_config))

    if not include_video_metadata:
        # Drop everything under `videos/` except the file-handle columns added above.
        df = df.exclude(*(c for c in df.column_names if c.startswith("videos/") and not c.endswith("/video")))

    return df

load_episode_frames #

load_episode_frames(episodes: DataFrame, dataset_uri: str, io_config: IOConfig | None = None) -> DataFrame

Expand an episode-level DataFrame into a frame-level DataFrame.

Reads the per-frame parquet under data/** and joins it to the provided episode metadata on episode_index, producing one row per frame. Episode metadata is broadcast across each episode's frames.

Filter episodes before calling this to expand only the episodes you need; only the surviving episodes contribute to the join.

Parameters:

Name	Type	Description	Default
`episodes`	`DataFrame`	Episode-level DataFrame, typically from `read_episodes` (optionally filtered). Must contain an `episode_index` column.	required
`dataset_uri`	`str`	The same dataset identifier passed to `read_episodes` (Huggingface repo id `org/name`, or a local / remote directory such as `s3://...` or `hf://datasets/...`).	required
`io_config`	`IOConfig \| None`	Optional IO configuration for remote reads.	`None`

Returns:

Type	Description
`DataFrame`	Lazy DataFrame with one row per frame.

Source code in daft/datasets/lerobot.py

@PublicAPI
def load_episode_frames(
    episodes: DataFrame,
    dataset_uri: str,
    io_config: IOConfig | None = None,
) -> DataFrame:
    """Turn an episode-level DataFrame into a frame-level one.

    The per-frame parquet files under ``data/**`` are read and joined to
    ``episodes`` on ``episode_index``. The result has one row per frame, with
    the episode's metadata repeated on each of its frames.

    To expand only a subset of episodes, filter ``episodes`` before calling
    this function; only the episodes that remain take part in the join.

    Args:
        episodes: Episode-level DataFrame, typically the (optionally filtered)
            output of ``read_episodes``. Must contain an ``episode_index``
            column.
        dataset_uri: The dataset identifier that was passed to
            ``read_episodes`` (Huggingface repo id ``org/name``, or a local /
            remote directory such as ``s3://...`` or ``hf://datasets/...``).
        io_config: Optional IO configuration for remote reads.

    Returns:
        Lazy DataFrame with one row per frame.
    """
    dataset_root = _normalize_dataset_root(dataset_uri)
    per_frame = daft.read_parquet(f"{dataset_root}/data/**", io_config=io_config)

    joined = episodes.join(per_frame, on=["episode_index"])
    # The data chunk/file index columns are dropped from the frame-level result.
    return joined.exclude("data/chunk_index", "data/file_index")

read_tasks #

read_tasks(dataset_uri: str, io_config: IOConfig | None = None) -> DataFrame

Load task metadata as a DataFrame.

Prefers meta/tasks.parquet (current LeRobot default). Falls back to legacy meta/tasks.jsonl when the Parquet file is missing.

Source code in daft/datasets/lerobot.py

@PublicAPI
def read_tasks(dataset_uri: str, io_config: IOConfig | None = None) -> DataFrame:
    """Load task metadata as a DataFrame.

    ``meta/tasks.parquet`` (current LeRobot default) is tried first. If reading
    it raises ``OSError``, ``DaftCoreException`` or ``FileNotFoundError`` (for
    example because the Parquet file is missing), the legacy
    ``meta/tasks.jsonl`` is read instead.

    Args:
        dataset_uri: Dataset identifier; it is normalized into the dataset root
            under which the ``meta/`` directory is looked up.
        io_config: Optional IO configuration passed to both readers.

    Returns:
        DataFrame of task metadata, read from the Parquet file or from the
        JSONL fallback.
    """
    dataset_root = _normalize_dataset_root(dataset_uri)

    try:
        tasks = daft.read_parquet(f"{dataset_root}/meta/tasks.parquet", io_config=io_config)
    except (OSError, DaftCoreException, FileNotFoundError):
        # Parquet could not be read; use the legacy JSONL file instead.
        tasks = daft.read_json(f"{dataset_root}/meta/tasks.jsonl", io_config=io_config)
    return tasks

DROID#

Check out our DROID dataset guide for more examples!

raw #

raw(path: str = _PUBLIC_GCS_BUCKET, io_config: IOConfig | None = None) -> DataFrame

Load the raw DROID robotics dataset as a lazy episode-level DataFrame.

This function discovers episodes by globbing metadata_*.json files under the provided dataset root, reads the episode metadata, and attaches lazy file references to the per-episode trajectory HDF5 file and MP4 camera recordings.

Note

The public dataset is missing camera recordings for some episodes. Missing recordings are set to None. Additionally, to read the test or train split only, specify a more restricted glob path such as:

- gs://gresearch/robotics/droid_raw/test/**/metadata_*.json
- gs://gresearch/robotics/droid_raw/train/**/metadata_*.json

The default is gs://gresearch/robotics/droid_raw/**/metadata_*.json.

Parameters:

Name	Type	Description	Default
`path`	`str`	Root path to the raw DROID dataset. Defaults to the official public GCS release at `gs://gresearch/robotics/droid_raw`. Also supports local paths and other remote object stores.	`_PUBLIC_GCS_BUCKET`
`io_config`	`IOConfig \| None`	IO configuration for accessing remote storage.	`None`

Returns:

Type	Description
`DataFrame`	A DataFrame with one row per episode. Metadata fields from each episode's JSON
`DataFrame`	file are unnested into top-level columns, along with:
`DataFrame`	`episode_dir`: path to the episode directory
`DataFrame`	`trajectory`: lazy `daft.Hdf5File` reference to the trajectory HDF5 file
`DataFrame`	`wrist_cam_video`: lazy `daft.VideoFile` reference to the wrist camera MP4 file
`DataFrame`	`ext1_cam_video`: lazy `daft.VideoFile` reference to the external camera 1 MP4 file. Often the left camera feed.
`DataFrame`	`ext2_cam_video`: lazy `daft.VideoFile` reference to the external camera 2 MP4 file. Often the right camera feed.

Examples:

>>> import daft
>>> df = daft.datasets.droid.raw()
>>> df.select("episode_dir", "ext1_cam_video").show()

Source code in daft/datasets/droid.py

@PublicAPI
def raw(
    # By default, use the official public GCS bucket
    path: str = _PUBLIC_GCS_BUCKET,
    io_config: IOConfig | None = None,
    # TODO: Add support for SVO camera recordings (e.g. an `include_stereo: bool = False` flag)
) -> DataFrame:
    r"""Load the raw DROID robotics dataset as a lazy episode-level DataFrame.

    This function discovers episodes by globbing ``metadata_*.json`` files under the
    provided dataset root, reads the episode metadata, and attaches lazy file references
    to the per-episode trajectory HDF5 file and MP4 camera recordings.

    Note:
        The public dataset is missing camera recordings for some episodes. File references
        whose target does not exist are set to `None`. Additionally, to read the test or
        train split only, pass a more restricted root path such as:

        - ``gs://gresearch/robotics/droid_raw/test``
        - ``gs://gresearch/robotics/droid_raw/train``

        The ``**/metadata_*.json`` glob suffix is appended to ``path`` automatically, so the
        default pattern is ``gs://gresearch/robotics/droid_raw/**/metadata_*.json``.

    Args:
        path: Root path to the raw DROID dataset. Defaults to the official public
            GCS release at `gs://gresearch/robotics/droid_raw`. Also supports
            local paths and other remote object stores. Trailing slashes are ignored.
        io_config: IO configuration for accessing remote storage. When omitted and
            ``path`` is the public bucket root or a location beneath it, anonymous
            GCS access is configured automatically.

    Returns:
        A DataFrame with one row per episode. Metadata fields from each episode's JSON
        file are unnested into top-level columns, along with:

        - `episode_dir`: path to the episode directory
        - `trajectory`: lazy `daft.Hdf5File` reference to the trajectory HDF5 file
        - `wrist_cam_video`: lazy `daft.VideoFile` reference to the wrist camera MP4 file
        - `ext1_cam_video`: lazy `daft.VideoFile` reference to the external camera 1 MP4 file.
            Often the left camera feed.
        - `ext2_cam_video`: lazy `daft.VideoFile` reference to the external camera 2 MP4 file.
            Often the right camera feed.

    Examples:
        >>> import daft
        >>> df = daft.datasets.droid.raw()  # doctest: +SKIP
        >>> df.select("episode_dir", "ext1_cam_video").show()  # doctest: +SKIP
    """
    root = path.rstrip("/")
    public_root = _PUBLIC_GCS_BUCKET.rstrip("/")

    # Configure anonymous access for the public GCS bucket. Sub-paths (e.g. the train/test
    # splits) and trailing-slash spellings live in the same public bucket, so they get the
    # same treatment as the bare default. The "/" guard avoids matching sibling prefixes.
    targets_public_bucket = root == public_root or root.startswith(f"{public_root}/")
    if io_config is None and targets_public_bucket:
        io_config = IOConfig(gcs=GCSConfig(anonymous=True))

    episodes = (
        daft.from_glob_path(f"{root}/**/metadata_*.json", io_config=io_config)
        .select(
            col("path")
            .download(io_config=io_config)
            .cast(DataType.string)
            .try_deserialize("json", _METADATA_DTYPE)
            .alias("metadata"),
            # The episode directory is the metadata file's path minus its file name.
            regexp_replace(col("path"), r"/metadata_[^/]+\.json$", "").alias("episode_dir"),
        )
        .select(unnest(col("metadata")), "episode_dir")
    )

    # Create VideoFile and Hdf5File columns for MP4 camera recordings and trajectory HDF5 file.
    episodes = episodes.with_columns(
        {
            "trajectory": hdf5_file(
                format("{}/trajectory.h5", col("episode_dir")),
                io_config=io_config,
            ),
            "wrist_cam_video": video_file(
                format("{}/recordings/MP4/{}.mp4", col("episode_dir"), col("wrist_cam_serial")),
                io_config=io_config,
            ),
            "ext1_cam_video": video_file(
                format("{}/recordings/MP4/{}.mp4", col("episode_dir"), col("ext1_cam_serial")),
                io_config=io_config,
            ),
            "ext2_cam_video": video_file(
                format("{}/recordings/MP4/{}.mp4", col("episode_dir"), col("ext2_cam_serial")),
                io_config=io_config,
            ),
        }
    ).with_columns(
        # Null out references whose target file does not exist.
        {
            "trajectory": when(file_exists(col("trajectory")), col("trajectory")).otherwise(lit(None)),
            "wrist_cam_video": when(file_exists(col("wrist_cam_video")), col("wrist_cam_video")).otherwise(lit(None)),
            "ext1_cam_video": when(file_exists(col("ext1_cam_video")), col("ext1_cam_video")).otherwise(lit(None)),
            "ext2_cam_video": when(file_exists(col("ext2_cam_video")), col("ext2_cam_video")).otherwise(lit(None)),
        }
    )

    # Select fields grouped by metadata and each camera, for easier access.
    return episodes.select(
        # Metadata columns
        "uuid",
        "lab",
        "date",
        "timestamp",
        "scene_id",
        "trajectory_length",
        "current_task",
        "success",
        "episode_dir",
        "user",
        "user_id",
        "building",
        "robot_serial",
        "r2d2_version",
        "trajectory",
        # Wrist camera columns
        "wrist_cam_serial",
        "wrist_cam_extrinsics",
        "wrist_cam_video",
        # Ext1 camera columns
        "ext1_cam_serial",
        "ext1_cam_extrinsics",
        "ext1_cam_video",
        # Ext2 camera columns
        "ext2_cam_serial",
        "ext2_cam_extrinsics",
        "ext2_cam_video",
    )

scenes #

scenes(*, io_config: IOConfig | None = None) -> DataFrame

Load the DROID scene classification table as a lazy DataFrame.

The table maps DROID `scene_id` values to GPT-4V `scene_classification` labels. Join it onto episode-level DataFrames from `raw` or `trajectory` when you need scene labels.

Note

This helper uses a copy of the original file hosted on Hugging Face datasets. Keeping this classification table in sync is best-effort and may not be up-to-date. See https://huggingface.co/datasets/Eventual-Inc/droid-scene-classifications for mirror details and original source attribution.

Parameters:

Name	Type	Description	Default
`io_config`	`IOConfig \| None`	IO configuration for reading the Hugging Face classification table.	`None`

Returns:

Type	Description
`DataFrame`	A DataFrame with `scene_id` and `scene_classification` columns.

Examples:

>>> import daft
>>> from daft.datasets.droid import raw, scenes
>>> kitchen_scenes = scenes().where(daft.col("scene_classification") == "Home kitchen")
>>> kitchen = raw().join(kitchen_scenes, on="scene_id", how="inner").limit(5)
>>> kitchen.select("uuid", "scene_id", "scene_classification").show()

Source code in daft/datasets/droid.py

@PublicAPI
def scenes(
    *,
    io_config: IOConfig | None = None,
) -> DataFrame:
    r"""Return the DROID scene classification lookup table as a lazy DataFrame.

    Each row pairs a DROID ``scene_id`` with its GPT-4V ``scene_classification``
    label. To attach scene labels to episodes, join this table onto an
    episode-level DataFrame produced by `raw` or `trajectory`.

    Note:
        The data is read from a copy of the original file that is hosted on
        Hugging Face datasets. The copy is synchronized on a best-effort basis
        and may lag behind the original. Mirror details and attribution of the
        original source are available at
        https://huggingface.co/datasets/Eventual-Inc/droid-scene-classifications.

    Args:
        io_config: IO configuration used when reading the classification table
            from Hugging Face.

    Returns:
        A DataFrame with ``scene_id`` and ``scene_classification`` columns.

    Examples:
        >>> import daft
        >>> from daft.datasets.droid import raw, scenes
        >>> kitchen_scenes = scenes().where(daft.col("scene_classification") == "Home kitchen")  # doctest: +SKIP
        >>> kitchen = raw().join(kitchen_scenes, on="scene_id", how="inner").limit(5)  # doctest: +SKIP
        >>> kitchen.select("uuid", "scene_id", "scene_classification").show()  # doctest: +SKIP
    """
    classification_table = daft.read_parquet(
        _HF_SCENE_CLASSIFICATIONS_PATH,
        io_config=io_config,
    )
    return classification_table

trajectory #

trajectory(episodes: DataFrame, fields: Sequence[str] = _DEFAULT_TRAJECTORY_FIELDS) -> DataFrame

Read selected trajectory datasets from episode-level HDF5 files.

This helper takes the lazy episode catalog produced by `raw` and adds tensor columns for the requested HDF5 datasets. Each output row still corresponds to one episode; use filters such as `limit` on `episodes` before calling this function to avoid reading more data than needed.

Parameters:

Name	Type	Description	Default
`episodes`	`DataFrame`	Episode-level DataFrame from `raw` containing a `trajectory` `Hdf5File` column.	required
`fields`	`Sequence[str]`	HDF5 dataset paths to read, such as `"action/joint_position"`. Defaults to a curated set of common action and observation fields.	`_DEFAULT_TRAJECTORY_FIELDS`

Returns:

Type	Description
`DataFrame`	A DataFrame with a fixed set of episode columns (identifiers, task metadata, camera videos and extrinsics) plus one tensor column per requested field. Rows with a null `trajectory` file are skipped before reading.

Examples:

>>> import daft
>>> from daft.datasets.droid import raw, trajectory
>>> episodes = raw().where(daft.col("success")).limit(1)
>>> traj = trajectory(
...     episodes,
...     fields=["action/joint_position", "action/gripper_position"],
... )
>>> traj.select("uuid", "action/joint_position", "action/gripper_position").collect()

Source code in daft/datasets/droid.py

@PublicAPI
def trajectory(
    episodes: DataFrame,
    fields: Sequence[str] = _DEFAULT_TRAJECTORY_FIELDS,
) -> DataFrame:
    r"""Read selected trajectory datasets from episode-level HDF5 files.

    This helper takes the lazy episode catalog produced by `raw` and adds one
    column per requested HDF5 dataset. Each output row still corresponds to one
    episode; use filters such as ``limit`` on ``episodes`` before calling this
    function to avoid reading more data than needed.

    Args:
        episodes: Episode-level DataFrame from `raw` containing a ``trajectory``
            ``Hdf5File`` column. It must also provide the episode columns that are
            carried into the result (see Returns).
        fields: HDF5 dataset paths to read, such as ``"action/joint_position"``.
            A single path may be passed as a plain string. Defaults to a curated
            set of common action and observation fields.

    Returns:
        A DataFrame holding a fixed projection of the episode columns (``uuid``,
        ``scene_id``, ``robot_serial``, ``r2d2_version``, ``current_task``,
        ``success``, ``trajectory_length``, and the ``*_cam_video`` /
        ``*_cam_extrinsics`` columns of each camera) plus one tensor column per
        requested field. Rows with a null ``trajectory`` file are skipped before
        reading.

    Raises:
        ImportError: If the optional ``h5py`` dependency is not installed.
        ValueError: If ``episodes`` has no ``trajectory`` column, if ``fields`` is
            empty, or if it names a field that is not a known trajectory field.

    Examples:
        >>> import daft
        >>> from daft.datasets.droid import raw, trajectory
        >>> episodes = raw().where(daft.col("success")).limit(1)  # doctest: +SKIP
        >>> traj = trajectory(  # doctest: +SKIP
        ...     episodes,
        ...     fields=["action/joint_position", "action/gripper_position"],
        ... )
        >>> traj.select("uuid", "action/joint_position", "action/gripper_position").collect()  # doctest: +SKIP
    """
    from daft.dependencies import h5py

    if not h5py.module_available():  # ty:ignore[unresolved-attribute]
        raise ImportError(
            "The 'daft[hdf5]' extra is required to read DROID HDF5 trajectory files. "
            "Please install it with: pip install 'daft[hdf5]'"
        )

    # Validation checks
    # A bare string is itself a sequence of characters; treat it as a single field so that
    # it is not exploded into one "field" per character (mirrors `camera_frames(cameras=...)`).
    fields = (fields,) if isinstance(fields, str) else tuple(fields)
    if "trajectory" not in episodes.schema().column_names():
        raise ValueError("Expected an episode DataFrame with a `trajectory` column.")

    if not fields:
        raise ValueError("fields must contain at least one HDF5 dataset path")

    unknown = [f for f in fields if f not in _TRAJECTORY_DTYPES]
    if unknown:
        raise ValueError(f"Unknown trajectory field(s): {unknown}")

    # Build the UDF that will read the trajectory data and return a struct of the requested fields.
    # `unnest=True` expands that struct so that every requested field becomes its own column.
    @daft.func(
        return_dtype=DataType.struct({field: _TRAJECTORY_DTYPES[field] for field in fields}),
        use_process=False,
        unnest=True,
    )
    def read_droid_trajectory(file: Hdf5File) -> dict[str, object]:
        # h5py is handed a file name, so the file is first materialized as a temporary file.
        with file.to_tempfile() as tmp, h5py.File(tmp.name, "r") as h5:
            # `[()]` reads each dataset in full while the HDF5 file is still open.
            return {field: h5[field][()] for field in fields}

    # Select the columns we need and apply the UDF to the trajectory column
    return episodes.where(col("trajectory").not_null()).select(
        "uuid",
        "scene_id",
        "robot_serial",
        "r2d2_version",
        "current_task",
        "success",
        "trajectory_length",
        read_droid_trajectory(col("trajectory")),
        "wrist_cam_video",
        "wrist_cam_extrinsics",
        "ext1_cam_video",
        "ext1_cam_extrinsics",
        "ext2_cam_video",
        "ext2_cam_extrinsics",
    )

camera_frames #

camera_frames(episodes: DataFrame, cameras: str | Sequence[str] = ('wrist', 'ext1', 'ext2'), *, start_time: float = 0, end_time: float | None = None, width: int | None = None, height: int | None = None, is_key_frame: bool | None = None, sample_interval_seconds: float | None = None) -> DataFrame

Decode DROID camera videos into per-episode frame-list columns.

This helper takes an episode-level DataFrame from `raw` or `trajectory` and appends one frame-list column per requested camera. It keeps one row per episode; each frame-list column contains the structs returned by `daft.functions.video_frames`, including frame metadata and image data.

Parameters:

Name	Type	Description	Default
`episodes`	`DataFrame`	Episode-level DataFrame containing DROID camera `VideoFile` columns.	required
`cameras`	`str \| Sequence[str]`	Camera or cameras to decode. May be a single camera string or a sequence of camera names. Supported values are `"wrist"`, `"ext1"`, and `"ext2"`. Defaults to all three cameras.	`('wrist', 'ext1', 'ext2')`
`start_time`	`float`	Start of the time range in seconds. Defaults to 0.	`0`
`end_time`	`float \| None`	End of the time range in seconds. Defaults to None, meaning all frames.	`None`
`width`	`int \| None`	Target width for resizing frames. Must be provided with `height`.	`None`
`height`	`int \| None`	Target height for resizing frames. Must be provided with `width`.	`None`
`is_key_frame`	`bool \| None`	If True, decode only keyframes. If False, decode only non-keyframes. If None, decode all frames.	`None`
`sample_interval_seconds`	`float \| None`	If provided, sample frames at approximately this time interval in seconds.	`None`

Returns:

Type	Description
`DataFrame`	The input DataFrame with `<camera>_cam_frames` columns appended.

Examples:

>>> import daft
>>> from daft.datasets.droid import camera_frames, raw
>>> episodes = raw().where(daft.col("success")).limit(1)
>>> frames = camera_frames(episodes, width=224, height=224)
>>> frames.select("uuid", "wrist_cam_frames").collect()

Source code in daft/datasets/droid.py

@PublicAPI
def camera_frames(
    episodes: DataFrame,
    cameras: str | Sequence[str] = ("wrist", "ext1", "ext2"),  # _CAMERAS
    *,
    start_time: float = 0,
    end_time: float | None = None,
    width: int | None = None,
    height: int | None = None,
    is_key_frame: bool | None = None,
    sample_interval_seconds: float | None = None,
) -> DataFrame:
    r"""Append decoded video frames for the selected DROID cameras.

    Given an episode-level DataFrame (as produced by `raw` or `trajectory`), this
    adds a ``<camera>_cam_frames`` column for every requested camera. The row
    granularity is unchanged (one row per episode). Every new column is built by
    `daft.functions.video_frames` from the matching ``<camera>_cam_video`` column
    and holds the structs that function returns, including frame metadata and
    image data.

    Args:
        episodes: Episode-level DataFrame containing DROID camera `VideoFile` columns.
        cameras: A single camera name or a sequence of camera names to decode.
            Supported values are ``"wrist"``, ``"ext1"``, and ``"ext2"``; all three
            are decoded by default. Repeated names are decoded only once.
        start_time: Start of the time range in seconds. Defaults to 0.
        end_time: End of the time range in seconds. ``None`` (the default) means
            all frames.
        width: Target width for resizing frames. Must be provided with ``height``.
        height: Target height for resizing frames. Must be provided with ``width``.
        is_key_frame: ``True`` decodes only keyframes, ``False`` only non-keyframes,
            and ``None`` decodes every frame.
        sample_interval_seconds: If provided, sample frames at approximately this
            time interval in seconds.

    Returns:
        The input DataFrame with ``<camera>_cam_frames`` columns appended.

    Raises:
        ImportError: If the optional video dependency (``av``) is not installed.
        ValueError: If ``cameras`` is empty, names an unsupported camera, or if
            ``episodes`` lacks the video column of a requested camera.

    Examples:
        >>> import daft
        >>> from daft.datasets.droid import camera_frames, raw
        >>> episodes = raw().where(daft.col("success")).limit(1)  # doctest: +SKIP
        >>> frames = camera_frames(episodes, width=224, height=224)  # doctest: +SKIP
        >>> frames.select("uuid", "wrist_cam_frames").collect()  # doctest: +SKIP
    """
    from daft.dependencies import av

    if not av.module_available():  # ty:ignore[unresolved-attribute]
        raise ImportError(
            "The 'daft[video]' extra is required to decode DROID camera frames. "
            "Please install it with: pip install 'daft[video]'"
        )

    # Normalize the request into a tuple; a bare string denotes exactly one camera.
    requested: tuple[str, ...]
    if isinstance(cameras, str):
        requested = (cameras,)
    else:
        requested = tuple(cameras)

    if not requested:
        raise ValueError("cameras must contain at least one camera")

    unrecognized = [name for name in requested if name not in _CAMERAS]
    if unrecognized:
        raise ValueError(f"Unknown camera(s): {unrecognized}. Expected one or more of: {', '.join(_CAMERAS)}.")

    # Drop repeated names while keeping the caller's ordering, then map each camera
    # to the video column it is decoded from.
    video_column_by_camera = {name: f"{name}_cam_video" for name in dict.fromkeys(requested)}

    available_columns = {field.name for field in episodes.schema()}
    absent_columns = [
        video_column for video_column in video_column_by_camera.values() if video_column not in available_columns
    ]
    if absent_columns:
        raise ValueError(
            f"Expected an episode DataFrame with DROID camera video columns. Missing columns: {absent_columns}."
        )

    decoded_columns = {}
    for name, video_column in video_column_by_camera.items():
        decoded_columns[f"{name}_cam_frames"] = video_frames(
            col(video_column),
            start_time=start_time,
            end_time=end_time,
            width=width,
            height=height,
            is_key_frame=is_key_frame,
            sample_interval_seconds=sample_interval_seconds,
        )

    return episodes.with_columns(decoded_columns)