Skip to content

File Types

The File DataType provides first-class support for handling file data across local and remote storage, enabling seamless file operations in distributed environments.

File #

File(url: str, io_config: IOConfig | None = None, media_type: MediaType = unknown(), offset: int | None = None, length: int | None = None)

A file-like object for working with file contents in Daft.

This is an abstract base class that provides a standard file interface compatible with Python's file protocol.

The File object can be used with most Python libraries that accept file-like objects, and implements the standard read/seek/tell interface. Files are read-only in the current implementation.

Examples:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
>>> import daft
>>> from daft.functions import file
>>> df = daft.from_pydict({"paths": ["data.json"]})
>>> df = df.select(file(df["paths"]))
>>>
>>> @daft.func
... def read_json(file: daft.File) -> str:
...     import json
...     with file.open() as f:
...         data = json.load(f)
...         return data["text"]

Methods:

Name Description
as_audio

Convert to AudioFile if this file contains audio data.

as_image

Convert to ImageFile if this file contains image data.

as_video

Convert to VideoFile if this file contains video data.

is_audio
is_image
is_video
isatty
mime_type

Attempts to determine the MIME type of the file.

open
readable
seekable
size
to_tempfile

Create a temporary file with the contents of this file.

writable

Attributes:

Name Type Description
length int | None

The byte length for range reads, or None for full-file reads.

name str

The filename (basename) extracted from the file path or URL.

offset int | None

The byte offset for range reads, or None for full-file reads.

path str

The full path or URL of the file.

Source code in daft/file/file.py
55
56
57
58
59
60
61
62
63
def __init__(
    self,
    url: str,
    io_config: IOConfig | None = None,
    media_type: MediaType = MediaType.unknown(),
    offset: int | None = None,
    length: int | None = None,
) -> None:
    """Create a file reference for ``url``.

    Args:
        url: The full path or URL of the file.
        io_config: Optional IO configuration forwarded to the underlying reference.
        media_type: Media type tag for the file. Defaults to ``MediaType.unknown()``.
        offset: The byte offset for range reads, or None for full-file reads.
        length: The byte length for range reads, or None for full-file reads.
    """
    # The underlying reference is built from a positional tuple, so the
    # order of these fields must not change.
    reference_fields = (media_type._media_type, url, io_config, offset, length)
    self._inner = PyFileReference._from_tuple(reference_fields)  # type: ignore

length #

length: int | None

The byte length for range reads, or None for full-file reads.

name #

name: str

The filename (basename) extracted from the file path or URL.

Returns:

Name Type Description
str str

The filename without directory components.

Example

>>> import daft
>>> f = daft.File("s3://bucket/path/to/data.csv")
>>> f.name
'data.csv'

offset #

offset: int | None

The byte offset for range reads, or None for full-file reads.

path #

path: str

The full path or URL of the file.

Returns:

Name Type Description
str str

The file path or URL.

Example

>>> import daft
>>> f = daft.File("s3://bucket/path/to/data.csv")
>>> f.path
's3://bucket/path/to/data.csv'

as_audio #

as_audio() -> AudioFile

Convert to AudioFile if this file contains audio data.

Source code in daft/file/file.py
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
def as_audio(self) -> AudioFile:
    """Convert to AudioFile if this file contains audio data.

    Raises:
        ImportError: If the 'soundfile' module is not available.
        ValueError: If ``is_audio()`` reports that this is not an audio file.
    """
    if not sf.module_available():
        raise ImportError(
            "The 'soundfile' module is required to convert files to audio. "
            "Please install it with: pip install 'daft[audio]'"
        )
    # Deliberately imported here, after the `sf` check: reaching AudioFile
    # means the user has `sf` installed.
    from daft.file.audio import AudioFile

    if self.is_audio():
        # Skip AudioFile.__init__ and share this file's underlying reference.
        audio_file = AudioFile.__new__(AudioFile)
        audio_file._inner = self._inner
        return audio_file

    raise ValueError(f"File {self} is not an audio file")

as_image #

as_image() -> ImageFile

Convert to ImageFile if this file contains image data.

Source code in daft/file/file.py
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
def as_image(self) -> ImageFile:
    """Convert to ImageFile if this file contains image data.

    Raises:
        ImportError: If the 'pillow' module is not available.
        ValueError: If ``is_image()`` reports that this is not an image file.
    """
    if not pil_image.module_available():
        raise ImportError(
            "The 'pillow' module is required to convert files to images. "
            "Please install it with: pip install 'daft[image]'"
        )
    # Imported only after the pillow availability check has passed.
    from daft.file.image import ImageFile

    if self.is_image():
        # Skip ImageFile.__init__ and share this file's underlying reference.
        image_file = ImageFile.__new__(ImageFile)
        image_file._inner = self._inner
        return image_file

    raise ValueError(f"File {self} is not an image file")

as_video #

as_video() -> VideoFile

Convert to VideoFile if this file contains video data.

Source code in daft/file/file.py
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
def as_video(self) -> VideoFile:
    """Convert to VideoFile if this file contains video data.

    Raises:
        ImportError: If the 'av' module is not available.
        ValueError: If ``is_video()`` reports that this is not a video file.
    """
    if not av.module_available():
        # Include an install hint, matching the audio and image conversions.
        raise ImportError(
            "The 'av' module is required to convert files to video. "
            "Please install it with: pip install 'daft[video]'"
        )
    # this is purposely inside the function, and after the `av` check
    # because using VideoFile means that the user has `av` installed
    from daft.file.video import VideoFile

    if not self.is_video():
        raise ValueError(f"File {self} is not a video file")

    # Skip VideoFile.__init__ and share this file's underlying reference.
    video_file = VideoFile.__new__(VideoFile)
    video_file._inner = self._inner

    return video_file

is_audio #

is_audio() -> bool
Source code in daft/file/file.py
169
170
171
172
173
def is_audio(self) -> bool:
    """Return True when the detected MIME type starts with ``audio/``."""
    return self.mime_type().startswith("audio/")

is_image #

is_image() -> bool
Source code in daft/file/file.py
175
176
177
178
179
def is_image(self) -> bool:
    """Return True when the detected MIME type starts with ``image/``."""
    return self.mime_type().startswith("image/")

is_video #

is_video() -> bool
Source code in daft/file/file.py
163
164
165
166
167
def is_video(self) -> bool:
    """Return True when the detected MIME type starts with ``video/``."""
    return self.mime_type().startswith("video/")

isatty #

isatty() -> bool
Source code in daft/file/file.py
80
81
def isatty(self) -> bool:
    """Report whether this file is an interactive terminal; always False."""
    return False

mime_type #

mime_type() -> str

Attempts to determine the MIME type of the file.

If the MIME type is undetectable, returns 'application/octet-stream'.

Source code in daft/file/file.py
126
127
128
129
130
131
132
133
def mime_type(self) -> str:
    """Attempts to determine the MIME type of the file.

    The file is opened with the ``BUFFER_SNIFF`` buffer size and the handle
    is asked to guess the type. If the MIME type is undetectable, returns
    'application/octet-stream'.
    """
    with self.open(buffer_size=BUFFER_SNIFF) as handle:
        # A falsy guess falls back to the generic binary type.
        return handle.guess_mime_type() or "application/octet-stream"

open #

open(buffer_size: int | None = None) -> PyDaftFile
Source code in daft/file/file.py
65
66
def open(self, buffer_size: int | None = None) -> PyDaftFile:
    """Open the file and return a handle built from the underlying reference.

    Args:
        buffer_size: Optional buffer size forwarded unchanged to the handle.

    Returns:
        PyDaftFile: A handle created from this file's reference.
    """
    handle = PyDaftFile._from_file_reference(self._inner, buffer_size=buffer_size)
    return handle

readable #

readable() -> bool
Source code in daft/file/file.py
71
72
def readable(self) -> bool:
    """Report whether the file can be read; always True."""
    return True

seekable #

seekable() -> bool
Source code in daft/file/file.py
77
78
def seekable(self) -> bool:
    """Report whether the file supports seeking; always True."""
    return True

size #

size() -> int
Source code in daft/file/file.py
123
124
def size(self) -> int:
    """Return the size reported by the underlying file handle.

    A handle is opened with the ``BUFFER_SNIFF`` buffer size only to query
    its size. It is closed via the context manager before returning, the
    same way ``mime_type`` treats its handle, instead of being left to
    garbage collection.

    Returns:
        int: The value of the handle's ``size()`` (presumably bytes -- confirm).
    """
    with PyDaftFile._from_file_reference(self._inner, buffer_size=BUFFER_SNIFF) as handle:
        return handle.size()

to_tempfile #

to_tempfile() -> _TemporaryFileWrapper[bytes]

Create a temporary file with the contents of this file.

Returns:

Type Description
_TemporaryFileWrapper[bytes]

_TemporaryFileWrapper[bytes]: The temporary file object.

The temporary file will be automatically deleted when the returned context manager is closed.

It's important to note that to_tempfile closes the original file object, so it CANNOT be used after calling this method.

Source code in daft/file/file.py
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
def to_tempfile(self) -> _TemporaryFileWrapper[bytes]:
    """Create a temporary file with the contents of this file.

    Returns:
        _TemporaryFileWrapper[bytes]: The temporary file object, positioned at
        the start of the copied contents.

    Raises:
        Exception: Any error raised while reading or copying is re-raised
            after the partially written temporary file has been closed.

    The temporary file will be automatically deleted when the returned context manager is closed.

    It's important to note that `to_tempfile` closes the file handle it opens, so that handle
    CANNOT be used after calling this method.
    """
    with self.open() as f:
        temp_file = tempfile.NamedTemporaryFile(
            prefix="daft_",
        )
        try:
            f.seek(0)

            size = f.size()
            # if its either a really small file, or doesn't support range requests. Just read it normally
            if not f._supports_range_requests() or size < 1024:
                temp_file.write(f.read())
            else:
                # NOTE(review): `length` is copyfileobj's buffer size, so the
                # whole file is requested in one read -- confirm this is intended.
                shutil.copyfileobj(f, temp_file, length=size)
            # close it as `to_tempfile` is a consuming method
            f.close()
            temp_file.seek(0)
        except BaseException:
            # Do not leak the temporary file when the copy fails part-way.
            temp_file.close()
            raise

        return temp_file

writable #

writable() -> bool
Source code in daft/file/file.py
74
75
def writable(self) -> bool:
    """Report whether the file can be written; always False (files are read-only)."""
    return False

AudioFile #

AudioFile(url: str, io_config: IOConfig | None = None)

An audio-specific file interface that provides audio operations.

Methods:

Name Description
metadata

Extract basic audio metadata from container headers.

resample

Resample the audio file to the given sample rate.

to_numpy

Convert the audio file to a numpy array.

Source code in daft/file/audio.py
25
26
27
28
29
30
31
def __init__(self, url: str, io_config: IOConfig | None = None) -> None:
    """Create an audio file reference for ``url``.

    Args:
        url: The full path or URL of the audio file.
        io_config: Optional IO configuration forwarded to the base class.

    Raises:
        ImportError: If the 'soundfile' module is not available.
    """
    soundfile_ready = sf.module_available()
    if not soundfile_ready:
        raise ImportError(
            "The 'soundfile' module is required to create audio files. "
            "Please add 'daft[audio]' to your dependencies or install it with: pip install 'daft[audio]'"
        )
    # Tag the reference with the audio media type.
    super().__init__(url, io_config, MediaType.audio())

metadata #

metadata() -> AudioMetadata

Extract basic audio metadata from container headers.

Returns:

Name Type Description
AudioMetadata AudioMetadata

Audio metadata object containing:

- sample_rate: int - The sample rate of the audio file
- channels: int - The number of channels in the audio file
- frames: int - The number of frames in the audio file
- format: str - The format of the audio file
- subtype: str | None - The subtype of the audio file

Source code in daft/file/audio.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
def metadata(self) -> AudioMetadata:
    """Extract basic audio metadata from container headers.

    The file is opened with the ``BUFFER_METADATA`` buffer size and handed to
    ``sf.SoundFile``, whose attributes are copied into the result.

    Returns:
        AudioMetadata: Audio metadata object containing:
            - sample_rate: int - The sample rate of the audio file
            - channels: int - The number of channels in the audio file
            - frames: int - The number of frames in the audio file
            - format: str - The format of the audio file
            - subtype: str | None - The subtype of the audio file
    """
    with self.open(buffer_size=BUFFER_METADATA) as handle, sf.SoundFile(handle) as sound:
        return AudioMetadata(
            sample_rate=sound.samplerate,
            channels=sound.channels,
            frames=sound.frames,
            format=sound.format,
            subtype=sound.subtype,
        )

resample #

resample(sample_rate: int) -> ndarray[Any, dtype[float64]]

Resample the audio file to the given sample rate.

Parameters:

Name Type Description Default
sample_rate int

The new sample rate.

required

Returns:

Name Type Description
ndarray ndarray[Any, dtype[float64]]

The resampled audio data as a numpy array.

Source code in daft/file/audio.py
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
def resample(self, sample_rate: int) -> np.ndarray[Any, np.dtype[np.float64]]:
    """Resample the audio file to the given sample rate.

    Args:
        sample_rate (int): The new sample rate.

    Returns:
        np.ndarray: The audio samples at ``sample_rate``. When the file is
        already at that rate, the decoded samples are returned unchanged.

    Raises:
        ImportError: If the 'librosa' or 'soundfile' module is not available.
    """
    if not librosa.module_available():
        raise ImportError(
            "The 'librosa' module is required to resample audio files. "
            "Please install it with: pip install 'daft[audio]'"
        )
    if not sf.module_available():
        raise ImportError(
            "The 'soundfile' module is required to resample audio files. "
            "Please install it with: pip install 'daft[audio]'"
        )
    with self.to_tempfile() as f:
        data, samplerate = sf.read(f)
        if samplerate == sample_rate:
            return data
        # soundfile returns multi-channel audio as (frames, channels), while
        # librosa resamples along the last axis by default. Resample along
        # axis 0 so the time axis is used; mono (1-D) input is unaffected.
        return librosa.resample(data, orig_sr=samplerate, target_sr=sample_rate, axis=0)

to_numpy #

to_numpy() -> ndarray[Any, dtype[float64]]

Convert the audio file to a numpy array.

Returns:

Type Description
ndarray[Any, dtype[float64]]

np.ndarray[Any, Any]: The audio data as a numpy array.

Source code in daft/file/audio.py
58
59
60
61
62
63
64
65
66
67
def to_numpy(self) -> np.ndarray[Any, np.dtype[np.float64]]:
    """Convert the audio file to a numpy array.

    The contents are copied to a temporary file and decoded with ``sf.read``;
    the sample rate returned alongside the samples is discarded.

    Returns:
        np.ndarray[Any, Any]: The audio data as a numpy array.

    """
    with self.to_tempfile() as staged:
        samples, _sample_rate = sf.read(staged)
        return samples

ImageFile #

ImageFile(url: str, io_config: IOConfig | None = None)

An image-specific file interface that provides image operations.

Methods:

Name Description
decode

Decode the image file into a PIL Image.

metadata

Extract basic image metadata from file headers.

Source code in daft/file/image.py
27
28
29
30
31
32
33
def __init__(self, url: str, io_config: IOConfig | None = None) -> None:
    """Create an image file reference for ``url``.

    Args:
        url: The full path or URL of the image file.
        io_config: Optional IO configuration forwarded to the base class.

    Raises:
        ImportError: If the 'pillow' module is not available.
    """
    pillow_ready = pil_image.module_available()
    if not pillow_ready:
        raise ImportError(
            "The 'pillow' module is required to create image files. "
            "Please install it with: pip install 'daft[image]'"
        )
    # Tag the reference with the image media type.
    super().__init__(url, io_config, MediaType.image())

decode #

decode(mode: str | None = None) -> Image

Decode the image file into a PIL Image.

Parameters:

Name Type Description Default
mode str | None

Optional image mode to convert to (e.g. "RGB", "RGBA", "L").

None

Returns:

Type Description
Image

PIL.Image.Image: The decoded image.

Source code in daft/file/image.py
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
def decode(self, mode: str | None = None) -> PIL.Image.Image:
    """Decode the image file into a PIL Image.

    Args:
        mode: Optional image mode to convert to (e.g. "RGB", "RGBA", "L").
            No conversion happens when it is None or already matches.

    Returns:
        PIL.Image.Image: The decoded image.
    """
    with self.open() as handle:
        image = pil_image.open(handle)
        # Load the image data while the handle is still open.
        image.load()
        needs_conversion = mode is not None and image.mode != mode
        return image.convert(mode) if needs_conversion else image

metadata #

metadata() -> ImageMetadata

Extract basic image metadata from file headers.

PIL's Image.open() is lazy -- it reads only the file header to determine dimensions, format, and mode without decoding pixel data.

Returns:

Name Type Description
ImageMetadata ImageMetadata

Image metadata containing width, height, format, mode.

Source code in daft/file/image.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
def metadata(self) -> ImageMetadata:
    """Extract basic image metadata from file headers.

    PIL's Image.open() is lazy -- it reads only the file header to
    determine dimensions, format, and mode without decoding pixel data.
    The file is opened with the ``BUFFER_METADATA`` buffer size.

    Returns:
        ImageMetadata: Image metadata containing width, height, format, mode.
    """
    with self.open(buffer_size=BUFFER_METADATA) as handle:
        header = pil_image.open(handle)
        width, height = header.width, header.height
        return ImageMetadata(width=width, height=height, format=header.format, mode=header.mode)

VideoFile #

VideoFile(url: str, io_config: IOConfig | None = None)

A video-specific file interface that provides video operations.

Methods:

Name Description
frames

Lazy iterator of all decoded frames with metadata within time range.

keyframes

Lazy iterator of keyframes as PIL Images within time range.

metadata

Extract basic video metadata from container headers.

Source code in daft/file/video.py
29
30
31
32
def __init__(self, url: str, io_config: IOConfig | None = None) -> None:
    """Create a video file reference for ``url``.

    Args:
        url: The full path or URL of the video file.
        io_config: Optional IO configuration forwarded to the base class.

    Raises:
        ImportError: If the 'av' module is not available.
    """
    if not av.module_available():
        # Include an install hint, matching the audio and image constructors.
        raise ImportError(
            "The 'av' module is required to create video files. "
            "Please install it with: pip install 'daft[video]'"
        )
    # Tag the reference with the video media type.
    super().__init__(url, io_config, MediaType.video())

frames #

frames(start_time: float = 0, end_time: float | None = None, width: int | None = None, height: int | None = None, is_key_frame: bool | None = None) -> Iterator[VideoFrameData]

Lazy iterator of all decoded frames with metadata within time range.

Mirrors the per-frame schema of daft.read_video_frames().

Parameters:

Name Type Description Default
start_time float

Start of the time range in seconds. Defaults to 0.

0
end_time float | None

End of the time range in seconds. Defaults to None (end of video).

None
width int | None

Optional target width for resizing frames. Must be provided with height.

None
height int | None

Optional target height for resizing frames. Must be provided with width.

None
is_key_frame bool | None

If True, emit only keyframes. If False, emit only non-keyframes. If None, emit all decoded frames.

None

Yields:

Type Description
VideoFrameData

VideoFrameData dicts with keys: frame_index, frame_time, frame_time_base, frame_pts, frame_dts, frame_duration, is_key_frame, data (PIL Image).

Source code in daft/file/video.py
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
def frames(
    self,
    start_time: float = 0,
    end_time: float | None = None,
    width: int | None = None,
    height: int | None = None,
    is_key_frame: bool | None = None,
) -> Iterator[VideoFrameData]:
    """Lazy iterator of all decoded frames with metadata within time range.

    Mirrors the per-frame schema of ``daft.read_video_frames()``.

    This is a generator function, so the validation below (and the errors it
    raises) only runs once iteration starts, not when ``frames()`` is called.

    Args:
        start_time: Start of the time range in seconds. Defaults to 0.
        end_time: End of the time range in seconds. Defaults to None (end of video).
        width: Optional target width for resizing frames. Must be provided with ``height``.
        height: Optional target height for resizing frames. Must be provided with ``width``.
        is_key_frame: If True, emit only keyframes. If False, emit only non-keyframes.
            If None, emit all decoded frames.

    Yields:
        VideoFrameData dicts with keys: frame_index, frame_time, frame_time_base,
        frame_pts, frame_dts, frame_duration, is_key_frame, data (PIL Image).

    Raises:
        ImportError: If the 'pillow' module is not available.
        ValueError: If only one of ``width``/``height`` is given, or if the
            container has no video stream.
    """
    if not pil_image.module_available():
        raise ImportError(
            "The 'pillow' module is required for frame decoding. Install it with `pip install daft[video]`."
        )
    # True when exactly one of width/height was supplied.
    if (width is None) != (height is None):
        raise ValueError("Both width and height must be specified together for resizing.")
    with self.open() as f:
        with av.open(f) as container:
            # Use the first video stream in the container, if any.
            video = next(
                (stream for stream in container.streams if stream.type == "video"),
                None,
            )
            if video is None:
                raise ValueError("No video stream found")

            # Only when keyframes are explicitly requested (True, not None/False).
            # NOTE(review): "NONKEY" presumably makes the decoder discard
            # non-keyframes -- confirm against the PyAV documentation.
            if is_key_frame:
                video.codec_context.skip_frame = "NONKEY"

            # Seek to start time
            if start_time > 0 and video.time_base:
                # Convert seconds into units of the stream's time_base.
                seek_timestamp = int(start_time / float(video.time_base))
                container.seek(seek_timestamp, stream=video)

            time_base = float(video.time_base) if video.time_base else None
            # Prefer the stream's average rate; fall back to the guessed rate.
            fps = float(video.average_rate) if video.average_rate else None
            if fps is None and video.guessed_rate:
                fps = float(video.guessed_rate)
            start_pts = video.start_time or 0
            # Running count of decoded frames (skipped ones included); used as
            # the emitted index only when it cannot be derived from the pts.
            frame_index: int = 0
            for frame in container.decode(video):
                # Skip frames before start_time (seek may land earlier)
                if frame.time is not None and frame.time < start_time:
                    frame_index += 1
                    continue

                # Stop at end_time
                if end_time is not None:
                    if frame.time is not None and frame.time > end_time:
                        break

                # Non-keyframes were requested: drop keyframes but still count them.
                if is_key_frame is False and frame.key_frame:
                    frame_index += 1
                    continue

                # Resize if requested
                output_frame = frame
                if width is not None and height is not None:
                    output_frame = frame.reformat(width=width, height=height)

                # Derive the index from the presentation timestamp when pts,
                # time_base and fps are all known; otherwise keep the counter.
                current_frame_index = frame_index
                if frame.pts is not None and time_base is not None and fps is not None:
                    current_frame_index = int(round((frame.pts - start_pts) * time_base * fps))

                # Timing fields come from the decoded frame; only the image
                # data uses the (possibly resized) output frame.
                yield VideoFrameData(
                    frame_index=current_frame_index,
                    frame_time=frame.time,
                    frame_time_base=str(frame.time_base) if frame.time_base else None,
                    frame_pts=frame.pts,
                    frame_dts=frame.dts,
                    frame_duration=frame.duration,
                    is_key_frame=frame.key_frame,
                    data=output_frame.to_image(),
                )

                frame_index += 1

keyframes #

keyframes(start_time: float = 0, end_time: float | None = None) -> Iterator[Image]

Lazy iterator of keyframes as PIL Images within time range.

Source code in daft/file/video.py
 99
100
101
102
def keyframes(self, start_time: float = 0, end_time: float | None = None) -> Iterator[PIL.Image.Image]:
    """Lazy iterator of keyframes as PIL Images within time range.

    Delegates to ``frames`` with ``is_key_frame=True`` and yields only the
    ``"data"`` entry of each frame record.
    """
    key_only = self.frames(start_time=start_time, end_time=end_time, is_key_frame=True)
    yield from (record["data"] for record in key_only)

metadata #

metadata() -> VideoMetadata

Extract basic video metadata from container headers.

Returns:

Name Type Description
VideoMetadata VideoMetadata

Video metadata object containing width, height, fps, duration, frame_count, time_base. All fields are None when the container has no video stream.

Source code in daft/file/video.py
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
def metadata(self) -> VideoMetadata:
    """Extract basic video metadata from container headers.

    The file is opened with the ``BUFFER_METADATA`` buffer size and inspected
    through ``av.open``; only the first video stream is considered.

    Returns:
        VideoMetadata: Video metadata object containing width, height, fps,
        duration, frame_count, time_base. Every field is None when the
        container has no video stream.

    """
    with self.open(buffer_size=BUFFER_METADATA) as f:
        with av.open(f, mode="r", metadata_encoding="utf-8") as container:
            video = next(
                (stream for stream in container.streams if stream.type == "video"),
                None,
            )
            if video is None:
                return VideoMetadata(
                    width=None,
                    height=None,
                    fps=None,
                    duration=None,
                    frame_count=None,
                    time_base=None,
                )

            # Basic stream properties ----------
            width = video.width
            height = video.height
            time_base = float(video.time_base) if video.time_base else None

            # Frame rate: prefer the average rate, fall back to the guessed rate.
            rate = video.average_rate or video.guessed_rate
            fps = float(rate) if rate else None

            # Duration: container value is divided by 1_000_000 to get seconds;
            # otherwise use the stream duration scaled by its time base.
            duration = None
            if container.duration and container.duration > 0:
                duration = container.duration / 1_000_000.0
            elif video.duration:
                # Fallback time_base only for duration computation if missing
                tb_for_dur = time_base if time_base is not None else (1.0 / 1_000_000.0)
                duration = float(video.duration * tb_for_dur)

            # Frame count: trust the stream's value when positive, otherwise
            # estimate it from duration and fps when both are known.
            frame_count = video.frames
            if not frame_count or frame_count <= 0:
                frame_count = int(round(duration * fps)) if duration and fps else None

            return VideoMetadata(
                width=width,
                height=height,
                fps=fps,
                duration=duration,
                frame_count=frame_count,
                time_base=time_base,
            )