download(expr: Expression, max_connections: int = 32, on_error: Literal['raise', 'null'] = 'raise', io_config: IOConfig | None = None) -> Expression
Treats each string as a URL, and downloads the bytes contents as a bytes column.
Parameters:
| Name | Type | Description | Default |
expr | Expression | The expression to download. | required |
max_connections | int | The maximum number of connections to use per thread for downloading URLs. Defaults to 32. | 32 |
on_error | Literal['raise', 'null'] | Behavior when a URL download error is encountered: "raise" to raise the error immediately, or "null" to log the error but fall back to a Null value. Defaults to "raise". | 'raise' |
io_config | IOConfig | None | IOConfig to use when accessing remote storage. Note that the S3Config's max_connections parameter will be overridden with max_connections that is passed in as a kwarg. | None |
Returns:
| Name | Type | Description |
Expression | Expression | a Binary expression which is the bytes contents of the URL, or None if an error occurred during download |
Note
If you are observing excessive S3 issues (such as timeouts, DNS errors or slowdown errors) during URL downloads, you may wish to reduce the value of max_connections (defaults to 32) to reduce the amount of load you are placing on your S3 servers.
Alternatively, if you are running on machines with a lower number of cores but very high network bandwidth, you can increase max_connections to get higher throughput with additional parallelism.
Source code in daft/functions/url.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def download(
    expr: Expression,
    max_connections: int = 32,
    on_error: Literal["raise", "null"] = "raise",
    io_config: IOConfig | None = None,
) -> Expression:
    """Download the contents of each URL in a string expression as bytes.

    Args:
        expr: Expression whose string values are treated as URLs to download.
        max_connections: Maximum number of connections to use per thread while
            downloading URLs. Defaults to 32.
        on_error: What to do when a URL download fails: "raise" raises the error
            immediately, "null" logs the error and falls back to a Null value.
            Defaults to "raise".
        io_config: IOConfig to use when accessing remote storage. Note that the
            S3Config's `max_connections` parameter is overridden by the
            `max_connections` argument passed to this function.

    Returns:
        Expression: a Binary expression holding the bytes contents of the URL, or
        None if an error occurred during download.

    Note:
        If you observe excessive S3 issues (such as timeouts, DNS errors or
        slowdown errors) during URL downloads, consider reducing
        ``max_connections`` (defaults to 32) to lower the load placed on your
        S3 servers.

        Conversely, on machines with a lower number of cores but very high
        network bandwidth, increasing ``max_connections`` can yield higher
        throughput through additional parallelism.
    """
    use_multithreaded_runtime = _should_use_multithreading_tokio_runtime()
    io_config = _override_io_config_max_connections(max_connections, io_config)

    # No Unity endpoint configured: borrow the Unity settings from the session's
    # current catalog, but only when that catalog is a Unity catalog.
    if io_config.unity.endpoint is None:
        try:
            from daft.catalog.__unity import UnityCatalog
        except ImportError:
            # The Unity catalog module could not be imported; keep io_config as-is.
            pass
        else:
            from daft.session import current_catalog

            active_catalog = current_catalog()
            if isinstance(active_catalog, UnityCatalog):
                unity_settings = active_catalog._inner.to_io_config().unity
                io_config = io_config.replace(unity=unity_settings)

    call_options = {
        "multi_thread": use_multithreaded_runtime,
        "on_error": on_error,
        "max_connections": max_connections,
        "io_config": io_config,
    }
    return Expression._call_builtin_scalar_fn("url_download", expr, **call_options)
|