Skip to content

daft.functions.embed_text#

embed_text #

embed_text(text: Expression, *, provider: str | Provider | None = None, model: str | None = None, dimensions: int | None = None, **options: Unpack[EmbedTextOptions]) -> Expression

Returns an expression that embeds text using the specified embedding model and provider.

Parameters:

Name Type Description Default
text String Expression

The input text column expression.

required
provider str | Provider | None

The provider to use for the embedding model. If None, the default provider is used.

None
model str | None

The embedding model to use, given as a model name. If None, the default model is used.

None
dimensions int | None

Number of dimensions the output embeddings should have, if the provider and model support specifying it. If None, the model's default is used.

None
**options Unpack[EmbedTextOptions]

Any additional options to pass for the model.

{}
Note

Make sure the required provider packages are installed (e.g. vllm, transformers, openai).

Returns:

Name Type Description
Expression Embedding Expression

An expression representing the embedded text vectors.

Examples:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
>>> import daft
>>> from daft.functions import embed_text
>>> df = daft.read_huggingface("togethercomputer/RedPajama-Data-1T")
>>> # Embed Text with Defaults
>>> df = df.with_column(
...     "embeddings",
...     embed_text(
...         daft.col("text"),
...         provider="transformers",
...         model="sentence-transformers/all-MiniLM-L6-v2",
...     ),
... )
>>> df.limit(3).show()
╭────────────────────────────────┬────────────────────────────────┬───────────────────┬──────────────────────────╮
│ text                           ┆ meta                           ┆ red_pajama_subset ┆ embeddings               │
│ ---                            ┆ ---                            ┆ ---               ┆ ---                      │
│ String                         ┆ String                         ┆ String            ┆ Embedding[Float32; 384]  │
╞════════════════════════════════╪════════════════════════════════╪═══════════════════╪══════════════════════════╡
│ Григорианският календар (поня… ┆ {'title': 'Григориански кален… ┆ wikipedia         ┆ ▃▆█▆▆▆█▇▆▅▃▆▆▅▅▆▅▅▂▂▇▇▄▁ │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ GNU General Public License (н… ┆ {'title': 'GNU General Public… ┆ wikipedia         ┆ ▆▁▇█▄▅▄▅▄▄▁▆▃▅▂▃▆▃▄▃█▆▇▅ │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ Лицензът за свободна документ… ┆ {'title': 'Лиценз за свободна… ┆ wikipedia         ┆ ▄▆██▇▇▇█▇▆▂▇▄▁▅▃▇▇▃▃▆▆▅▂ │
╰────────────────────────────────┴────────────────────────────────┴───────────────────┴──────────────────────────╯
(Showing first 3 of 3 rows)
Source code in daft/functions/ai/__init__.py
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
def embed_text(
    text: Expression,
    *,
    provider: str | Provider | None = None,
    model: str | None = None,
    dimensions: int | None = None,
    **options: Unpack[EmbedTextOptions],
) -> Expression:
    """Builds an expression that embeds text with the given embedding model and provider.

    Args:
        text (String Expression):
            The text column expression to embed.
        provider (str | Provider | None):
            The provider that supplies the embedding model. If None, the default provider is used.
        model (str | None):
            The embedding model to use, by name. If None, the default model is used.
        dimensions (int | None):
            Number of dimensions the output embeddings should have, if the provider and model
            support specifying it. If None, the model's default is used.
        **options: Any additional options to pass for the model.

    Note:
        Make sure the required provider packages are installed (e.g. vllm, transformers, openai).

    Returns:
        Expression (Embedding Expression): An expression representing the embedded text vectors.

    Examples:
        >>> import daft
        >>> from daft.functions import embed_text
        >>> df = daft.read_huggingface("togethercomputer/RedPajama-Data-1T")
        >>> # Embed Text with Defaults
        >>> df = df.with_column(
        ...     "embeddings",
        ...     embed_text(
        ...         daft.col("text"),
        ...         provider="transformers",
        ...         model="sentence-transformers/all-MiniLM-L6-v2",
        ...     ),
        ... )
        >>> df.limit(3).show()
        ╭────────────────────────────────┬────────────────────────────────┬───────────────────┬──────────────────────────╮
        │ text                           ┆ meta                           ┆ red_pajama_subset ┆ embeddings               │
        │ ---                            ┆ ---                            ┆ ---               ┆ ---                      │
        │ String                         ┆ String                         ┆ String            ┆ Embedding[Float32; 384]  │
        ╞════════════════════════════════╪════════════════════════════════╪═══════════════════╪══════════════════════════╡
        │ Григорианският календар (поня… ┆ {'title': 'Григориански кален… ┆ wikipedia         ┆ ▃▆█▆▆▆█▇▆▅▃▆▆▅▅▆▅▅▂▂▇▇▄▁ │
        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ GNU General Public License (н… ┆ {'title': 'GNU General Public… ┆ wikipedia         ┆ ▆▁▇█▄▅▄▅▄▄▁▆▃▅▂▃▆▃▄▃█▆▇▅ │
        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ Лицензът за свободна документ… ┆ {'title': 'Лиценз за свободна… ┆ wikipedia         ┆ ▄▆██▇▇▇█▇▆▂▇▄▁▅▃▇▇▃▃▆▆▅▂ │
        ╰────────────────────────────────┴────────────────────────────────┴───────────────────┴──────────────────────────╯
        <BLANKLINE>
        (Showing first 3 of 3 rows)
    """
    from daft.ai._expressions import _TextEmbedderExpression

    # Resolve the provider first, then ask it for a text embedder configured with the
    # requested model, dimensions and extra options.
    resolved_provider = _resolve_provider(provider, "transformers")
    embedder = resolved_provider.get_text_embedder(model, dimensions, **options)
    opts = embedder.get_udf_options()

    # The embedder reports whether it is asynchronous; pick the matching call implementation.
    if embedder.is_async():
        impl = _TextEmbedderExpression._call_async
    else:
        impl = _TextEmbedderExpression._call_sync

    # The return dtype comes from the resolved embedder, so __call__ is installed here
    # (wrapped by method.batch) rather than declared statically on the class.
    # NOTE(review): this rebinds __call__ on the shared _TextEmbedderExpression class on
    # every embed_text() call -- confirm daft_cls captures the method when it wraps the
    # class, otherwise a later call with a different embedder could affect earlier ones.
    _TextEmbedderExpression.__call__ = method.batch(  # type: ignore[method-assign]
        method=impl,
        return_dtype=embedder.get_dimensions().as_dtype(),
        batch_size=opts.batch_size,
    )

    # Wrap the class with the execution settings taken from the embedder's UDF options.
    embedder_cls = daft_cls(
        _TextEmbedderExpression,
        max_concurrency=opts.concurrency,
        gpus=opts.num_gpus or 0,
        max_retries=opts.max_retries,
        on_error=opts.on_error,
        name_override="embed_text",
    )

    # Instantiate the wrapped class with the embedder, then apply it to the text expression.
    return embedder_cls(embedder)(text)