daft.functions.classify_text#

classify_text #

classify_text(text: Expression, labels: Label | list[Label], *, provider: str | Provider | None = None, model: str | None = None, **options: Unpack[ClassifyTextOptions]) -> Expression

Returns an expression that classifies text using the specified model and provider.

Parameters:

Name	Type	Description	Default
`text`	`String Expression`	The input text column expression.	required
`labels`	`str \| list[str]`	Label(s) for classification.	required
`provider`	`str \| Provider \| None`	The provider to use for the classification model. By default, the 'transformers' provider is used.	`None`
`model`	`str \| None`	The name of the classifier model to use. By default, a `zero-shot-classification` model is used.	`None`
`**options`	`Unpack[ClassifyTextOptions]`	Any additional options to pass for the model.	`{}`

Note

Make sure the required provider packages are installed (e.g. vllm, transformers, openai).

Returns:

Name	Type	Description
`Expression`	`String Expression`	An expression representing the most-probable label string.

Examples:

>>> import daft
>>> from daft.functions import classify_text
>>> df = daft.from_pydict({"text": ["Daft is wicked fast!"]})
>>> df = df.with_column(
...     "label",
...     classify_text(
...         daft.col("text"),
...         labels=["Positive", "Negative"],
...         provider="transformers",
...         model="tabularisai/multilingual-sentiment-analysis",
...     ),
... )
>>> df.show()

╭─────────────────────┬───────────╮
│ text                ┆ label     │
│ ---                 ┆ ---       │
│ String              ┆ String    │
╞═════════════════════╪═══════════╡
│ Daft is wicked fast!┆ Positive  │
╰─────────────────────┴───────────╯
(Showing first 1 of 1 rows)

Source code in daft/functions/ai/__init__.py

def classify_text(
    text: Expression,
    labels: Label | list[Label],
    *,
    provider: str | Provider | None = None,
    model: str | None = None,
    **options: Unpack[ClassifyTextOptions],
) -> Expression:
    """Builds an expression that classifies text with the given model and provider.

    Args:
        text (String Expression):
            Column expression holding the text to classify.
        labels (str | list[str]):
            A single candidate label, or a list of candidate labels.
        provider (str | Provider | None):
            The provider supplying the classification model.
            Falls back to the 'transformers' provider when omitted.
        model (str | None):
            Name of the classifier model to use.
            By default, a `zero-shot-classification` model is used.
        **options:
            Additional options forwarded to the provider when it creates the text classifier.

    Note:
        Make sure the required provider packages are installed (e.g. vllm, transformers, openai).

    Returns:
        Expression (String Expression): An expression representing the most-probable label string.

    Examples:
        >>> import daft
        >>> from daft.functions import classify_text
        >>> df = daft.from_pydict({"text": ["Daft is wicked fast!"]})
        >>> df = df.with_column(
        ...     "label",
        ...     classify_text(
        ...         daft.col("text"),
        ...         labels=["Positive", "Negative"],
        ...         provider="transformers",
        ...         model="tabularisai/multilingual-sentiment-analysis",
        ...     ),
        ... )
        >>> df.show()
        ╭─────────────────────┬───────────╮
        │ text                ┆ label     │
        │ ---                 ┆ ---       │
        │ String              ┆ String    │
        ╞═════════════════════╪═══════════╡
        │ Daft is wicked fast!┆ Positive  │
        ╰─────────────────────┴───────────╯
        <BLANKLINE>
        (Showing first 1 of 1 rows)
    """
    from daft.ai._expressions import _TextClassificationExpression

    # Resolve the provider ("transformers" is the fallback) and let it build the classifier.
    resolved_provider = _resolve_provider(provider, "transformers")
    classifier = resolved_provider.get_text_classifier(model, **options)

    # TODO(rchowell): classification with structured outputs will be more interesting
    # A bare string is treated as a one-element label list.
    if isinstance(labels, str):
        candidate_labels = [labels]
    else:
        candidate_labels = labels

    settings = classifier.get_udf_options()

    # Mark __call__ as a batch method returning strings. NOTE(review): this rebinds the
    # attribute on the shared class every time classify_text is called.
    _TextClassificationExpression.__call__ = method.batch(  # type: ignore[method-assign]
        method=_TextClassificationExpression.__call__,
        return_dtype=DataType.string(),
    )

    # Wrap the expression class as a UDF configured from the classifier's own options.
    classifier_udf = daft_cls(
        _TextClassificationExpression,
        max_concurrency=settings.concurrency,
        gpus=settings.num_gpus or 0,
        max_retries=settings.max_retries,
        on_error=settings.on_error,
        name_override="classify_text",
    )

    # Instantiate with the classifier and labels, then apply it to the text column.
    return classifier_udf(classifier, candidate_labels)(text)