Skip to content

AI#

Feature coverage table:

Feature OpenAI Google vLLM Prefix Caching LM Studio Transformers
Prompt x
Embedding x x
Classification x x x x
Image Classification x x x x

Note

Open an issue on our GitHub if you would like to see support for a feature.

Functions#

prompt #

prompt(messages: list[Expression] | Expression, return_format: BaseModel | None = None, *, system_message: str | None = None, provider: Literal['openai'] | OpenAIProvider, model: str | None = None, **options: Unpack[OpenAIPromptOptions]) -> Expression
prompt(messages: list[Expression] | Expression, return_format: BaseModel | None = None, *, system_message: str | None = None, provider: str | None, model: str | None = None, **options: Unpack[PromptOptions]) -> Expression
prompt(messages: list[Expression] | Expression, return_format: BaseModel | None = None, *, system_message: str | None = None, provider: str | Provider | None = None, model: str | None = None, **options: Any) -> Expression

Returns an expression that prompts a large language model using the specified model and provider.

Parameters:

Name Type Description Default
messages list[Expression] | Expression

The message expression, or list of message expressions, to prompt the model with. Each expression can be one of:

- Plain text strings (always treated as input_text)
- Image data (NumPy arrays, bytes, or File objects; detected by MIME type)
- Files (PDF, TXT, HTML, audio, video, etc.) as bytes or File objects (detected by MIME type)

required
return_format BaseModel | None

The return format for the prompt. Use a Pydantic model for structured outputs.

None
system_message str | None

The system message for the prompt.

None
provider str | Provider | None

The provider to use for the prompt (default: "openai").

None
model str | None

The model to use for the prompt.

None
**options Any

Any additional options to pass for the prompt.

{}

Returns:

Name Type Description
Expression String Expression

An expression representing the prompt result.

Examples:

Basic Usage:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
>>> import daft
>>> from daft.ai.openai.provider import OpenAIProvider
>>> from daft.functions.ai import prompt
>>> # Create a dataframe with the quotes
>>> df = daft.from_pydict(
...     {
...         "quote": [
...             "I am going to be the king of the pirates!",
...             "I'm going to be the next Hokage!",
...         ],
...     }
... )
>>> # Use the prompt function to classify the quotes
>>> df = df.with_column(
...     "response",
...     prompt(
...         daft.col("quote"),
...         system_message="Classify the anime from the quote and return the show, character name, and explanation.",
...         provider="openai",  # Make sure OPENAI_API_KEY is set
...         model="gpt-5-nano",
...     ),
... )
>>> df.show(format="fancy", max_width=120)
╭───────────────────────────────────────────┬─────────────────────────────────────────────────────────╮
│ quote                                     ┆ response                                                │
╞═══════════════════════════════════════════╪═════════════════════════════════════════════════════════╡
│ I am going to be the king of the pirates! ┆ **Anime Name:** *One Piece*                             │
│                                           ┆ **Character:** Monkey D. Luffy                          │
│                                           ┆ **Quote:** "I am going to be the king of the pirates!"… │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ I'm going to be the next Hokage!          ┆ **Name:** Naruto                                        │
│                                           ┆ **Character:** Naruto Uzumaki                           │
│                                           ┆ **Quote:** *"I'm going to be the next Hokage!"*         │
│                                           ┆                                                         │
│                                           ┆ This quote refl…                                        │
╰───────────────────────────────────────────┴─────────────────────────────────────────────────────────╯

Structured Outputs with Custom OpenAI Provider:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
>>> import os
>>> from dotenv import load_dotenv
>>> import daft
>>> from daft.ai.openai.provider import OpenAIProvider
>>> from daft.functions.ai import prompt
>>> from daft.functions import unnest
>>> from daft.session import Session
>>> from pydantic import BaseModel, Field
>>> # Load environment variables
>>> load_dotenv()
>>> class Anime(BaseModel):
...     show: str = Field(description="The name of the anime show")
...     character: str = Field(description="The name of the character who says the quote")
...     explanation: str = Field(description="Why the character says the quote")
...
>>> # Create an OpenRouter provider
>>> openrouter_provider = OpenAIProvider(
...     name="OpenRouter", base_url="https://openrouter.ai/api/v1", api_key=os.environ.get("OPENROUTER_API_KEY")
... )
>>> # Create a session and attach the provider
>>> sess = Session()
>>> sess.attach_provider(openrouter_provider)
>>> sess.set_provider("OpenRouter")
>>> # Create a dataframe with the quotes
>>> df = daft.from_pydict(
...     {
...         "quote": [
...             "I am going to be the king of the pirates!",
...             "I'm going to be the next Hokage!",
...         ],
...     }
... )
>>> # Use the prompt function to classify the quotes
>>> df = df.with_column(
...     "nemotron-response",
...     prompt(
...         daft.col("quote"),
...         system_message="Classify the anime from the quote and return the show, character name, and explanation.",
...         return_format=Anime,
...         provider=sess.get_provider("OpenRouter"),
...         model="nvidia/nemotron-nano-9b-v2:free",
...     ),
... ).select("quote", unnest(daft.col("nemotron-response")))
>>> df.show(format="fancy", max_width=120)
╭───────────────────────────────────────────┬───────────┬─────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
│ quote                                     ┆ show      ┆ character       ┆ explanation                                                                                                            │
╞═══════════════════════════════════════════╪═══════════╪═════════════════╪════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╡
│ I am going to be the king of the pirates! ┆ One Piece ┆ Monkey D. Luffy ┆ Luffy famously states his dream of becoming the Pirate King throughout the series.                                     │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ I'm going to be the next Hokage!          ┆ Naruto    ┆ Naruto Uzumaki  ┆ The phrase 'I'm going to be the next Hokage!' is a recurring aspiration in the *Naruto* series, particularly voiced b… │
╰───────────────────────────────────────────┴───────────┴─────────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
(Showing first 2 of 2 rows)
Source code in daft/functions/ai/__init__.py
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
def prompt(
    messages: list[Expression] | Expression,
    return_format: BaseModel | None = None,
    *,
    system_message: str | None = None,
    provider: str | Provider | None = None,
    model: str | None = None,
    **options: Any,
) -> Expression:
    """Build an expression that sends each row's messages to a large language model.

    Args:
        messages (list[Expression] | Expression): A single message expression, or a list of
            message expressions, to prompt the model with. Each expression can be one of:
            - Plain text strings (always treated as input_text)
            - Image data (numpy arrays, bytes, or File objects - detected by MIME type)
            - Files (PDF, TXT, HTML, audio, video, etc.) as bytes or File objects (detected by MIME type)
        return_format (BaseModel | None): The return format for the prompt. Use a Pydantic model
            for structured outputs.
        system_message (str | None): The system message for the prompt.
        provider (str | Provider | None): The provider to use for the prompt (default: "openai").
        model (str | None): The model to use for the prompt.
        **options: Any additional options, forwarded to the provider's ``get_prompter``.

    Returns:
        Expression (String Expression): An expression representing the prompt result. When
        ``return_format`` is given and a dtype can be inferred from it, the expression uses that
        dtype; otherwise it is a string expression.

    Raises:
        ValueError: If the resolved prompter is the vLLM prefix-caching prompter and
            ``return_format`` or ``system_message`` is set, or ``messages`` is a list.

    Examples:
        Basic Usage:
        >>> import daft
        >>> from daft.ai.openai.provider import OpenAIProvider
        >>> from daft.functions.ai import prompt
        >>> # Create a dataframe with the quotes
        >>> df = daft.from_pydict(
        ...     {
        ...         "quote": [
        ...             "I am going to be the king of the pirates!",
        ...             "I'm going to be the next Hokage!",
        ...         ],
        ...     }
        ... )
        >>> # Use the prompt function to classify the quotes
        >>> df = df.with_column(
        ...     "response",
        ...     prompt(
        ...         daft.col("quote"),
        ...         system_message="Classify the anime from the quote and return the show, character name, and explanation.",
        ...         provider="openai",  # Make sure OPENAI_API_KEY is set
        ...         model="gpt-5-nano",
        ...     ),
        ... )
        >>> df.show(format="fancy", max_width=120)
        ╭───────────────────────────────────────────┬─────────────────────────────────────────────────────────╮
        │ quote                                     ┆ response                                                │
        ╞═══════════════════════════════════════════╪═════════════════════════════════════════════════════════╡
        │ I am going to be the king of the pirates! ┆ **Anime Name:** *One Piece*                             │
        │                                           ┆ **Character:** Monkey D. Luffy                          │
        │                                           ┆ **Quote:** "I am going to be the king of the pirates!"… │
        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ I'm going to be the next Hokage!          ┆ **Name:** Naruto                                        │
        │                                           ┆ **Character:** Naruto Uzumaki                           │
        │                                           ┆ **Quote:** *"I'm going to be the next Hokage!"*         │
        │                                           ┆                                                         │
        │                                           ┆ This quote refl…                                        │
        ╰───────────────────────────────────────────┴─────────────────────────────────────────────────────────╯

        Structured Outputs with Custom OpenAI Provider:
        >>> import os
        >>> from dotenv import load_dotenv
        >>> import daft
        >>> from daft.ai.openai.provider import OpenAIProvider
        >>> from daft.functions.ai import prompt
        >>> from daft.functions import unnest
        >>> from daft.session import Session
        >>> from pydantic import BaseModel, Field
        >>> # Load environment variables
        >>> load_dotenv()
        >>> class Anime(BaseModel):
        ...     show: str = Field(description="The name of the anime show")
        ...     character: str = Field(description="The name of the character who says the quote")
        ...     explanation: str = Field(description="Why the character says the quote")
        ...
        >>> # Create an OpenRouter provider
        >>> openrouter_provider = OpenAIProvider(
        ...     name="OpenRouter", base_url="https://openrouter.ai/api/v1", api_key=os.environ.get("OPENROUTER_API_KEY")
        ... )
        >>> # Create a session and attach the provider
        >>> sess = Session()
        >>> sess.attach_provider(openrouter_provider)
        >>> sess.set_provider("OpenRouter")
        >>> # Create a dataframe with the quotes
        >>> df = daft.from_pydict(
        ...     {
        ...         "quote": [
        ...             "I am going to be the king of the pirates!",
        ...             "I'm going to be the next Hokage!",
        ...         ],
        ...     }
        ... )
        >>> # Use the prompt function to classify the quotes
        >>> df = df.with_column(
        ...     "nemotron-response",
        ...     prompt(
        ...         daft.col("quote"),
        ...         system_message="Classify the anime from the quote and return the show, character name, and explanation.",
        ...         return_format=Anime,
        ...         provider=sess.get_provider("OpenRouter"),
        ...         model="nvidia/nemotron-nano-9b-v2:free",
        ...     ),
        ... ).select("quote", unnest(daft.col("nemotron-response")))
        >>> df.show(format="fancy", max_width=120)
        ╭───────────────────────────────────────────┬───────────┬─────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
        │ quote                                     ┆ show      ┆ character       ┆ explanation                                                                                                            │
        ╞═══════════════════════════════════════════╪═══════════╪═════════════════╪════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╡
        │ I am going to be the king of the pirates! ┆ One Piece ┆ Monkey D. Luffy ┆ Luffy famously states his dream of becoming the Pirate King throughout the series.                                     │
        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ I'm going to be the next Hokage!          ┆ Naruto    ┆ Naruto Uzumaki  ┆ The phrase 'I'm going to be the next Hokage!' is a recurring aspiration in the *Naruto* series, particularly voiced b… │
        ╰───────────────────────────────────────────┴───────────┴─────────────────┴────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
        <BLANKLINE>
        (Showing first 2 of 2 rows)
    """
    from daft.ai._expressions import _PrompterExpression

    # On Colab, swap in a cleaned copy of the Pydantic model to avoid serialization issues.
    if return_format is not None:
        if IS_COLAB:
            return_format = clean_pydantic_model(return_format)

    # Resolve the provider ("openai" is the fallback name) and ask it for a prompter
    # descriptor; return_format and system_message travel as explicit keyword arguments.
    resolved_provider = _resolve_provider(provider, "openai")
    descriptor = resolved_provider.get_prompter(
        model,
        return_format=return_format,
        system_message=system_message,
        **options,
    )

    # The vLLM prefix-caching descriptor bypasses the UDF machinery and goes straight
    # through PyExpr.vllm.
    from daft.ai.vllm.protocols.prompter import VLLMPrefixCachingPrompterDescriptor

    if isinstance(descriptor, VLLMPrefixCachingPrompterDescriptor):
        # This path accepts exactly one message expression and nothing else.
        if return_format is not None:
            raise ValueError("return_format is not supported for vLLM provider")
        if system_message is not None:
            raise ValueError("system_message is not supported for vLLM provider")
        if isinstance(messages, list):
            raise ValueError("vLLM provider does not support multiple messages")

        engine_options = descriptor.get_options()
        # PyExpr.vllm takes these positionally, so the key order below is significant.
        positional_keys = (
            "concurrency",
            "gpus_per_actor",
            "do_prefix_routing",
            "max_buffer_size",
            "min_bucket_size",
            "prefix_match_threshold",
            "load_balance_threshold",
            "batch_size",
            "engine_args",
            "generate_args",
        )
        py_expr = messages._expr.vllm(
            descriptor.model_name,
            *(engine_options[key] for key in positional_keys),
        )
        return Expression._from_pyexpr(py_expr)

    # Every other provider runs through the standard UDF-based execution path.
    from daft.udf import method

    # Output dtype: inferred from return_format when possible, plain string otherwise.
    result_dtype = DataType.string()
    if return_format is not None:
        try:
            result_dtype = DataType.infer_from_type(return_format)
        except Exception:
            # Inference failed for this model; fall back to a string column.
            result_dtype = DataType.string()

    # Execution settings (GPUs, concurrency, retries, error policy) come from the descriptor.
    execution_options = descriptor.get_udf_options()

    # Bind __call__ to the prompt method wrapped with the chosen return dtype.
    # Note that this assigns onto the _PrompterExpression class itself.
    typed_call = method(method=_PrompterExpression.prompt, return_dtype=result_dtype)
    _PrompterExpression.__call__ = typed_call  # type: ignore[method-assign]

    # Wrap the class with @daft.cls and instantiate it with the descriptor.
    prompter_cls = daft_cls(
        _PrompterExpression,
        gpus=execution_options.num_gpus or 0,
        max_concurrency=execution_options.concurrency,
        max_retries=execution_options.max_retries,
        on_error=execution_options.on_error,
        name_override="prompt",
    )
    prompter = prompter_cls(descriptor)

    # A list of messages is spread into positional arguments; a single expression is passed as-is.
    call_args = messages if isinstance(messages, list) else [messages]
    return prompter(*call_args)

embed_text #

embed_text(text: Expression, *, provider: str | Provider | None = None, model: str | None = None, dimensions: int | None = None, **options: Unpack[EmbedTextOptions]) -> Expression

Returns an expression that embeds text using the specified embedding model and provider.

Parameters:

Name Type Description Default
text String Expression

The input text column expression.

required
provider str | Provider | None

The provider to use for the embedding model. If None, the default provider is used.

None
model str | None

The embedding model to use. Can be a model instance or a model name. If None, the default model is used.

None
dimensions int | None

Number of dimensions the output embeddings should have, if the provider and model support specifying. If None, will use the default for the model.

None
**options Unpack[EmbedTextOptions]

Any additional options to pass for the model.

{}
Note

Make sure the required provider packages are installed (e.g. vllm, transformers, openai).

Returns:

Name Type Description
Expression Embedding Expression

An expression representing the embedded text vectors.

Examples:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
>>> import daft
>>> from daft.functions import embed_text
>>> df = daft.read_huggingface("togethercomputer/RedPajama-Data-1T")
>>> # Embed Text with Defaults
>>> df = df.with_column(
...     "embeddings",
...     embed_text(
...         daft.col("text"),
...         provider="transformers",
...         model="sentence-transformers/all-MiniLM-L6-v2",
...     ),
... )
>>> df.limit(3).show()
╭────────────────────────────────┬────────────────────────────────┬───────────────────┬──────────────────────────╮
│ text                           ┆ meta                           ┆ red_pajama_subset ┆ embeddings               │
│ ---                            ┆ ---                            ┆ ---               ┆ ---                      │
│ String                         ┆ String                         ┆ String            ┆ Embedding[Float32; 384]  │
╞════════════════════════════════╪════════════════════════════════╪═══════════════════╪══════════════════════════╡
│ Григорианският календар (поня… ┆ {'title': 'Григориански кален… ┆ wikipedia         ┆ ▃▆█▆▆▆█▇▆▅▃▆▆▅▅▆▅▅▂▂▇▇▄▁ │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ GNU General Public License (н… ┆ {'title': 'GNU General Public… ┆ wikipedia         ┆ ▆▁▇█▄▅▄▅▄▄▁▆▃▅▂▃▆▃▄▃█▆▇▅ │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ Лицензът за свободна документ… ┆ {'title': 'Лиценз за свободна… ┆ wikipedia         ┆ ▄▆██▇▇▇█▇▆▂▇▄▁▅▃▇▇▃▃▆▆▅▂ │
╰────────────────────────────────┴────────────────────────────────┴───────────────────┴──────────────────────────╯
(Showing first 3 of 3 rows)
Source code in daft/functions/ai/__init__.py
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
def embed_text(
    text: Expression,
    *,
    provider: str | Provider | None = None,
    model: str | None = None,
    dimensions: int | None = None,
    **options: Unpack[EmbedTextOptions],
) -> Expression:
    """Build an expression that embeds text with the given embedding model and provider.

    Args:
        text (String Expression):
            The input text column expression.
        provider (str | Provider | None):
            The provider to use for the embedding model. If None, the default provider is used.
        model (str | None):
            The embedding model to use. Can be a model instance or a model name. If None, the default model is used.
        dimensions (int | None):
            Number of dimensions the output embeddings should have, if the provider and model support specifying.
            If None, will use the default for the model.
        **options: Any additional options to pass for the model.

    Note:
        Make sure the required provider packages are installed (e.g. vllm, transformers, openai).

    Returns:
        Expression (Embedding Expression): An expression representing the embedded text vectors.

    Examples:
        >>> import daft
        >>> from daft.functions import embed_text
        >>> df = daft.read_huggingface("togethercomputer/RedPajama-Data-1T")
        >>> # Embed Text with Defaults
        >>> df = df.with_column(
        ...     "embeddings",
        ...     embed_text(
        ...         daft.col("text"),
        ...         provider="transformers",
        ...         model="sentence-transformers/all-MiniLM-L6-v2",
        ...     ),
        ... )
        >>> df.limit(3).show()
        ╭────────────────────────────────┬────────────────────────────────┬───────────────────┬──────────────────────────╮
        │ text                           ┆ meta                           ┆ red_pajama_subset ┆ embeddings               │
        │ ---                            ┆ ---                            ┆ ---               ┆ ---                      │
        │ String                         ┆ String                         ┆ String            ┆ Embedding[Float32; 384]  │
        ╞════════════════════════════════╪════════════════════════════════╪═══════════════════╪══════════════════════════╡
        │ Григорианският календар (поня… ┆ {'title': 'Григориански кален… ┆ wikipedia         ┆ ▃▆█▆▆▆█▇▆▅▃▆▆▅▅▆▅▅▂▂▇▇▄▁ │
        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ GNU General Public License (н… ┆ {'title': 'GNU General Public… ┆ wikipedia         ┆ ▆▁▇█▄▅▄▅▄▄▁▆▃▅▂▃▆▃▄▃█▆▇▅ │
        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ Лицензът за свободна документ… ┆ {'title': 'Лиценз за свободна… ┆ wikipedia         ┆ ▄▆██▇▇▇█▇▆▂▇▄▁▅▃▇▇▃▃▆▆▅▂ │
        ╰────────────────────────────────┴────────────────────────────────┴───────────────────┴──────────────────────────╯
        <BLANKLINE>
        (Showing first 3 of 3 rows)
    """
    from daft.ai._expressions import _TextEmbedderExpression

    # Resolve the provider ("transformers" is the fallback name) and obtain a
    # text-embedder descriptor for the requested model and dimensions.
    resolved_provider = _resolve_provider(provider, "transformers")
    embedder_descriptor = resolved_provider.get_text_embedder(model, dimensions, **options)

    execution_options = embedder_descriptor.get_udf_options()

    # The descriptor decides whether the async or the sync batch implementation is used.
    if embedder_descriptor.is_async():
        batch_impl = _TextEmbedderExpression._call_async
    else:
        batch_impl = _TextEmbedderExpression._call_sync

    # Bind __call__ to the chosen implementation as a batch method whose return dtype is
    # derived from the embedder's dimensions. This assigns onto the class itself.
    batched_call = method.batch(
        method=batch_impl,
        return_dtype=embedder_descriptor.get_dimensions().as_dtype(),
        batch_size=execution_options.batch_size,
    )
    _TextEmbedderExpression.__call__ = batched_call  # type: ignore[method-assign]

    # Wrap the class with @daft.cls using the descriptor's execution settings.
    embedder_cls = daft_cls(
        _TextEmbedderExpression,
        max_concurrency=execution_options.concurrency,
        gpus=execution_options.num_gpus or 0,
        max_retries=execution_options.max_retries,
        on_error=execution_options.on_error,
        name_override="embed_text",
    )

    # Instantiate with the descriptor, then apply the instance to the text expression.
    embedder = embedder_cls(embedder_descriptor)
    return embedder(text)

embed_image #

embed_image(image: Expression, *, provider: str | Provider | None = None, model: str | None = None, **options: Unpack[EmbedImageOptions]) -> Expression

Returns an expression that embeds images using the specified image model and provider.

Parameters:

Name Type Description Default
image Image Expression

The input image column expression.

required
provider str | Provider | None

The provider to use for the image model. If None, the default provider is used.

None
model str | None

The image model to use. Can be a model instance or a model name. If None, the default model is used.

None
**options Unpack[EmbedImageOptions]

Any additional options to pass for the model.

{}
Note

Make sure the required provider packages are installed (e.g. vllm, transformers, openai).

Returns:

Name Type Description
Expression Embedding Expression

An expression representing the embedded image vectors.

Examples:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
>>> import daft
>>> from daft.functions import embed_image, decode_image
>>> df = (
...     # Discover a few images from HuggingFace
...     daft.from_glob_path("hf://datasets/datasets-examples/doc-image-3/images")
...     # Read the 4 PNG, JPEG, TIFF, WEBP Images
...     .with_column("image_bytes", daft.col("path").download())
...     # Decode the image bytes into a daft Image DataType
...     .with_column("image_type", decode_image(daft.col("image_bytes")))
...     # Convert Image to RGB and resize the image to 288x288
...     .with_column("image_resized", daft.col("image_type").convert_image("RGB").resize(288, 288))
...     # Embed the image
...     .with_column(
...         "image_embeddings",
...         embed_image(
...             daft.col("image_resized"), provider="transformers", model="apple/aimv2-large-patch14-224-lit"
...         ),
...     )
... )
>>> df.show()
╭────────────────────────────────┬─────────┬───────────────┬──────────────┬───────────────────────┬──────────────────────────╮
│ path                           ┆ size    ┆ image_bytes   ┆ image_type   ┆ image_resized         ┆ image_embeddings         │
│ ---                            ┆ ---     ┆ ---           ┆ ---          ┆ ---                   ┆ ---                      │
│ String                         ┆ Int64   ┆ Binary        ┆ Image[MIXED] ┆ Image[RGB; 288 x 288] ┆ Embedding[Float32; 768]  │
╞════════════════════════════════╪═════════╪═══════════════╪══════════════╪═══════════════════════╪══════════════════════════╡
│ hf://datasets/datasets-exampl… ┆ 113469  ┆ ...           ┆ <Image>      ┆ <FixedShapeImage>     ┆ ▃▅▅▆▆▂▅▆▅▇█▂▂▄▅▂▆▃▃▅▁▇▃▅ │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ hf://datasets/datasets-exampl… ┆ 206898  ┆ ...           ┆ <Image>      ┆ <FixedShapeImage>     ┆ ▃▃▄▆▄▅▃▄▅▅▅▃▂▇▁▁▁▂▃▅▄█▃▅ │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ hf://datasets/datasets-exampl… ┆ 1871034 ┆ ...           ┆ <Image>      ┆ <FixedShapeImage>     ┆ ▂▃▃▃▄▄▃▆▆▄▅▂▁▃▁▄▃▅▄▄▂█▆▆ │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ hf://datasets/datasets-exampl… ┆ 22022   ┆ ...           ┆ <Image>      ┆ <FixedShapeImage>     ┆ ▄▂▂▅▆▆▅▇▆▄▅▆▃▅▅▁▃▄▄▄▃█▃▆ │
╰────────────────────────────────┴─────────┴───────────────┴──────────────┴───────────────────────┴──────────────────────────╯
(Showing first 4 of 4 rows)
Source code in daft/functions/ai/__init__.py
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
def embed_image(
    image: Expression,
    *,
    provider: str | Provider | None = None,
    model: str | None = None,
    **options: Unpack[EmbedImageOptions],
) -> Expression:
    """Builds an expression that computes image embeddings with the chosen provider and model.

    The provider is resolved through ``_resolve_provider`` (with ``"transformers"`` passed as
    the fallback), an image-embedder descriptor is requested from it, and the descriptor's UDF
    options are used to configure a class-based UDF that is applied to ``image``.

    Args:
        image (Image Expression): The input image column expression.
        provider (str | Provider | None): The provider to use for the image model. If None, the default provider is used.
        model (str | None): The name of the image model to use. If None, the default model is used.
        **options: Any additional options to pass for the model.

    Note:
        Make sure the required provider packages are installed (e.g. vllm, transformers, openai).

    Returns:
        Expression (Embedding Expression): An expression representing the embedded image vectors.

    Examples:
        >>> import daft
        >>> from daft.functions import embed_image, decode_image
        >>> df = (
        ...     # Discover a few images from HuggingFace
        ...     daft.from_glob_path("hf://datasets/datasets-examples/doc-image-3/images")
        ...     # Read the 4 PNG, JPEG, TIFF, WEBP Images
        ...     .with_column("image_bytes", daft.col("path").download())
        ...     # Decode the image bytes into a daft Image DataType
        ...     .with_column("image_type", decode_image(daft.col("image_bytes")))
        ...     # Convert Image to RGB and resize the image to 288x288
        ...     .with_column("image_resized", daft.col("image_type").convert_image("RGB").resize(288, 288))
        ...     # Embed the image
        ...     .with_column(
        ...         "image_embeddings",
        ...         embed_image(
        ...             daft.col("image_resized"), provider="transformers", model="apple/aimv2-large-patch14-224-lit"
        ...         ),
        ...     )
        ... )
        >>> df.show()
        ╭────────────────────────────────┬─────────┬───────────────┬──────────────┬───────────────────────┬──────────────────────────╮
        │ path                           ┆ size    ┆ image_bytes   ┆ image_type   ┆ image_resized         ┆ image_embeddings         │
        │ ---                            ┆ ---     ┆ ---           ┆ ---          ┆ ---                   ┆ ---                      │
        │ String                         ┆ Int64   ┆ Binary        ┆ Image[MIXED] ┆ Image[RGB; 288 x 288] ┆ Embedding[Float32; 768]  │
        ╞════════════════════════════════╪═════════╪═══════════════╪══════════════╪═══════════════════════╪══════════════════════════╡
        │ hf://datasets/datasets-exampl… ┆ 113469  ┆ ...           ┆ <Image>      ┆ <FixedShapeImage>     ┆ ▃▅▅▆▆▂▅▆▅▇█▂▂▄▅▂▆▃▃▅▁▇▃▅ │
        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ hf://datasets/datasets-exampl… ┆ 206898  ┆ ...           ┆ <Image>      ┆ <FixedShapeImage>     ┆ ▃▃▄▆▄▅▃▄▅▅▅▃▂▇▁▁▁▂▃▅▄█▃▅ │
        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ hf://datasets/datasets-exampl… ┆ 1871034 ┆ ...           ┆ <Image>      ┆ <FixedShapeImage>     ┆ ▂▃▃▃▄▄▃▆▆▄▅▂▁▃▁▄▃▅▄▄▂█▆▆ │
        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ hf://datasets/datasets-exampl… ┆ 22022   ┆ ...           ┆ <Image>      ┆ <FixedShapeImage>     ┆ ▄▂▂▅▆▆▅▇▆▄▅▆▃▅▅▁▃▄▄▄▃█▃▆ │
        ╰────────────────────────────────┴─────────┴───────────────┴──────────────┴───────────────────────┴──────────────────────────╯
        <BLANKLINE>
        (Showing first 4 of 4 rows)
    """
    from daft.ai._expressions import _ImageEmbedderExpression

    resolved_provider = _resolve_provider(provider, "transformers")
    descriptor = resolved_provider.get_image_embedder(model, **options)
    udf_opts = descriptor.get_udf_options()

    # The descriptor reports whether its embedder is awaitable; pick the matching call path.
    if descriptor.is_async():
        batch_impl = _ImageEmbedderExpression._call_async
    else:
        batch_impl = _ImageEmbedderExpression._call_sync

    # The return dtype is only known once the descriptor is resolved, so the batch
    # method is attached to the expression class here rather than at class definition.
    embedding_dtype = descriptor.get_dimensions().as_dtype()
    _ImageEmbedderExpression.__call__ = method.batch(  # type: ignore[method-assign]
        method=batch_impl,
        return_dtype=embedding_dtype,
        batch_size=udf_opts.batch_size,
    )

    embed_udf_cls = daft_cls(
        _ImageEmbedderExpression,
        max_concurrency=udf_opts.concurrency,
        gpus=udf_opts.num_gpus or 0,
        max_retries=udf_opts.max_retries,
        on_error=udf_opts.on_error,
        name_override="embed_image",
    )

    return embed_udf_cls(descriptor)(image)

classify_text #

classify_text(text: Expression, labels: Label | list[Label], *, provider: str | Provider | None = None, model: str | None = None, **options: Unpack[ClassifyTextOptions]) -> Expression

Returns an expression that classifies text using the specified model and provider.

Parameters:

Name Type Description Default
text String Expression

The input text column expression.

required
labels str | list[str]

Label(s) for classification.

required
provider str | Provider | None

The provider to use for the classification model. By default this will use the 'transformers' provider.

None
model str | None

The name of the classifier model to use. By default this will use a zero-shot-classification model.

None
**options Unpack[ClassifyTextOptions]

Any additional options to pass for the model.

{}
Note

Make sure the required provider packages are installed (e.g. vllm, transformers, openai).

Returns:

Name Type Description
Expression String Expression

An expression representing the most-probable label string.

Examples:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
>>> import daft
>>> from daft.functions import classify_text
>>> df = daft.from_pydict({"text": ["Daft is wicked fast!"]})
>>> df = df.with_column(
...     "label",
...     classify_text(
...         daft.col("text"),
...         labels=["Positive", "Negative"],
...         provider="transformers",
...         model="tabularisai/multilingual-sentiment-analysis",
...     ),
... )
>>> df.show()
╭─────────────────────┬───────────╮
│ text                ┆ label     │
│ ---                 ┆ ---       │
│ String              ┆ String    │
╞═════════════════════╪═══════════╡
│ Daft is wicked fast!┆ Positive  │
╰─────────────────────┴───────────╯
(Showing first 1 of 1 rows)
Source code in daft/functions/ai/__init__.py
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
def classify_text(
    text: Expression,
    labels: Label | list[Label],
    *,
    provider: str | Provider | None = None,
    model: str | None = None,
    **options: Unpack[ClassifyTextOptions],
) -> Expression:
    """Returns an expression that classifies text using the specified model and provider.

    Args:
        text (String Expression):
            The input text column expression.
        labels (str | list[str]):
            Label(s) for classification. A single string label is wrapped into a one-element list.
        provider (str | Provider | None):
            The provider to use for the classification model.
            By default this will use the 'transformers' provider.
        model (str | None):
            The name of the classifier model to use.
            By default this will use a `zero-shot-classification` model.
        **options:
            Any additional options to pass for the model.

    Note:
        Make sure the required provider packages are installed (e.g. vllm, transformers, openai).

    Returns:
        Expression (String Expression): An expression representing the most-probable label string.

    Examples:
        >>> import daft
        >>> from daft.functions import classify_text
        >>> df = daft.from_pydict({"text": ["Daft is wicked fast!"]})
        >>> df = df.with_column(
        ...     "label",
        ...     classify_text(
        ...         daft.col("text"),
        ...         labels=["Positive", "Negative"],
        ...         provider="transformers",
        ...         model="tabularisai/multilingual-sentiment-analysis",
        ...     ),
        ... )
        >>> df.show()
        ╭─────────────────────┬───────────╮
        │ text                ┆ label     │
        │ ---                 ┆ ---       │
        │ String              ┆ String    │
        ╞═════════════════════╪═══════════╡
        │ Daft is wicked fast!┆ Positive  │
        ╰─────────────────────┴───────────╯
        <BLANKLINE>
        (Showing first 1 of 1 rows)
    """
    from daft.ai._expressions import _TextClassificationExpression

    # Resolve the provider ("transformers" is passed as the fallback) and ask it for a classifier descriptor.
    text_classifier = _resolve_provider(provider, "transformers").get_text_classifier(model, **options)

    # TODO(rchowell): classification with structured outputs will be more interesting
    # Normalize a single string label into a list so the expression always receives a list.
    label_list = [labels] if isinstance(labels, str) else labels

    udf_options = text_classifier.get_udf_options()
    # Decorate the __call__ method with @daft.method to specify return_dtype
    # NOTE(review): this rebinds __call__ on the shared class from its current value, so a
    # second call to classify_text appears to wrap the already-wrapped method — confirm that
    # method.batch tolerates being applied repeatedly.
    _TextClassificationExpression.__call__ = method.batch(  # type: ignore[method-assign]
        method=_TextClassificationExpression.__call__, return_dtype=DataType.string()
    )
    # Wrap the expression class as a class-based UDF configured from the descriptor's UDF options.
    wrapped_cls = daft_cls(
        _TextClassificationExpression,
        max_concurrency=udf_options.concurrency,
        gpus=udf_options.num_gpus or 0,
        max_retries=udf_options.max_retries,
        on_error=udf_options.on_error,
        name_override="classify_text",
    )

    # Instantiate with the descriptor and labels, then apply the instance to the text column.
    expr = wrapped_cls(text_classifier, label_list)
    return expr(text)

classify_image #

classify_image(image: Expression, labels: Label | list[Label], *, provider: str | Provider | None = None, model: str | None = None, **options: Unpack[ClassifyImageOptions]) -> Expression

Returns an expression that classifies images using the specified model and provider.

Parameters:

Name Type Description Default
image Image Expression

The input image column expression.

required
labels str | list[str]

Label(s) for classification.

required
provider str | Provider | None

The provider to use for the classification model. By default this will use the 'transformers' provider.

None
model str | None

The name of the classifier model to use. By default this will use a zero-shot-classification model.

None
**options Unpack[ClassifyImageOptions]

Any additional options to pass for the model.

{}
Note

Make sure the required provider packages are installed (e.g. vllm, transformers, openai).

Returns:

Name Type Description
Expression String Expression

An expression representing the most-probable label string.

Examples:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
>>> import daft
>>> from daft.functions import classify_image, decode_image
>>> df = (
...     # Discover a few images from HuggingFace
...     daft.from_glob_path("hf://datasets/datasets-examples/doc-image-3/images")
...     # Read the 4 PNG, JPEG, TIFF, WEBP Images
...     .with_column("image_bytes", daft.col("path").download())
...     # Decode the image bytes into a daft Image DataType
...     .with_column("image_type", decode_image(daft.col("image_bytes")))
...     # Convert Image to RGB and resize the image to 288x288
...     .with_column("image_resized", daft.col("image_type").convert_image("RGB").resize(288, 288))
...     # Classify the image
...     .with_column(
...         "image_label",
...         classify_image(
...             daft.col("image_resized"),
...             labels=["bulbasaur", "catapie", "voltorb", "electrode"],
...             provider="transformers",
...             model="openai/clip-vit-base-patch32",
...         ),
...     )
... )
>>> df.show()
╭────────────────────────────────┬─────────┬────────────────┬──────────────┬───────────────────────┬───────────────╮
│ path                           ┆ size    ┆ image_bytes    ┆ image_type   ┆ image_resized         ┆ image_label   │
│ ---                            ┆ ---     ┆ ---            ┆ ---          ┆ ---                   ┆ ---           │
│ String                         ┆ Int64   ┆ Binary         ┆ Image[MIXED] ┆ Image[RGB; 288 x 288] ┆ String        │
╞════════════════════════════════╪═════════╪════════════════╪══════════════╪═══════════════════════╪═══════════════╡
│ hf://datasets/datasets-exampl… ┆ 113469  ┆ ...            ┆ <Image>      ┆ <FixedShapeImage>     ┆ bulbasaur     │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ hf://datasets/datasets-exampl… ┆ 206898  ┆ ...            ┆ <Image>      ┆ <FixedShapeImage>     ┆ catapie       │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ hf://datasets/datasets-exampl… ┆ 1871034 ┆ ...            ┆ <Image>      ┆ <FixedShapeImage>     ┆ voltorb       │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ hf://datasets/datasets-exampl… ┆ 22022   ┆ ...            ┆ <Image>      ┆ <FixedShapeImage>     ┆ electrode     │
╰────────────────────────────────┴─────────┴────────────────┴──────────────┴───────────────────────┴───────────────╯
(Showing first 4 of 4 rows)
Source code in daft/functions/ai/__init__.py
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
def classify_image(
    image: Expression,
    labels: Label | list[Label],
    *,
    provider: str | Provider | None = None,
    model: str | None = None,
    **options: Unpack[ClassifyImageOptions],
) -> Expression:
    """Returns an expression that classifies images using the specified model and provider.

    Args:
        image (Image Expression):
            The input image column expression.
        labels (str | list[str]):
            Label(s) for classification. A single string label is wrapped into a one-element list.
        provider (str | Provider | None):
            The provider to use for the classification model.
            By default this will use the 'transformers' provider.
        model (str | None):
            The name of the classifier model to use.
            By default this will use a `zero-shot-classification` model.
        **options:
            Any additional options to pass for the model.

    Note:
        Make sure the required provider packages are installed (e.g. vllm, transformers, openai).

    Returns:
        Expression (String Expression): An expression representing the most-probable label string.

    Examples:
        >>> import daft
        >>> from daft.functions import classify_image, decode_image
        >>> df = (
        ...     # Discover a few images from HuggingFace
        ...     daft.from_glob_path("hf://datasets/datasets-examples/doc-image-3/images")
        ...     # Read the 4 PNG, JPEG, TIFF, WEBP Images
        ...     .with_column("image_bytes", daft.col("path").download())
        ...     # Decode the image bytes into a daft Image DataType
        ...     .with_column("image_type", decode_image(daft.col("image_bytes")))
        ...     # Convert Image to RGB and resize the image to 288x288
        ...     .with_column("image_resized", daft.col("image_type").convert_image("RGB").resize(288, 288))
        ...     # Classify the image
        ...     .with_column(
        ...         "image_label",
        ...         classify_image(
        ...             daft.col("image_resized"),
        ...             labels=["bulbasaur", "catapie", "voltorb", "electrode"],
        ...             provider="transformers",
        ...             model="openai/clip-vit-base-patch32",
        ...         ),
        ...     )
        ... )
        >>> df.show()
        ╭────────────────────────────────┬─────────┬────────────────┬──────────────┬───────────────────────┬───────────────╮
        │ path                           ┆ size    ┆ image_bytes    ┆ image_type   ┆ image_resized         ┆ image_label   │
        │ ---                            ┆ ---     ┆ ---            ┆ ---          ┆ ---                   ┆ ---           │
        │ String                         ┆ Int64   ┆ Binary         ┆ Image[MIXED] ┆ Image[RGB; 288 x 288] ┆ String        │
        ╞════════════════════════════════╪═════════╪════════════════╪══════════════╪═══════════════════════╪═══════════════╡
        │ hf://datasets/datasets-exampl… ┆ 113469  ┆ ...            ┆ <Image>      ┆ <FixedShapeImage>     ┆ bulbasaur     │
        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ hf://datasets/datasets-exampl… ┆ 206898  ┆ ...            ┆ <Image>      ┆ <FixedShapeImage>     ┆ catapie       │
        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ hf://datasets/datasets-exampl… ┆ 1871034 ┆ ...            ┆ <Image>      ┆ <FixedShapeImage>     ┆ voltorb       │
        ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
        │ hf://datasets/datasets-exampl… ┆ 22022   ┆ ...            ┆ <Image>      ┆ <FixedShapeImage>     ┆ electrode     │
        ╰────────────────────────────────┴─────────┴────────────────┴──────────────┴───────────────────────┴───────────────╯
        <BLANKLINE>
        (Showing first 4 of 4 rows)
    """
    from daft.ai._expressions import _ImageClassificationExpression

    # Resolve the provider ("transformers" is passed as the fallback) and ask it for a classifier descriptor.
    image_classifier = _resolve_provider(provider, "transformers").get_image_classifier(model, **options)

    # TODO: classification with structured outputs will be more interesting
    # Normalize a single string label into a list so the expression always receives a list.
    label_list = [labels] if isinstance(labels, str) else labels
    # Decorate the __call__ method with @daft.method to specify return_dtype
    # NOTE(review): this rebinds __call__ on the shared class from its current value, so a
    # second call to classify_image appears to wrap the already-wrapped method — confirm that
    # method.batch tolerates being applied repeatedly.
    _ImageClassificationExpression.__call__ = method.batch(  # type: ignore[method-assign]
        method=_ImageClassificationExpression.__call__,
        return_dtype=DataType.string(),
    )
    # implemented as a class-based udf for now
    udf_options = image_classifier.get_udf_options()
    wrapped_cls = daft_cls(
        _ImageClassificationExpression,
        max_concurrency=udf_options.concurrency,
        gpus=udf_options.num_gpus or 0,
        max_retries=udf_options.max_retries,
        on_error=udf_options.on_error,
        name_override="classify_image",
    )
    # Instantiate with the descriptor and labels, then apply the instance to the image column.
    instance = wrapped_cls(image_classifier, label_list)
    return instance(image)

Model Protocols#

Prompter #

Protocol for prompt/chat completion implementations.

Methods:

Name Description
prompt

Generates responses for a batch of message strings.

prompt #

prompt(messages: tuple[Any, ...]) -> Any

Generates responses for a batch of message strings.

Source code in daft/ai/protocols.py
85
86
87
async def prompt(self, messages: tuple[Any, ...]) -> Any:
    """Generates responses for a batch of message strings.

    This is a protocol stub with no implementation (the body is ``...``).
    It is declared ``async``, so calling it yields a coroutine to be awaited.

    Args:
        messages: The messages to prompt with, typed as a tuple of arbitrary values.

    Returns:
        The generated response; this protocol leaves the type open (``Any``).
    """
    ...

TextEmbedder #

Protocol for text embedding implementations.

Methods:

Name Description
embed_text

Embeds a batch of text strings into an embedding vector.

embed_text #

embed_text(text: list[str]) -> list[Embedding] | Awaitable[list[Embedding]]

Embeds a batch of text strings into an embedding vector.

Source code in daft/ai/protocols.py
18
19
def embed_text(self, text: list[str]) -> list[Embedding] | Awaitable[list[Embedding]]:
    """Embeds a batch of text strings into an embedding vector.

    Args:
        text: The batch of strings to embed.

    Returns:
        Either the list of embeddings directly, or an awaitable that resolves
        to that list. Presumably one embedding per input string, in input
        order — confirm against concrete implementations.
    """

TextEmbedderDescriptor #

Descriptor for a TextEmbedder implementation.

Methods:

Name Description
get_dimensions

Returns the dimensions of the embeddings produced by the described TextEmbedder.

is_async

Whether the described TextEmbedder produces awaitable results.

get_dimensions #

get_dimensions() -> EmbeddingDimensions

Returns the dimensions of the embeddings produced by the described TextEmbedder.

Source code in daft/ai/protocols.py
25
26
27
@abstractmethod
def get_dimensions(self) -> EmbeddingDimensions:
    """Returns the dimensions of the embeddings produced by the described TextEmbedder.

    Abstract: this declaration has no body, so concrete descriptors must implement it.

    Returns:
        An ``EmbeddingDimensions`` value describing the produced embeddings.
    """

is_async #

is_async() -> bool

Whether the described TextEmbedder produces awaitable results.

Source code in daft/ai/protocols.py
29
30
31
def is_async(self) -> bool:
    """Whether the described TextEmbedder produces awaitable results.

    Returns:
        ``False`` in this default implementation. Descriptors whose embedder
        returns awaitables are presumably expected to override this — confirm.
    """
    return False

ImageEmbedder #

Protocol for image embedding implementations.

Methods:

Name Description
embed_image

Embeds a batch of images into an embedding vector.

embed_image #

embed_image(images: list[Image]) -> list[Embedding] | Awaitable[list[Embedding]]

Embeds a batch of images into an embedding vector.

Source code in daft/ai/protocols.py
38
39
40
def embed_image(self, images: list[Image]) -> list[Embedding] | Awaitable[list[Embedding]]:
    """Embeds a batch of images into an embedding vector.

    This is a protocol stub with no implementation (the body is ``...``).

    Args:
        images: The batch of images to embed.

    Returns:
        Either the list of embeddings directly, or an awaitable that resolves
        to that list. Presumably one embedding per input image, in input
        order — confirm against concrete implementations.
    """
    ...

ImageEmbedderDescriptor #

Descriptor for an ImageEmbedder implementation.

Methods:

Name Description
get_dimensions

Returns the dimensions of the embeddings produced by the described ImageEmbedder.

is_async

Whether the described ImageEmbedder produces awaitable results.

get_dimensions #

get_dimensions() -> EmbeddingDimensions

Returns the dimensions of the embeddings produced by the described ImageEmbedder.

Source code in daft/ai/protocols.py
46
47
48
@abstractmethod
def get_dimensions(self) -> EmbeddingDimensions:
    """Returns the dimensions of the embeddings produced by the described ImageEmbedder.

    Abstract: this declaration has no body, so concrete descriptors must implement it.

    Returns:
        An ``EmbeddingDimensions`` value describing the produced embeddings.
    """

is_async #

is_async() -> bool

Whether the described ImageEmbedder produces awaitable results.

Source code in daft/ai/protocols.py
50
51
52
def is_async(self) -> bool:
    """Whether the described ImageEmbedder produces awaitable results.

    Returns:
        ``False`` in this default implementation. Descriptors whose embedder
        returns awaitables are presumably expected to override this — confirm.
    """
    return False

TextClassifier #

Protocol for text classification implementations.

Methods:

Name Description
classify_text

Classifies a batch of text strings using the given label(s).

classify_text #

classify_text(text: list[str], labels: Label | list[Label]) -> list[Label]

Classifies a batch of text strings using the given label(s).

Source code in daft/ai/protocols.py
59
60
61
def classify_text(self, text: list[str], labels: Label | list[Label]) -> list[Label]:
    """Classifies a batch of text strings using the given label(s).

    This is a protocol stub with no implementation (the body is ``...``).

    Args:
        text: The batch of strings to classify.
        labels: A single candidate label or a list of candidate labels.

    Returns:
        A list of labels. Presumably one chosen label per input string, in
        input order — confirm against concrete implementations.
    """
    ...

TextClassifierDescriptor #

Descriptor for a TextClassifier implementation.

Providers#

Provider #

Provider is the base class for resolving implementations for the various AI/ML protocols.

Handles integration with model providers such as OpenAI, LM Studio, Hugging Face Transformers, etc. Provides a unified interface for model access and execution regardless of the underlying implementation.

Note

We will need to move instantiation from the TextEmbedderDescriptor to the Provider or elsewhere. This is not settled at the moment, and instantiating directly from the descriptor is the easiest approach. We could opt to include a factory method location (the descriptor's init) in the serialization.

Methods:

Name Description
get_image_classifier

Returns an ImageClassifierDescriptor for this provider.

get_image_embedder

Returns an ImageEmbedderDescriptor for this provider.

get_prompter

Returns a PrompterDescriptor for this provider.

get_text_classifier

Returns a TextClassifierDescriptor for this provider.

get_text_embedder

Returns a TextEmbedderDescriptor for this provider.

Attributes:

Name Type Description
name str

Returns the provider's name.

name #

name: str

Returns the provider's name.

get_image_classifier #

get_image_classifier(model: str | None = None, **options: Unpack[ClassifyImageOptions]) -> ImageClassifierDescriptor

Returns an ImageClassifierDescriptor for this provider.

Source code in daft/ai/provider.py
135
136
137
138
139
def get_image_classifier(
    self, model: str | None = None, **options: Unpack[ClassifyImageOptions]
) -> ImageClassifierDescriptor:
    """Returns an ImageClassifierDescriptor for this provider.

    This base implementation never returns: it unconditionally raises.

    Args:
        model: Optional model name; unused by this base implementation.
        **options: Image classification options; unused by this base implementation.

    Raises:
        The error built by ``not_implemented_err(self, method="classify_image")``.
    """
    raise not_implemented_err(self, method="classify_image")

get_image_embedder #

get_image_embedder(model: str | None = None, **options: Unpack[EmbedImageOptions]) -> ImageEmbedderDescriptor

Returns an ImageEmbedderDescriptor for this provider.

Source code in daft/ai/provider.py
129
130
131
132
133
def get_image_embedder(
    self, model: str | None = None, **options: Unpack[EmbedImageOptions]
) -> ImageEmbedderDescriptor:
    """Returns an ImageEmbedderDescriptor for this provider.

    This base implementation never returns: it unconditionally raises.

    Args:
        model: Optional model name; unused by this base implementation.
        **options: Image embedding options; unused by this base implementation.

    Raises:
        The error built by ``not_implemented_err(self, method="embed_image")``.
    """
    raise not_implemented_err(self, method="embed_image")

get_prompter #

get_prompter(model: str | None = None, return_format: Any | None = None, system_message: str | None = None, **options: Unpack[PromptOptions]) -> PrompterDescriptor

Returns a PrompterDescriptor for this provider.

Source code in daft/ai/provider.py
147
148
149
150
151
152
153
154
155
def get_prompter(
    self,
    model: str | None = None,
    return_format: Any | None = None,
    system_message: str | None = None,
    **options: Unpack[PromptOptions],
) -> PrompterDescriptor:
    """Returns a PrompterDescriptor for this provider.

    This base implementation never returns: it unconditionally raises.

    Args:
        model: Optional model name; unused by this base implementation.
        return_format: Optional return format for the prompt; unused here.
        system_message: Optional system message for the prompt; unused here.
        **options: Prompt options; unused by this base implementation.

    Raises:
        The error built by ``not_implemented_err(self, method="prompt")``.
    """
    raise not_implemented_err(self, method="prompt")

get_text_classifier #

get_text_classifier(model: str | None = None, **options: Unpack[ClassifyTextOptions]) -> TextClassifierDescriptor

Returns a TextClassifierDescriptor for this provider.

Source code in daft/ai/provider.py
141
142
143
144
145
def get_text_classifier(
    self, model: str | None = None, **options: Unpack[ClassifyTextOptions]
) -> TextClassifierDescriptor:
    """Returns a TextClassifierDescriptor for this provider.

    This base implementation never returns: it unconditionally raises.

    Args:
        model: Optional model name; unused by this base implementation.
        **options: Text classification options; unused by this base implementation.

    Raises:
        The error built by ``not_implemented_err(self, method="classify_text")``.
    """
    raise not_implemented_err(self, method="classify_text")

get_text_embedder #

get_text_embedder(model: str | None = None, dimensions: int | None = None, **options: Unpack[EmbedTextOptions]) -> TextEmbedderDescriptor

Returns a TextEmbedderDescriptor for this provider.

Source code in daft/ai/provider.py
123
124
125
126
127
def get_text_embedder(
    self, model: str | None = None, dimensions: int | None = None, **options: Unpack[EmbedTextOptions]
) -> TextEmbedderDescriptor:
    """Returns a TextEmbedderDescriptor for this provider.

    This base implementation never returns: it unconditionally raises.

    Args:
        model: Optional model name; unused by this base implementation.
        dimensions: Optional embedding dimension count; unused by this base implementation.
        **options: Text embedding options; unused by this base implementation.

    Raises:
        The error built by ``not_implemented_err(self, method="embed_text")``.
    """
    raise not_implemented_err(self, method="embed_text")

load_provider #

load_provider(provider: str, name: str | None = None, **options: Any) -> Provider
Source code in daft/ai/provider.py
94
95
96
97
def load_provider(provider: str, name: str | None = None, **options: Any) -> Provider:
    """Instantiates the provider registered under ``provider`` in ``PROVIDERS``.

    Args:
        provider: Key identifying the provider factory in ``PROVIDERS``.
        name: Optional name forwarded as the first positional argument to the factory.
        **options: Extra keyword options forwarded to the factory unchanged.

    Returns:
        Whatever the registered factory returns for ``(name, **options)``.

    Raises:
        ValueError: If ``provider`` is not a key of ``PROVIDERS``.
    """
    if provider in PROVIDERS:
        factory = PROVIDERS[provider]
        return factory(name, **options)  # type: ignore
    raise ValueError(f"Provider '{provider}' is not yet supported.")

load_google #

load_google(name: str | None = None, **options: Unpack[GoogleProviderOptions]) -> Provider
Source code in daft/ai/provider.py
39
40
41
42
43
44
45
def load_google(name: str | None = None, **options: Unpack[GoogleProviderOptions]) -> Provider:
    """Creates a ``GoogleProvider``, importing its module only when called.

    Args:
        name: Optional provider name passed to the ``GoogleProvider`` constructor.
        **options: Provider options forwarded to the constructor.

    Returns:
        The constructed ``GoogleProvider``.

    Raises:
        ProviderImportError: Raised as ``ProviderImportError("google")``, chained to the
            original error, when an ``ImportError`` occurs during import or construction.
    """
    try:
        from daft.ai.google.provider import GoogleProvider

        # Construction stays inside the try so an ImportError raised there is translated too.
        google_provider = GoogleProvider(name, **options)
    except ImportError as import_err:
        raise ProviderImportError("google") from import_err
    return google_provider

load_lm_studio #

load_lm_studio(name: str | None = None, **options: Unpack[OpenAIProviderOptions]) -> Provider
Source code in daft/ai/provider.py
48
49
50
51
52
53
54
def load_lm_studio(name: str | None = None, **options: Unpack[OpenAIProviderOptions]) -> Provider:
    """Creates an ``LMStudioProvider``, importing its module only when called.

    Args:
        name: Optional provider name passed to the ``LMStudioProvider`` constructor.
        **options: Provider options forwarded to the constructor.

    Returns:
        The constructed ``LMStudioProvider``.

    Raises:
        ProviderImportError: Raised as ``ProviderImportError("openai")``, chained to the
            original error, when an ``ImportError`` occurs during import or construction.
    """
    try:
        from daft.ai.lm_studio.provider import LMStudioProvider

        # Construction stays inside the try so an ImportError raised there is translated too.
        lm_studio_provider = LMStudioProvider(name, **options)
    except ImportError as import_err:
        # NOTE(review): reported under "openai" rather than "lm_studio" — presumably because
        # this provider depends on the openai package; confirm this is intentional.
        raise ProviderImportError("openai") from import_err
    return lm_studio_provider

load_openai #

load_openai(name: str | None = None, **options: Unpack[OpenAIProviderOptions]) -> Provider
Source code in daft/ai/provider.py
57
58
59
60
61
62
63
def load_openai(name: str | None = None, **options: Unpack[OpenAIProviderOptions]) -> Provider:
    """Creates an ``OpenAIProvider``, importing its module only when called.

    Args:
        name: Optional provider name passed to the ``OpenAIProvider`` constructor.
        **options: Provider options forwarded to the constructor.

    Returns:
        The constructed ``OpenAIProvider``.

    Raises:
        ProviderImportError: Raised as ``ProviderImportError("openai")``, chained to the
            original error, when an ``ImportError`` occurs during import or construction.
    """
    try:
        from daft.ai.openai.provider import OpenAIProvider

        # Construction stays inside the try so an ImportError raised there is translated too.
        openai_provider = OpenAIProvider(name, **options)
    except ImportError as import_err:
        raise ProviderImportError("openai") from import_err
    return openai_provider

load_transformers #

load_transformers(name: str | None = None, **options: Any) -> Provider
Source code in daft/ai/provider.py
66
67
68
69
70
71
72
def load_transformers(name: str | None = None, **options: Any) -> Provider:
    """Creates a ``TransformersProvider``, importing its module only when called.

    Args:
        name: Optional provider name passed to the ``TransformersProvider`` constructor.
        **options: Arbitrary keyword options forwarded to the constructor.

    Returns:
        The constructed ``TransformersProvider``.

    Raises:
        ProviderImportError: Raised as ``ProviderImportError("transformers")``, chained to the
            original error, when an ``ImportError`` occurs during import or construction.
    """
    try:
        from daft.ai.transformers.provider import TransformersProvider

        # Construction stays inside the try so an ImportError raised there is translated too.
        transformers_provider = TransformersProvider(name, **options)
    except ImportError as import_err:
        raise ProviderImportError("transformers") from import_err
    return transformers_provider

load_vllm_prefix_caching #

load_vllm_prefix_caching(name: str | None = None, **options: Any) -> Provider
Source code in daft/ai/provider.py
75
76
77
78
79
80
81
def load_vllm_prefix_caching(name: str | None = None, **options: Any) -> Provider:
    """Creates a ``VLLMPrefixCachingProvider``, importing its module only when called.

    Args:
        name: Optional provider name passed to the ``VLLMPrefixCachingProvider`` constructor.
        **options: Arbitrary keyword options forwarded to the constructor.

    Returns:
        The constructed ``VLLMPrefixCachingProvider``.

    Raises:
        ProviderImportError: Raised as ``ProviderImportError("vllm")``, chained to the
            original error, when an ``ImportError`` occurs during import or construction.
    """
    try:
        from daft.ai.vllm.provider import VLLMPrefixCachingProvider

        # Construction stays inside the try so an ImportError raised there is translated too.
        vllm_provider = VLLMPrefixCachingProvider(name, **options)
    except ImportError as import_err:
        raise ProviderImportError("vllm") from import_err
    return vllm_provider