stanfordnlp · TomeHirata · Aug 27, 2025 · Aug 27, 2025 · Aug 27, 2025 · Aug 27, 2025
diff --git a/dspy/__init__.py b/dspy/__init__.py
@@ -6,7 +6,7 @@
 
 from dspy.evaluate import Evaluate  # isort: skip
 from dspy.clients import *  # isort: skip
-from dspy.adapters import Adapter, ChatAdapter, JSONAdapter, XMLAdapter, TwoStepAdapter, Image, Audio, History, Type, Tool, ToolCalls, Code  # isort: skip
+from dspy.adapters import Adapter, ChatAdapter, JSONAdapter, XMLAdapter, TwoStepAdapter, Image, Audio, History, Type, Tool, ToolCalls, Code, Citations, Document  # isort: skip
 from dspy.utils.logging_utils import configure_dspy_loggers, disable_logging, enable_logging
 from dspy.utils.asyncify import asyncify
 from dspy.utils.syncify import syncify

diff --git a/dspy/adapters/__init__.py b/dspy/adapters/__init__.py
@@ -2,7 +2,7 @@
 from dspy.adapters.chat_adapter import ChatAdapter
 from dspy.adapters.json_adapter import JSONAdapter
 from dspy.adapters.two_step_adapter import TwoStepAdapter
-from dspy.adapters.types import Audio, Code, History, Image, Tool, ToolCalls, Type
+from dspy.adapters.types import Audio, Citations, Code, Document, History, Image, Tool, ToolCalls, Type
 from dspy.adapters.xml_adapter import XMLAdapter
 
 __all__ = [
@@ -13,6 +13,8 @@
     "Image",
     "Audio",
     "Code",
+    "Citations",
+    "Document",
     "JSONAdapter",
     "XMLAdapter",
     "TwoStepAdapter",

diff --git a/dspy/adapters/base.py b/dspy/adapters/base.py
@@ -4,7 +4,7 @@
 import json_repair
 import litellm
 
-from dspy.adapters.types import History
+from dspy.adapters.types import Citations, History
 from dspy.adapters.types.base_type import split_message_content_for_custom_types
 from dspy.adapters.types.tool import Tool, ToolCalls
 from dspy.signatures.signature import Signature
@@ -63,6 +63,10 @@ def _call_preprocess(
 
                 return signature_for_native_function_calling
 
+        citation_output_field_name = self._get_citation_output_field_name(signature)
+        if citation_output_field_name:
+            signature = signature.delete(citation_output_field_name)
+
         return signature
 
     def _call_postprocess(
@@ -74,16 +78,19 @@ def _call_postprocess(
         values = []
 
         tool_call_output_field_name = self._get_tool_call_output_field_name(original_signature)
+        citation_output_field_name = self._get_citation_output_field_name(original_signature)
 
         for output in outputs:
             output_logprobs = None
             tool_calls = None
+            citations = None
             text = output
 
             if isinstance(output, dict):
                 text = output["text"]
                 output_logprobs = output.get("logprobs")
                 tool_calls = output.get("tool_calls")
+                citations = output.get("citations")
 
             if text:
                 value = self.parse(processed_signature, text)
@@ -106,6 +113,10 @@ def _call_postprocess(
                 ]
                 value[tool_call_output_field_name] = ToolCalls.from_dict_list(tool_calls)
 
+            if citations and citation_output_field_name:
+                citations_obj = Citations.from_dict_list(citations)
+                value[citation_output_field_name] = citations_obj
+
             if output_logprobs:
                 value["logprobs"] = output_logprobs
 
@@ -390,6 +401,13 @@ def _get_tool_call_output_field_name(self, signature: type[Signature]) -> bool:
                 return name
         return None
 
+    def _get_citation_output_field_name(self, signature: type[Signature]) -> str | None:
+        """Return the name of the first output field annotated as `Citations`, or None."""
+        matching_names = (
+            name for name, field in signature.output_fields.items() if field.annotation == Citations
+        )
+        return next(matching_names, None)
+
     def format_conversation_history(
         self,
         signature: type[Signature],

diff --git a/dspy/adapters/types/__init__.py b/dspy/adapters/types/__init__.py
@@ -1,8 +1,10 @@
 from dspy.adapters.types.audio import Audio
 from dspy.adapters.types.base_type import Type
+from dspy.adapters.types.citation import Citations
 from dspy.adapters.types.code import Code
+from dspy.adapters.types.document import Document
 from dspy.adapters.types.history import History
 from dspy.adapters.types.image import Image
 from dspy.adapters.types.tool import Tool, ToolCalls
 
-__all__ = ["History", "Image", "Audio", "Type", "Tool", "ToolCalls", "Code"]
+__all__ = ["History", "Image", "Audio", "Type", "Tool", "ToolCalls", "Code", "Citations", "Document"]
diff --git a/dspy/adapters/types/citation.py b/dspy/adapters/types/citation.py
@@ -0,0 +1,165 @@
+from typing import Any
+
+import pydantic
+
+from dspy.adapters.types.base_type import Type
+
+
+class Citations(Type):
+    """Citations extracted from an LM response with source references.
+
+    This type represents citations returned by language models that support
+    citation extraction, particularly Anthropic's Citations API through LiteLLM.
+    Citations include the quoted text and source information.
+
+    Example:
+        ```python
+        import dspy
+        from dspy.signatures import Signature
+
+        class AnswerWithSources(Signature):
+            '''Answer questions using provided documents with citations.'''
+            documents: list[dspy.Document] = dspy.InputField()
+            question: str = dspy.InputField()
+            answer: str = dspy.OutputField()
+            citations: dspy.Citations = dspy.OutputField()
+
+        # Create documents to provide as sources
+        docs = [
+            dspy.Document(
+                data="The Earth orbits the Sun in an elliptical path.",
+                title="Basic Astronomy Facts"
+            ),
+            dspy.Document(
+                data="Water boils at 100°C at standard atmospheric pressure.",
+                title="Physics Fundamentals",
+                context="Excerpt from an introductory physics textbook."
+            )
+        ]
+
+        # Use with a model that supports citations like Claude
+        lm = dspy.LM("anthropic/claude-opus-4-1-20250805")
+        predictor = dspy.Predict(AnswerWithSources)
+        result = predictor(documents=docs, question="What temperature does water boil?", lm=lm)
+
+        for citation in result.citations.citations:
+            print(citation.format())
+        ```
+    """
+
+    class Citation(Type):
+        """One citation: the quoted text plus the character-location fields of its source."""
+        type: str = "char_location"
+        cited_text: str
+        document_index: int
+        document_title: str | None = None
+        start_char_index: int
+        end_char_index: int
+        supported_text: str | None = None
+
+        def format(self) -> dict[str, Any]:
+            """Serialize this citation into a plain dictionary.
+
+            `document_title` and `supported_text` are emitted only when they hold a
+            truthy value; the remaining five fields are always present.
+
+            Returns:
+                A dictionary in the format expected by citation APIs.
+            """
+            payload: dict[str, Any] = {
+                "type": self.type,
+                "cited_text": self.cited_text,
+                "document_index": self.document_index,
+                "start_char_index": self.start_char_index,
+                "end_char_index": self.end_char_index,
+            }
+            for optional_key in ("document_title", "supported_text"):
+                optional_value = getattr(self, optional_key)
+                if optional_value:
+                    payload[optional_key] = optional_value
+            return payload
+
+    citations: list[Citation]
+
+    @classmethod
+    def from_dict_list(cls, citations_dicts: list[dict[str, Any]]) -> "Citations":
+        """Build a `Citations` instance from raw citation dictionaries.
+
+        Args:
+            citations_dicts: One dictionary per citation. Each is unpacked as keyword
+                arguments into `Citations.Citation`, so it must supply that model's
+                required fields (`cited_text`, `document_index`, `start_char_index`, `end_char_index`).
+
+        Returns:
+            A `Citations` instance wrapping the parsed citations, in input order.
+
+        Example:
+            ```python
+            raw_citations = [
+                {
+                    "cited_text": "The sky is blue",
+                    "document_index": 0,
+                    "document_title": "Weather Guide",
+                    "start_char_index": 0,
+                    "end_char_index": 15,
+                    "supported_text": "The sky was blue yesterday."
+                }
+            ]
+            citations = Citations.from_dict_list(raw_citations)
+            ```
+        """
+        return cls(citations=[cls.Citation(**entry) for entry in citations_dicts])
+
+    @classmethod
+    def description(cls) -> str:
+        """Return the human-readable summary of this type, for use in prompts."""
+        # The trailing space on the first sentence keeps the two sentences separated.
+        summary = "Citations with quoted text and source references. "
+        guidance = "Include the exact text being cited and information about its source."
+        return summary + guidance
+
+    def format(self) -> list[dict[str, Any]]:
+        """Serialize every contained citation, in order, via `Citation.format`."""
+        return [entry.format() for entry in self.citations]
+
+    @pydantic.model_validator(mode="before")
+    @classmethod
+    def validate_input(cls, data: Any):
+        """Normalize the accepted raw input shapes before field validation.
+
+        Accepted inputs: an existing instance (returned as is), a list of citation
+        dicts that each contain "cited_text", a dict with a list under "citations",
+        or a single citation dict containing "cited_text".
+
+        Raises:
+            ValueError: If `data` matches none of the accepted shapes.
+        """
+        if isinstance(data, cls):
+            return data
+
+        if isinstance(data, list):
+            # A bare list qualifies only when every element looks like a citation dict.
+            if all(isinstance(entry, dict) and "cited_text" in entry for entry in data):
+                return {"citations": [cls.Citation(**entry) for entry in data]}
+        elif isinstance(data, dict):
+            if "citations" in data:
+                wrapped = data["citations"]
+                if isinstance(wrapped, list):
+                    # Dict entries are parsed here; anything else is passed through unchanged.
+                    return {"citations": [cls.Citation(**e) if isinstance(e, dict) else e for e in wrapped]}
+            elif "cited_text" in data:
+                return {"citations": [cls.Citation(**data)]}
+
+        raise ValueError(f"Received invalid value for `dspy.Citations`: {data}")
+
+    def __iter__(self):
+        """Iterate over citations. NOTE(review): overrides pydantic's field-pair `__iter__` — confirm intended."""
+        return iter(self.citations)
+
+    def __len__(self) -> int:
+        """Return how many `Citation` items are held in `self.citations`."""
+        return len(self.citations)
+
+    def __getitem__(self, index):
+        """Return `self.citations[index]`; `index` is forwarded unchanged to the underlying list."""
+        return self.citations[index]
diff --git a/dspy/adapters/types/document.py b/dspy/adapters/types/document.py
@@ -0,0 +1,111 @@
+from typing import Any, Literal
+
+import pydantic
+
+from dspy.adapters.types.base_type import Type
+
+
+class Document(Type):
+    """A document type for providing content that can be cited by language models.
+
+    This type represents documents that can be passed to language models for citation-enabled
+    responses, particularly useful with Anthropic's Citations API. Documents include the content
+    and metadata that helps the LM understand and reference the source material.
+
+    Attributes:
+        data: The text content of the document
+        title: Optional title for the document (used in citations)
+        media_type: MIME type of the document content (defaults to "text/plain")
+        context: Optional context information about the document
+
+    Example:
+        ```python
+        import dspy
+        from dspy.signatures import Signature
+
+        class AnswerWithSources(Signature):
+            '''Answer questions using provided documents with citations.'''
+            documents: list[dspy.Document] = dspy.InputField()
+            question: str = dspy.InputField()
+            answer: str = dspy.OutputField()
+            citations: dspy.Citations = dspy.OutputField()
+
+        # Create documents
+        docs = [
+            dspy.Document(
+                data="The Earth orbits the Sun in an elliptical path.",
+                title="Basic Astronomy Facts"
+            ),
+            dspy.Document(
+                data="Water boils at 100°C at standard atmospheric pressure.",
+                title="Physics Fundamentals",
+            )
+        ]
+
+        # Use with a citation-supporting model
+        lm = dspy.LM("anthropic/claude-opus-4-1-20250805")
+        predictor = dspy.Predict(AnswerWithSources)
+        result = predictor(documents=docs, question="What temperature does water boil?", lm=lm)
+        print(result.citations)
+        ```
+    """
+
+    data: str
+    title: str | None = None
+    media_type: Literal["text/plain", "application/pdf"] = "text/plain"
+    context: str | None = None
+
+    def format(self) -> list[dict[str, Any]]:
+        """Format document for LM consumption.
+
+        Returns:
+            A single-element list holding the document content block, with citations enabled.
+            `title` and `context` are included only when they are non-empty.
+        """
+        # Plain text is sent with a "text" source; any other media type (i.e. PDF) is
+        # assumed to carry base64-encoded bytes in `data` and needs a "base64" source.
+        source_type = "text" if self.media_type == "text/plain" else "base64"
+        document_block: dict[str, Any] = {
+            "type": "document",
+            "source": {"type": source_type, "media_type": self.media_type, "data": self.data},
+            "citations": {"enabled": True},
+        }
+
+        if self.title:
+            document_block["title"] = self.title
+
+        if self.context:
+            document_block["context"] = self.context
+
+        return [document_block]
+
+
+
+    @classmethod
+    def description(cls) -> str:
+        """Return the human-readable summary of this type, for use in prompts."""
+        # The trailing space on the first sentence keeps the two sentences separated.
+        summary = "A document containing text content that can be referenced and cited. "
+        guidance = "Include the full text content and optionally a title for proper referencing."
+        return summary + guidance
+
+    @pydantic.model_validator(mode="before")
+    @classmethod
+    def validate_input(cls, data: Any):
+        """Normalize raw input before field validation.
+
+        Instances and dicts pass through unchanged; a bare string becomes the `data` field.
+
+        Raises:
+            ValueError: If `data` is not a `Document`, a string, or a dict.
+        """
+        if isinstance(data, (cls, dict)):
+            return data
+        if isinstance(data, str):
+            return {"data": data}
+        raise ValueError(f"Received invalid value for `dspy.Document`: {data}")
+
+    def __str__(self) -> str:
+        """Summarize the document as its quoted title (when set) and the length of `data`."""
+        title_part = "" if not self.title else f"'{self.title}': "
+        return f"Document({title_part}{len(self.data)} chars)"