Prompt Caching #234

Open

tpaulshippy wants to merge 22 commits into main from prompt-caching

Commits (22)
2e84006
13: Failing specs
tpaulshippy Jun 9, 2025
be61e48
13: Get caching specs passing for Bedrock
tpaulshippy Jun 9, 2025
edec138
13: Remove comments in specs
tpaulshippy Jun 9, 2025
971f176
13: Add unused param on other providers
tpaulshippy Jun 9, 2025
557a5ee
13: Rubocop -A
tpaulshippy Jun 9, 2025
9673b13
13: Add cassettes for bedrock cache specs
tpaulshippy Jun 9, 2025
c47d270
13: Resolve Rubocop aside from Metrics/ParameterLists
tpaulshippy Jun 9, 2025
eaf0876
13: Use large enough prompt to hit cache meaningfully
tpaulshippy Jun 9, 2025
160d9ab
13: Ensure cache tokens are being used
tpaulshippy Jun 9, 2025
d1698bf
13: Refactor completion parameters
tpaulshippy Jun 9, 2025
344729f
16: Add guide for prompt caching
tpaulshippy Jun 9, 2025
7b98277
Add real anthropic cassettes ($0.03)
tpaulshippy Jun 12, 2025
fd30f14
Merge branch 'main' into prompt-caching
tpaulshippy Jun 12, 2025
a91d07e
Switch from large_prompt.txt to 10,000 of the letter a
tpaulshippy Jul 19, 2025
f40f37d
Make that 2048 * 4 (2048 tokens for Haiku)
tpaulshippy Jul 19, 2025
109bb51
Rename properties on message class
tpaulshippy Jul 19, 2025
1c6cbf7
Revert "13: Refactor completion parameters"
tpaulshippy Jul 19, 2025
4d78a09
Address rubocop
tpaulshippy Jul 19, 2025
25b3660
Merge remote-tracking branch 'origin/main' into prompt-caching
tpaulshippy Jul 19, 2025
8e80f08
Update docs
tpaulshippy Jul 19, 2025
d42d074
Actually return the payload
tpaulshippy Jul 19, 2025
97b1ace
Add support for cache token counts in gemini and openai
tpaulshippy Jul 19, 2025
24 changes: 24 additions & 0 deletions docs/guides/chat.md
@@ -278,6 +278,30 @@ puts "Total Conversation Tokens: #{total_conversation_tokens}"

Refer to the [Working with Models Guide]({% link guides/models.md %}) for details on accessing model-specific pricing.

## Prompt Caching

### Enabling
For Anthropic models, you can opt in to prompt caching, which is documented more fully in the [Anthropic API docs](https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching).

Enable prompt caching using the `cache_prompts` method on your chat instance:

```ruby
chat = RubyLLM.chat(model: 'claude-3-5-haiku-20241022')

# Enable caching for different types of content
chat.cache_prompts(
system: true, # Cache system instructions
user: true, # Cache user messages
tools: true # Cache tool definitions
)
```

### Checking cached token counts
For Anthropic, OpenAI, and Gemini, you can see the number of tokens read from the cache by checking the `cached_tokens` property on output messages.

For Anthropic, the `cache_creation_tokens` property additionally reports the number of tokens written to the cache.
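
For example, a minimal sketch (it assumes the prompt is large enough to qualify for caching; Anthropic requires roughly 2048 tokens for Haiku):

```ruby
chat = RubyLLM.chat(model: 'claude-3-5-haiku-20241022')
chat.cache_prompts(system: true, user: true)

# Prompts below the model's minimum cacheable size are not written to the
# cache, so pad the prompt well past that threshold.
response = chat.ask("#{'a' * (2048 * 4)} Please summarize the text above.")

puts "Tokens written to cache: #{response.cache_creation_tokens}"
puts "Tokens read from cache:  #{response.cached_tokens}"
```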


## Chat Event Handlers

You can register blocks to be called when certain events occur during the chat lifecycle, useful for UI updates or logging.
7 changes: 7 additions & 0 deletions lib/ruby_llm/chat.rb
@@ -25,6 +25,7 @@ def initialize(model: nil, provider: nil, assume_model_exists: false, context: n
@temperature = 0.7
@messages = []
@tools = {}
@cache_prompts = { system: false, user: false, tools: false }
@on = {
new_message: nil,
end_message: nil
@@ -92,12 +93,18 @@ def each(&)
messages.each(&)
end

def cache_prompts(system: false, user: false, tools: false)
@cache_prompts = { system: system, user: user, tools: tools }
self
end

def complete(&)
response = @provider.complete(
messages,
tools: @tools,
temperature: @temperature,
model: @model.id,
cache_prompts: @cache_prompts.dup,
connection: @connection,
&wrap_streaming_block(&)
)
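
Note that `cache_prompts` returns `self`, so it can be chained when building a chat, and `complete` forwards a copy of the settings to the provider. A brief usage sketch (model name illustrative):

```ruby
chat = RubyLLM.chat(model: 'claude-3-5-haiku-20241022').cache_prompts(system: true, tools: true)

# Subsequent completions now pass
# cache_prompts: { system: true, user: false, tools: true }
# through to the provider's render_payload.
chat.ask('What is prompt caching?')
```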
9 changes: 7 additions & 2 deletions lib/ruby_llm/message.rb
@@ -7,7 +7,8 @@ module RubyLLM
class Message
ROLES = %i[system user assistant tool].freeze

attr_reader :role, :tool_calls, :tool_call_id, :input_tokens, :output_tokens, :model_id
attr_reader :role, :tool_calls, :tool_call_id, :input_tokens, :output_tokens, :model_id,
:cached_tokens, :cache_creation_tokens

def initialize(options = {})
@role = options.fetch(:role).to_sym
@@ -17,6 +18,8 @@ def initialize(options = {})
@output_tokens = options[:output_tokens]
@model_id = options[:model_id]
@tool_call_id = options[:tool_call_id]
@cached_tokens = options[:cached_tokens]
@cache_creation_tokens = options[:cache_creation_tokens]

ensure_valid_role
end
@@ -49,7 +52,9 @@ def to_h
tool_call_id: tool_call_id,
input_tokens: input_tokens,
output_tokens: output_tokens,
model_id: model_id
model_id: model_id,
cache_creation_tokens: cache_creation_tokens,
cached_tokens: cached_tokens
}.compact
end

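
Because `to_h` ends with `.compact`, the new keys only appear when the provider actually reported cache usage. A quick sketch (values illustrative):

```ruby
msg = RubyLLM::Message.new(role: :assistant, content: 'Hi there', cached_tokens: 128)

msg.cached_tokens                       # => 128
msg.to_h[:cached_tokens]                # => 128
msg.to_h.key?(:cache_creation_tokens)   # => false, nil values are compacted away
```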
4 changes: 3 additions & 1 deletion lib/ruby_llm/provider.rb
@@ -10,13 +10,15 @@ module Provider
module Methods
extend Streaming

def complete(messages, tools:, temperature:, model:, connection:, &)
def complete(messages, tools:, temperature:, model:, connection:, # rubocop:disable Metrics/ParameterLists
cache_prompts: { system: false, user: false, tools: false }, &)
normalized_temperature = maybe_normalize_temperature(temperature, model)

payload = render_payload(messages,
tools: tools,
temperature: normalized_temperature,
model: model,
cache_prompts: cache_prompts,
stream: block_given?)

if block_given?
56 changes: 35 additions & 21 deletions lib/ruby_llm/providers/anthropic/chat.rb
@@ -11,42 +11,50 @@ def completion_url
'/v1/messages'
end

def render_payload(messages, tools:, temperature:, model:, stream: false)
def render_payload(messages, tools:, temperature:, model:, stream: false, # rubocop:disable Metrics/ParameterLists
cache_prompts: { system: false, user: false, tools: false })
system_messages, chat_messages = separate_messages(messages)
system_content = build_system_content(system_messages)
system_content = build_system_content(system_messages, cache: cache_prompts[:system])

build_base_payload(chat_messages, temperature, model, stream).tap do |payload|
add_optional_fields(payload, system_content:, tools:)
build_base_payload(chat_messages, temperature, model, stream, cache: cache_prompts[:user]).tap do |payload|
add_optional_fields(payload, system_content: system_content, tools: tools,
cache_tools: cache_prompts[:tools])
end
end

def separate_messages(messages)
messages.partition { |msg| msg.role == :system }
end

def build_system_content(system_messages)
if system_messages.length > 1
RubyLLM.logger.warn(
"Anthropic's Claude implementation only supports a single system message. " \
'Multiple system messages will be combined into one.'
)
def build_system_content(system_messages, cache: false)
system_messages.flat_map.with_index do |msg, idx|
cache = false unless idx == system_messages.size - 1
format_system_message(msg, cache:)
end

system_messages.map { |msg| format_message(msg)[:content] }.join("\n\n")
end

def build_base_payload(chat_messages, temperature, model, stream)
def build_base_payload(chat_messages, temperature, model, stream, cache: false)
messages = chat_messages.map.with_index do |msg, idx|
cache = false unless idx == chat_messages.size - 1
format_message(msg, cache:)
end

{
model: model,
messages: chat_messages.map { |msg| format_message(msg) },
messages: messages,
temperature: temperature,
stream: stream,
max_tokens: RubyLLM.models.find(model)&.max_tokens || 4096
}
end

def add_optional_fields(payload, system_content:, tools:)
payload[:tools] = tools.values.map { |t| Tools.function_for(t) } if tools.any?
def add_optional_fields(payload, system_content:, tools:, cache_tools: false)
if tools.any?
tool_definitions = tools.values.map { |t| Tools.function_for(t) }
tool_definitions[-1][:cache_control] = { type: 'ephemeral' } if cache_tools
payload[:tools] = tool_definitions
end

payload[:system] = system_content unless system_content.empty?
end

@@ -72,24 +80,30 @@ def build_message(data, content, tool_use)
tool_calls: Tools.parse_tool_calls(tool_use),
input_tokens: data.dig('usage', 'input_tokens'),
output_tokens: data.dig('usage', 'output_tokens'),
model_id: data['model']
model_id: data['model'],
cache_creation_tokens: data.dig('usage', 'cache_creation_input_tokens'),
cached_tokens: data.dig('usage', 'cache_read_input_tokens')
)
end

def format_message(msg)
def format_message(msg, cache: false)
if msg.tool_call?
Tools.format_tool_call(msg)
elsif msg.tool_result?
Tools.format_tool_result(msg)
else
format_basic_message(msg)
format_basic_message(msg, cache:)
end
end

def format_basic_message(msg)
def format_system_message(msg, cache: false)
Media.format_content(msg.content, cache:)
end

def format_basic_message(msg, cache: false)
{
role: convert_role(msg.role),
content: Media.format_content(msg.content)
content: Media.format_content(msg.content, cache:)
}
end

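
For a request with a single system message, a single user message, and `cache_prompts(system: true, user: true)`, the rendered payload comes out shaped roughly like this (an illustrative sketch, not captured API output):

```ruby
{
  model: 'claude-3-5-haiku-20241022',
  messages: [
    { role: 'user',
      content: [{ type: 'text', text: 'A long user prompt...',
                  cache_control: { type: 'ephemeral' } }] }
  ],
  temperature: 0.7,
  stream: false,
  max_tokens: 4096, # RubyLLM.models.find(model)&.max_tokens || 4096
  system: [
    { type: 'text', text: 'You are a helpful assistant.',
      cache_control: { type: 'ephemeral' } }
  ]
}
```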
114 changes: 69 additions & 45 deletions lib/ruby_llm/providers/anthropic/media.rb
@@ -7,11 +7,11 @@ module Anthropic
module Media
module_function

def format_content(content)
return [format_text(content)] unless content.is_a?(Content)
def format_content(content, cache: false)
return [format_text(content, cache:)] unless content.is_a?(Content)

parts = []
parts << format_text(content.text) if content.text
parts << format_text(content.text, cache:) if content.text

content.attachments.each do |attachment|
case attachment.type
@@ -29,60 +29,84 @@ def format_content(content)
parts
end

def format_text(text)
{
type: 'text',
text: text
}
def format_text(text, cache: false)
with_cache_control(
{
type: 'text',
text: text
},
cache:
)
end

def format_image(image)
def format_image(image, cache: false)
if image.url?
{
type: 'image',
source: {
type: 'url',
url: image.source
}
}
with_cache_control(
{
type: 'image',
source: {
type: 'url',
url: image.source
}
},
cache:
)
else
{
type: 'image',
source: {
type: 'base64',
media_type: image.mime_type,
data: image.encoded
}
}
with_cache_control(
{
type: 'image',
source: {
type: 'base64',
media_type: image.mime_type,
data: image.encoded
}
},
cache:
)
end
end

def format_pdf(pdf)
def format_pdf(pdf, cache: false)
if pdf.url?
{
type: 'document',
source: {
type: 'url',
url: pdf.source
}
}
with_cache_control(
{
type: 'document',
source: {
type: 'url',
url: pdf.source
}
},
cache:
)
else
{
type: 'document',
source: {
type: 'base64',
media_type: pdf.mime_type,
data: pdf.encoded
}
}
with_cache_control(
{
type: 'document',
source: {
type: 'base64',
media_type: pdf.mime_type,
data: pdf.encoded
}
},
cache:
)
end
end

def format_text_file(text_file)
{
type: 'text',
text: Utils.format_text_file_for_llm(text_file)
}
def format_text_file(text_file, cache: false)
with_cache_control(
{
type: 'text',
text: Utils.format_text_file_for_llm(text_file)
},
cache:
)
end

def with_cache_control(hash, cache: false)
return hash unless cache

hash.merge(cache_control: { type: 'ephemeral' })
Review comment from the PR author:
Realizing this might cause errors on older models that do not support caching. If it does, we could raise here, or just let the API validation handle it. I'm torn on whether the capabilities check complexity is worth it as these models are probably so rarely used.

end
end
end
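
`with_cache_control` is the single place where the `cache_control` attribute gets attached, which keeps the per-type formatters above uniform. A quick sketch of its behaviour:

```ruby
media = RubyLLM::Providers::Anthropic::Media

media.with_cache_control({ type: 'text', text: 'hello' }, cache: false)
# => { type: 'text', text: 'hello' }

media.with_cache_control({ type: 'text', text: 'hello' }, cache: true)
# => { type: 'text', text: 'hello', cache_control: { type: 'ephemeral' } }
```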