Add dual context percentage fields to working memory endpoints #38

Merged · 6 commits · Jul 26, 2025

Changes from all commits

9 changes: 8 additions & 1 deletion agent-memory-client/agent_memory_client/models.py
@@ -215,7 +215,14 @@ class SessionListResponse(BaseModel):
 class WorkingMemoryResponse(WorkingMemory):
     """Response from working memory operations"""
 
-    pass
+    context_percentage_total_used: float | None = Field(
+        default=None,
+        description="Percentage of total context window currently used (0-100)",
+    )
+    context_percentage_until_summarization: float | None = Field(
+        default=None,
+        description="Percentage until auto-summarization triggers (0-100, reaches 100% at summarization threshold)",
+    )
 
 
 class MemoryRecordResult(MemoryRecord):
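
For illustration only, a minimal sketch of how calling code might consume the two new fields once a response is parsed; the helper name is hypothetical, and only `WorkingMemoryResponse` and its fields come from this diff:

```python
from agent_memory_client.models import WorkingMemoryResponse

def describe_context_usage(response: WorkingMemoryResponse) -> str:
    # Both fields default to None when the server had no model_name or
    # context_window_max to size the window, so guard before formatting.
    total = response.context_percentage_total_used
    until = response.context_percentage_until_summarization
    if total is None or until is None:
        return "context usage unknown (no model info provided)"
    return f"{total:.1f}% of window used; {until:.1f}% of the way to summarization"
```
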
127 changes: 127 additions & 0 deletions agent-memory-client/tests/test_client.py
@@ -653,3 +653,130 @@ def test_validation_with_none_values(self, enhanced_test_client):
 
         # Should not raise
         enhanced_test_client.validate_memory_record(memory)
+
+
+class TestContextUsagePercentage:
+    """Tests for context usage percentage functionality."""
+
+    @pytest.mark.asyncio
+    async def test_working_memory_response_with_context_percentages(
+        self, enhanced_test_client
+    ):
+        """Test that WorkingMemoryResponse properly handles both context percentage fields."""
+        session_id = "test-session"
+
+        # Test with both context percentages set
+        working_memory_response = WorkingMemoryResponse(
+            session_id=session_id,
+            messages=[],
+            memories=[],
+            data={},
+            context=None,
+            user_id=None,
+            context_percentage_total_used=45.5,
+            context_percentage_until_summarization=65.0,
+        )
+
+        assert working_memory_response.context_percentage_total_used == 45.5
+        assert working_memory_response.context_percentage_until_summarization == 65.0
+        assert working_memory_response.session_id == session_id
+
+        # Test with None context percentages (default)
+        working_memory_response_none = WorkingMemoryResponse(
+            session_id=session_id,
+            messages=[],
+            memories=[],
+            data={},
+            context=None,
+            user_id=None,
+        )
+
+        assert working_memory_response_none.context_percentage_total_used is None
+        assert (
+            working_memory_response_none.context_percentage_until_summarization is None
+        )
+
+    @pytest.mark.asyncio
+    async def test_context_percentages_serialization(self, enhanced_test_client):
+        """Test that both context percentage fields are properly serialized."""
+        session_id = "test-session"
+
+        # Create response with both context percentages
+        working_memory_response = WorkingMemoryResponse(
+            session_id=session_id,
+            messages=[],
+            memories=[],
+            data={},
+            context=None,
+            user_id=None,
+            context_percentage_total_used=75.0,
+            context_percentage_until_summarization=85.5,
+        )
+
+        # Test model_dump includes both fields
+        dumped = working_memory_response.model_dump()
+        assert "context_percentage_total_used" in dumped
+        assert "context_percentage_until_summarization" in dumped
+        assert dumped["context_percentage_total_used"] == 75.0
+        assert dumped["context_percentage_until_summarization"] == 85.5
+
+        # Test JSON serialization
+        json_data = working_memory_response.model_dump_json()
+        assert "context_percentage_total_used" in json_data
+        assert "context_percentage_until_summarization" in json_data
+        assert "75.0" in json_data
+        assert "85.5" in json_data
+
+    @pytest.mark.asyncio
+    async def test_context_percentages_validation(self, enhanced_test_client):
+        """Test that both context percentage fields accept valid values."""
+        session_id = "test-session"
+
+        # Test valid percentages
+        valid_percentages = [0.0, 25.5, 50.0, 99.9, 100.0, None]
+
+        for percentage in valid_percentages:
+            working_memory_response = WorkingMemoryResponse(
+                session_id=session_id,
+                messages=[],
+                memories=[],
+                data={},
+                context=None,
+                user_id=None,
+                context_percentage_total_used=percentage,
+                context_percentage_until_summarization=percentage,
+            )
+            assert working_memory_response.context_percentage_total_used == percentage
+            assert (
+                working_memory_response.context_percentage_until_summarization
+                == percentage
+            )
+
+    def test_working_memory_response_from_dict_with_context_percentages(self):
+        """Test that WorkingMemoryResponse can be created from dict with both context percentage fields."""
+        session_id = "test-session"
+
+        # Test creating WorkingMemoryResponse from dict (simulating API response parsing)
+        response_dict = {
+            "session_id": session_id,
+            "messages": [],
+            "memories": [],
+            "data": {},
+            "context": None,
+            "user_id": None,
+            "context_percentage_total_used": 33.3,
+            "context_percentage_until_summarization": 47.5,
+            "tokens": 0,
+            "namespace": None,
+            "ttl_seconds": None,
+            "last_accessed": "2024-01-01T00:00:00Z",
+        }
+
+        # This simulates what happens when the API client parses the JSON response
+        result = WorkingMemoryResponse(**response_dict)
+
+        # Verify both context percentage fields are included
+        assert isinstance(result, WorkingMemoryResponse)
+        assert result.context_percentage_total_used == 33.3
+        assert result.context_percentage_until_summarization == 47.5
+        assert result.session_id == session_id
77 changes: 73 additions & 4 deletions agent_memory_server/api.py
@@ -63,6 +63,45 @@ def _calculate_messages_token_count(messages: list[MemoryMessage]) -> int:
     return total_tokens
 
 
+def _calculate_context_usage_percentages(
+    messages: list[MemoryMessage],
+    model_name: ModelNameLiteral | None,
+    context_window_max: int | None,
+) -> tuple[float | None, float | None]:
+    """
+    Calculate context usage percentages for total usage and until summarization triggers.
+
+    Args:
+        messages: List of messages to calculate token count for
+        model_name: The client's LLM model name for context window determination
+        context_window_max: Direct specification of context window max tokens
+
+    Returns:
+        Tuple of (total_percentage, until_summarization_percentage)
+        - total_percentage: Percentage (0-100) of total context window used
+        - until_summarization_percentage: Percentage (0-100) until summarization triggers
+        Both values are None if no model info provided
+    """
+    if not messages or (not model_name and not context_window_max):
+        return None, None
+
+    # Calculate current token usage
+    current_tokens = _calculate_messages_token_count(messages)
+
+    # Get effective token limit for the client's model
+    max_tokens = _get_effective_token_limit(model_name, context_window_max)
+
+    # Calculate percentage of total context window used
+    total_percentage = (current_tokens / max_tokens) * 100.0
+
+    # Calculate percentage until summarization threshold
+    token_threshold = int(max_tokens * settings.summarization_threshold)
+    until_summarization_percentage = (current_tokens / token_threshold) * 100.0
+
+    # Cap both at 100% for display purposes
+    return min(total_percentage, 100.0), min(until_summarization_percentage, 100.0)
+
+
 async def _summarize_working_memory(
     memory: WorkingMemory,
     model_name: ModelNameLiteral | None = None,
@@ -88,8 +88,8 @@ async def _summarize_working_memory(
     max_tokens = _get_effective_token_limit(model_name, context_window_max)
 
     # Reserve space for new messages, function calls, and response generation
-    # Use 70% of context window to leave room for new content
-    token_threshold = int(max_tokens * 0.7)
+    # Use configurable threshold to leave room for new content
+    token_threshold = int(max_tokens * settings.summarization_threshold)
 
     if current_tokens <= token_threshold:
         return memory
@@ -269,7 +269,22 @@ async def get_working_memory(
 
     logger.debug(f"Working mem: {working_mem}")
 
-    return working_mem
+    # Calculate context usage percentages
+    total_percentage, until_summarization_percentage = (
+        _calculate_context_usage_percentages(
+            messages=working_mem.messages,
+            model_name=model_name,
+            context_window_max=context_window_max,
+        )
+    )
+
+    # Return WorkingMemoryResponse with both percentage values
+    working_mem_data = working_mem.model_dump()
+    working_mem_data["context_percentage_total_used"] = total_percentage
+    working_mem_data["context_percentage_until_summarization"] = (
+        until_summarization_percentage
+    )
+    return WorkingMemoryResponse(**working_mem_data)
 
 
 @router.put("/v1/working-memory/{session_id}", response_model=WorkingMemoryResponse)
@@ -348,7 +402,22 @@ async def put_working_memory(
         namespace=updated_memory.namespace,
     )
 
-    return updated_memory
+    # Calculate context usage percentages based on the final state (after potential summarization)
+    total_percentage, until_summarization_percentage = (
+        _calculate_context_usage_percentages(
+            messages=updated_memory.messages,
+            model_name=model_name,
+            context_window_max=context_window_max,
+        )
+    )
+
+    # Return WorkingMemoryResponse with both percentage values
+    updated_memory_data = updated_memory.model_dump()
+    updated_memory_data["context_percentage_total_used"] = total_percentage
+    updated_memory_data["context_percentage_until_summarization"] = (
+        until_summarization_percentage
+    )
+    return WorkingMemoryResponse(**updated_memory_data)
 
 
 @router.delete("/v1/working-memory/{session_id}", response_model=AckResponse)
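
As a sanity check on the arithmetic in `_calculate_context_usage_percentages` (illustrative numbers, not taken from the PR): with 3,500 message tokens, an 8,000-token window, and the default `summarization_threshold` of 0.7, the threshold is 5,600 tokens, so total usage is 43.75% and progress toward summarization is 62.5%. A self-contained sketch of the same math:

```python
def context_percentages(
    current_tokens: int, max_tokens: int, threshold: float = 0.7
) -> tuple[float, float]:
    # Mirrors the endpoint logic: total window usage and progress toward
    # the summarization threshold, both capped at 100 for display.
    total = (current_tokens / max_tokens) * 100.0
    until = (current_tokens / int(max_tokens * threshold)) * 100.0
    return min(total, 100.0), min(until, 100.0)

print(context_percentages(3500, 8000))  # (43.75, 62.5)
```

Because the until-summarization figure divides by the smaller threshold count, it always reaches 100% before total usage does, matching the field descriptions in the models.
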
3 changes: 3 additions & 0 deletions agent_memory_server/config.py
@@ -119,6 +119,9 @@ class Settings(BaseSettings):
 
     # Working memory settings
     window_size: int = 20  # Default number of recent messages to return
+    summarization_threshold: float = (
+        0.7  # Fraction of context window that triggers summarization
+    )
 
     # Other Application settings
     log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = "INFO"
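
Since `Settings` extends pydantic's `BaseSettings`, the new knob should be overridable through the environment like the existing settings. The variable name below assumes pydantic-settings' default case-insensitive field-name mapping, which this diff does not show, so treat it as unverified:

```python
import os

# Assumed env mapping; confirm against the project's Settings config.
os.environ["SUMMARIZATION_THRESHOLD"] = "0.8"

from agent_memory_server.config import Settings

print(Settings().summarization_threshold)  # 0.8 if the default mapping applies
```
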
9 changes: 9 additions & 0 deletions agent_memory_server/models.py
@@ -222,6 +222,15 @@ class WorkingMemory(BaseModel):
 class WorkingMemoryResponse(WorkingMemory):
     """Response containing working memory"""
 
+    context_percentage_total_used: float | None = Field(
+        default=None,
+        description="Percentage of total context window currently used (0-100)",
+    )
+    context_percentage_until_summarization: float | None = Field(
+        default=None,
+        description="Percentage until auto-summarization triggers (0-100, reaches 100% at summarization threshold)",
+    )
+
 
 class WorkingMemoryRequest(BaseModel):
     """Request parameters for working memory operations"""
5 changes: 1 addition & 4 deletions docs/memory-types.md
@@ -202,11 +202,8 @@ Long-term memory supports three types of memories:
 # Create long-term memories
 POST /v1/long-term-memory/
 
-# Search long-term memories only
+# Search long-term memories
 POST /v1/long-term-memory/search
-
-# Search across all memory types
-POST /v1/memory/search
 ```
 
 ### Search Capabilities
Binary file added dump.rdb (contents not shown)