diff --git a/agent-memory-client/agent_memory_client/models.py b/agent-memory-client/agent_memory_client/models.py
index bc731c9..a77b1ea 100644
--- a/agent-memory-client/agent_memory_client/models.py
+++ b/agent-memory-client/agent_memory_client/models.py
@@ -215,7 +215,14 @@ class SessionListResponse(BaseModel):
 class WorkingMemoryResponse(WorkingMemory):
     """Response from working memory operations"""
 
-    pass
+    context_percentage_total_used: float | None = Field(
+        default=None,
+        description="Percentage of total context window currently used (0-100)",
+    )
+    context_percentage_until_summarization: float | None = Field(
+        default=None,
+        description="Percentage until auto-summarization triggers (0-100, reaches 100% at summarization threshold)",
+    )
 
 
 class MemoryRecordResult(MemoryRecord):
diff --git a/agent-memory-client/tests/test_client.py b/agent-memory-client/tests/test_client.py
index e49f615..a77619f 100644
--- a/agent-memory-client/tests/test_client.py
+++ b/agent-memory-client/tests/test_client.py
@@ -653,3 +653,130 @@ def test_validation_with_none_values(self, enhanced_test_client):
 
         # Should not raise
         enhanced_test_client.validate_memory_record(memory)
+
+
+class TestContextUsagePercentage:
+    """Tests for context usage percentage functionality."""
+
+    @pytest.mark.asyncio
+    async def test_working_memory_response_with_context_percentages(
+        self, enhanced_test_client
+    ):
+        """Test that WorkingMemoryResponse properly handles both context percentage fields."""
+        session_id = "test-session"
+
+        # Test with both context percentages set
+        working_memory_response = WorkingMemoryResponse(
+            session_id=session_id,
+            messages=[],
+            memories=[],
+            data={},
+            context=None,
+            user_id=None,
+            context_percentage_total_used=45.5,
+            context_percentage_until_summarization=65.0,
+        )
+
+        assert working_memory_response.context_percentage_total_used == 45.5
+        assert working_memory_response.context_percentage_until_summarization == 65.0
+        assert working_memory_response.session_id == session_id
+
+        # Test with None context percentages (default)
+        working_memory_response_none = WorkingMemoryResponse(
+            session_id=session_id,
+            messages=[],
+            memories=[],
+            data={},
+            context=None,
+            user_id=None,
+        )
+
+        assert working_memory_response_none.context_percentage_total_used is None
+        assert (
+            working_memory_response_none.context_percentage_until_summarization is None
+        )
+
+    @pytest.mark.asyncio
+    async def test_context_percentages_serialization(self, enhanced_test_client):
+        """Test that both context percentage fields are properly serialized."""
+        session_id = "test-session"
+
+        # Create response with both context percentages
+        working_memory_response = WorkingMemoryResponse(
+            session_id=session_id,
+            messages=[],
+            memories=[],
+            data={},
+            context=None,
+            user_id=None,
+            context_percentage_total_used=75.0,
+            context_percentage_until_summarization=85.5,
+        )
+
+        # Test model_dump includes both fields
+        dumped = working_memory_response.model_dump()
+        assert "context_percentage_total_used" in dumped
+        assert "context_percentage_until_summarization" in dumped
+        assert dumped["context_percentage_total_used"] == 75.0
+        assert dumped["context_percentage_until_summarization"] == 85.5
+
+        # Test JSON serialization
+        json_data = working_memory_response.model_dump_json()
+        assert "context_percentage_total_used" in json_data
+        assert "context_percentage_until_summarization" in json_data
+        assert "75.0" in json_data
+        assert "85.5" in json_data
+
+    @pytest.mark.asyncio
+    async def test_context_percentages_validation(self, enhanced_test_client):
+        """Test that both context percentage fields accept valid values."""
+        session_id = "test-session"
+
+        # Test valid percentages
+        valid_percentages = [0.0, 25.5, 50.0, 99.9, 100.0, None]
+
+        for percentage in valid_percentages:
+            working_memory_response = WorkingMemoryResponse(
+                session_id=session_id,
+                messages=[],
+                memories=[],
+                data={},
+                context=None,
+                user_id=None,
+                context_percentage_total_used=percentage,
+                context_percentage_until_summarization=percentage,
+            )
+            assert working_memory_response.context_percentage_total_used == percentage
+            assert (
+                working_memory_response.context_percentage_until_summarization
+                == percentage
+            )
+
+    def test_working_memory_response_from_dict_with_context_percentages(self):
+        """Test that WorkingMemoryResponse can be created from dict with both context percentage fields."""
+        session_id = "test-session"
+
+        # Test creating WorkingMemoryResponse from dict (simulating API response parsing)
+        response_dict = {
+            "session_id": session_id,
+            "messages": [],
+            "memories": [],
+            "data": {},
+            "context": None,
+            "user_id": None,
+            "context_percentage_total_used": 33.3,
+            "context_percentage_until_summarization": 47.5,
+            "tokens": 0,
+            "namespace": None,
+            "ttl_seconds": None,
+            "last_accessed": "2024-01-01T00:00:00Z",
+        }
+
+        # This simulates what happens when the API client parses the JSON response
+        result = WorkingMemoryResponse(**response_dict)
+
+        # Verify both context percentage fields are included
+        assert isinstance(result, WorkingMemoryResponse)
+        assert result.context_percentage_total_used == 33.3
+        assert result.context_percentage_until_summarization == 47.5
+        assert result.session_id == session_id
diff --git a/agent_memory_server/api.py b/agent_memory_server/api.py
index b5c8a47..578795d 100644
--- a/agent_memory_server/api.py
+++ b/agent_memory_server/api.py
@@ -63,6 +63,52 @@ def _calculate_messages_token_count(messages: list[MemoryMessage]) -> int:
     return total_tokens
 
 
+def _calculate_context_usage_percentages(
+    messages: list[MemoryMessage],
+    model_name: ModelNameLiteral | None,
+    context_window_max: int | None,
+) -> tuple[float | None, float | None]:
+    """
+    Calculate context usage percentages for total usage and until summarization triggers.
+
+    Args:
+        messages: List of messages to calculate token count for
+        model_name: The client's LLM model name for context window determination
+        context_window_max: Direct specification of context window max tokens
+
+    Returns:
+        Tuple of (total_percentage, until_summarization_percentage)
+        - total_percentage: Percentage (0-100) of total context window used
+        - until_summarization_percentage: Percentage (0-100) until summarization triggers
+        Both values are None if there are no messages or no model info is provided
+    """
+    if not messages or (not model_name and not context_window_max):
+        return None, None
+
+    # Calculate current token usage
+    current_tokens = _calculate_messages_token_count(messages)
+
+    # Get effective token limit for the client's model
+    max_tokens = _get_effective_token_limit(model_name, context_window_max)
+
+    # Calculate percentage of total context window used. A non-positive limit
+    # counts as fully used instead of raising ZeroDivisionError.
+    total_percentage = (
+        (current_tokens / max_tokens) * 100.0 if max_tokens > 0 else 100.0
+    )
+
+    # Calculate percentage until summarization threshold. int() truncation of a
+    # very small limit, or a summarization_threshold of 0, makes the threshold
+    # 0, so guard the division the same way.
+    token_threshold = int(max_tokens * settings.summarization_threshold)
+    until_summarization_percentage = (
+        (current_tokens / token_threshold) * 100.0 if token_threshold > 0 else 100.0
+    )
+
+    # Cap both at 100% for display purposes
+    return min(total_percentage, 100.0), min(until_summarization_percentage, 100.0)
+
+
 async def _summarize_working_memory(
     memory: WorkingMemory,
     model_name: ModelNameLiteral | None = None,
@@ -88,8 +134,8 @@ async def _summarize_working_memory(
     max_tokens = _get_effective_token_limit(model_name, context_window_max)
 
     # Reserve space for new messages, function calls, and response generation
-    # Use 70% of context window to leave room for new content
-    token_threshold = int(max_tokens * 0.7)
+    # Use configurable threshold to leave room for new content
+    token_threshold = int(max_tokens * settings.summarization_threshold)
 
     if current_tokens <= token_threshold:
         return memory
@@ -269,7 +315,22 @@ async def get_working_memory(
 
     logger.debug(f"Working mem: {working_mem}")
 
-    return working_mem
+    # Calculate context usage percentages
+    total_percentage, until_summarization_percentage = (
+        _calculate_context_usage_percentages(
+            messages=working_mem.messages,
+            model_name=model_name,
+            context_window_max=context_window_max,
+        )
+    )
+
+    # Return WorkingMemoryResponse with both percentage values
+    working_mem_data = working_mem.model_dump()
+    working_mem_data["context_percentage_total_used"] = total_percentage
+    working_mem_data["context_percentage_until_summarization"] = (
+        until_summarization_percentage
+    )
+    return WorkingMemoryResponse(**working_mem_data)
 
 
 @router.put("/v1/working-memory/{session_id}", response_model=WorkingMemoryResponse)
@@ -348,7 +409,22 @@ async def put_working_memory(
             namespace=updated_memory.namespace,
         )
 
-    return updated_memory
+    # Calculate context usage percentages based on the final state (after potential summarization)
+    total_percentage, until_summarization_percentage = (
+        _calculate_context_usage_percentages(
+            messages=updated_memory.messages,
+            model_name=model_name,
+            context_window_max=context_window_max,
+        )
+    )
+
+    # Return WorkingMemoryResponse with both percentage values
+    updated_memory_data = updated_memory.model_dump()
+    updated_memory_data["context_percentage_total_used"] = total_percentage
+    updated_memory_data["context_percentage_until_summarization"] = (
+        until_summarization_percentage
+    )
+    return WorkingMemoryResponse(**updated_memory_data)
 
 
 @router.delete("/v1/working-memory/{session_id}", response_model=AckResponse)
diff --git a/agent_memory_server/config.py b/agent_memory_server/config.py
index 8fdde4d..73acbb2 100644
--- a/agent_memory_server/config.py
+++ b/agent_memory_server/config.py
@@ -119,6 +119,9 @@ class Settings(BaseSettings):
 
     # Working memory settings
     window_size: int = 20  # Default number of recent messages to return
+    summarization_threshold: float = (
+        0.7  # Fraction of context window that triggers summarization
+    )
 
     # Other Application settings
     log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = "INFO"
diff --git a/agent_memory_server/models.py b/agent_memory_server/models.py
index 7fda47c..204dfdf 100644
--- a/agent_memory_server/models.py
+++ b/agent_memory_server/models.py
@@ -222,6 +222,15 @@ class WorkingMemory(BaseModel):
 class WorkingMemoryResponse(WorkingMemory):
     """Response containing working memory"""
 
+    context_percentage_total_used: float | None = Field(
+        default=None,
+        description="Percentage of total context window currently used (0-100)",
+    )
+    context_percentage_until_summarization: float | None = Field(
+        default=None,
+        description="Percentage until auto-summarization triggers (0-100, reaches 100% at summarization threshold)",
+    )
+
 
 class WorkingMemoryRequest(BaseModel):
     """Request parameters for working memory operations"""
diff --git a/docs/memory-types.md b/docs/memory-types.md
index c1cf549..02bf30d 100644
--- a/docs/memory-types.md
+++ b/docs/memory-types.md
@@ -202,11 +202,8 @@ Long-term memory supports three types of memories:
 # Create long-term memories
 POST /v1/long-term-memory/
 
-# Search long-term memories only
+# Search long-term memories
 POST /v1/long-term-memory/search
-
-# Search across all memory types
-POST /v1/memory/search
 ```
 
 ### Search Capabilities
diff --git a/dump.rdb b/dump.rdb
new file mode 100644
index 0000000..f6dbd0e
Binary files /dev/null and b/dump.rdb differ