From b842bc9b7e0ca9b4f7393864dfdcdbad3f1c8b87 Mon Sep 17 00:00:00 2001
From: Kylejeong2 <kylejeong21@gmail.com>
Date: Wed, 9 Jul 2025 16:32:55 -1000
Subject: [PATCH 1/2] v0 evals

---
 evals/README.md                     | 348 ++++++++++++++++++++++++++++
 evals/env.example                   |  27 +++
 evals/mcp-eval-advanced.config.json | 212 +++++++++++++++++
 evals/mcp-eval-minimal.config.json  |  77 ++++++
 evals/mcp-eval.config.json          | 183 +++++++++++++++
 evals/run-evals.ts                  | 314 +++++++++++++++++++++++++
 package.json                        |  13 +-
 pnpm-lock.yaml                      |   9 +
 pnpm-workspace.yaml                 |   5 +-
 src/tools/multiSession.ts           |   2 +-
 src/tools/session.ts                |   2 +-
 tests/.gitkeep                      |   0
 12 files changed, 1188 insertions(+), 4 deletions(-)
 create mode 100644 evals/README.md
 create mode 100644 evals/env.example
 create mode 100644 evals/mcp-eval-advanced.config.json
 create mode 100644 evals/mcp-eval-minimal.config.json
 create mode 100644 evals/mcp-eval.config.json
 create mode 100644 evals/run-evals.ts
 delete mode 100644 tests/.gitkeep

diff --git a/evals/README.md b/evals/README.md
new file mode 100644
index 0000000..45cbfe1
--- /dev/null
+++ b/evals/README.md
@@ -0,0 +1,348 @@
+# Browserbase MCP Server Evaluation Tests
+
+This directory contains comprehensive evaluation tests for the Browserbase MCP Server using [MCPVals](https://github.com/modelcontextprotocol/mcpvals), a testing framework that uses Claude to autonomously execute test workflows based on natural language descriptions.
+
+## Quick Start
+
+1. **Install dependencies:**
+
+   ```bash
+   npm run test:install
+   ```
+
+2. **Set up environment variables:**
+
+   ```bash
+   export BROWSERBASE_API_KEY="your_api_key_here"
+   export BROWSERBASE_PROJECT_ID="your_project_id_here"
+   export ANTHROPIC_API_KEY="your_anthropic_key_here"
+   ```
+
+3. **Run basic tests:**
+   ```bash
+   npm test
+   ```
+
+## Test Configurations
+
+We provide three levels of testing:
+
+### 1. Minimal Tests (`mcp-eval-minimal.config.json`)
+
+- **Purpose**: Quick smoke tests to verify basic functionality
+- **Duration**: ~2-3 minutes
+- **Tests**: 3 workflows covering navigation, extraction, and multi-session basics
+- **Usage**: `npm run test:minimal`
+
+### 2. Standard Tests (`mcp-eval.config.json`)
+
+- **Purpose**: Comprehensive functionality testing
+- **Duration**: ~5-10 minutes
+- **Tests**: 7 workflows covering all major features
+- **Usage**: `npm test`
+
+### 3. Advanced Tests (`mcp-eval-advanced.config.json`)
+
+- **Purpose**: Complex scenarios with LLM judge evaluation
+- **Duration**: ~10-15 minutes
+- **Tests**: 6 workflows with subjective quality assessment
+- **Usage**: `npm run test:advanced` (requires `OPENAI_API_KEY`)
+
+## Available Scripts
+
+| Script                  | Description                       |
+| ----------------------- | --------------------------------- |
+| `npm test`              | Run standard evaluation tests     |
+| `npm run test:minimal`  | Run minimal smoke tests           |
+| `npm run test:advanced` | Run advanced tests with LLM judge |
+| `npm run test:all`      | Run all test suites               |
+| `npm run test:debug`    | Run tests with debug output       |
+| `npm run test:json`     | Run tests with JSON output        |
+| `npm run test:install`  | Install test dependencies         |
+| `npm run test:runner`   | Direct access to test runner      |
+
+## Understanding the Test Framework
+
+### How MCPVals Works
+
+MCPVals uses Claude to autonomously execute test workflows:
+
+1. **Natural Language Instructions**: Tests are written as natural language prompts
+2. **Autonomous Execution**: Claude examines available MCP tools and plans execution
+3. **Tool Invocation**: Claude calls the appropriate MCP tools to accomplish tasks
+4. **Deterministic Evaluation**: Results are evaluated against expected outcomes
+
+### Test Structure
+
+Each test workflow contains:
+
+```json
+{
+  "name": "test-name",
+  "description": "What this test validates",
+  "steps": [
+    {
+      "user": "Natural language instruction",
+      "expectedState": "Expected substring in output"
+    }
+  ],
+  "expectTools": ["list", "of", "expected", "tools"]
+}
+```
+
+### Evaluation Metrics
+
+Each test is evaluated on three metrics:
+
+1. **End-to-End Success** (0-100%): Did the workflow achieve the expected final state?
+2. **Tool Invocation Order** (0-100%): Were the expected tools called in the correct sequence?
+3. **Tool Call Health** (0-100%): Did all tool calls complete successfully without errors?
+
+**Overall Score** = Average of all three metrics
+
+## Test Workflows
+
+### Minimal Test Suite
+
+| Workflow                   | Description              | Expected Tools                                    |
+| -------------------------- | ------------------------ | ------------------------------------------------- |
+| `smoke-test-navigation`    | Basic browser navigation | session_create, navigate, session_close           |
+| `smoke-test-extraction`    | Basic content extraction | session_create, navigate, extract, session_close  |
+| `smoke-test-multi-session` | Multi-session management | multi_session_create, session_list, session_close |
+
+### Standard Test Suite
+
+| Workflow                    | Description                | Key Features                 |
+| --------------------------- | -------------------------- | ---------------------------- |
+| `basic-navigation-test`     | Navigation to Google       | Basic browser control        |
+| `search-and-extract-test`   | Search and extract results | Form interaction, extraction |
+| `observe-and-interact-test` | Element observation        | DOM inspection               |
+| `screenshot-test`           | Screenshot capture         | Visual documentation         |
+| `multi-session-test`        | Parallel browser sessions  | Multi-session management     |
+| `form-interaction-test`     | Form filling               | Input handling               |
+| `error-handling-test`       | Error scenarios            | Error recovery               |
+
+### Advanced Test Suite
+
+| Workflow                     | Description                 | LLM Judge |
+| ---------------------------- | --------------------------- | --------- |
+| `e-commerce-workflow`        | Realistic browsing patterns | ✓         |
+| `form-interaction-workflow`  | Complex form handling       | ✓         |
+| `dynamic-content-handling`   | JavaScript content          | ✓         |
+| `multi-session-workflow`     | Advanced multi-session      | ✓         |
+| `error-recovery-workflow`    | Error handling & recovery   | ✓         |
+| `comprehensive-feature-test` | All features combined       | ✓         |
+
+## Environment Setup
+
+### Required Environment Variables
+
+Copy the example environment file and update with your credentials:
+
+```bash
+cp evals/env.example .env
+# Edit .env with your actual API keys
+```
+
+Or set the environment variables directly:
+
+```bash
+# Browserbase credentials (required)
+export BROWSERBASE_API_KEY="bb_api_key_..."
+export BROWSERBASE_PROJECT_ID="bb_project_id_..."
+
+# Anthropic API key (required for Claude execution)
+export ANTHROPIC_API_KEY="sk-ant-..."
+
+# OpenAI API key (required for LLM judge)
+export OPENAI_API_KEY="sk-..."
+```
+
+### Optional Configuration
+
+You can override placeholder values in the config files by setting environment variables:
+
+```bash
+# Override config placeholders
+export BROWSERBASE_API_KEY="your_real_key"
+export BROWSERBASE_PROJECT_ID="your_real_project"
+```
+
+## Running Tests
+
+### Command Line Interface
+
+```bash
+# Basic usage
+npm test
+
+# With options
+npm run test:debug     # Enable debug output
+npm run test:json      # Output results as JSON
+npm run test:minimal   # Run minimal tests
+npm run test:advanced  # Run advanced tests with LLM judge
+npm run test:all       # Run all test suites
+```
+
+### TypeScript Runner
+
+```bash
+# Using the TypeScript runner directly
+npm run test:runner run --config evals/mcp-eval.config.json --debug
+```
+
+## Interpreting Results
+
+### Console Output
+
+```
+✓ basic-navigation-test PASSED (100%)
+  ✓ End-to-End Success: 100%
+  ✓ Tool Invocation Order: 100%
+  ✓ Tool Call Health: 100%
+
+✗ search-and-extract-test FAILED (89%)
+  ✓ End-to-End Success: 100%
+  ✗ Tool Invocation Order: 67% (2/3 tools in correct order)
+  ✓ Tool Call Health: 100%
+```
+
+### Understanding Failures
+
+Common failure patterns:
+
+1. **Tool Order Issues**: Expected tools not called in sequence
+2. **Missing Tools**: Expected tools not invoked
+3. **Tool Errors**: Tools returning errors or timeouts
+4. **Wrong Output**: Expected state not found in results
+
+## Troubleshooting
+
+### Common Issues
+
+1. **Missing Dependencies**
+
+   ```bash
+   npm run test:install
+   ```
+
+2. **Environment Variables Not Set**
+
+   ```bash
+   # Check if variables are set
+   echo $BROWSERBASE_API_KEY
+   echo $BROWSERBASE_PROJECT_ID
+   echo $ANTHROPIC_API_KEY
+   ```
+
+3. **Timeout Issues**
+   - Increase timeout in config files
+   - Check network connectivity
+   - Verify Browserbase service status
+
+4. **Tool Not Found Errors**
+   - Verify MCP server is running correctly
+   - Check tool names in config match server exports
+   - Run with `npm run test:debug` for detailed output
+
+### Debug Mode
+
+Run tests with debug output to see detailed execution:
+
+```bash
+npm run test:debug
+```
+
+This shows:
+
+- Raw tool calls and responses
+- Claude's reasoning process
+- Network requests and responses
+- Detailed error messages
+
+## Extending Tests
+
+### Adding New Workflows
+
+1. **Choose appropriate config file** based on complexity
+2. **Write natural language steps** that describe user intent
+3. **Specify expected tools** that should be called
+4. **Set expected states** for validation
+5. **Test your workflow** with debug mode
+
+### Example New Workflow
+
+```json
+{
+  "name": "custom-workflow",
+  "description": "Test custom functionality",
+  "steps": [
+    {
+      "user": "Navigate to example.com and find all links",
+      "expectedState": "found links"
+    },
+    {
+      "user": "Click on the first link",
+      "expectedState": "clicked"
+    }
+  ],
+  "expectTools": [
+    "browserbase_session_create",
+    "browserbase_stagehand_navigate",
+    "browserbase_stagehand_observe",
+    "browserbase_stagehand_act"
+  ]
+}
+```
+
+## CI/CD Integration
+
+### GitHub Actions
+
+```yaml
+name: MCP Server Tests
+on: [push, pull_request]
+
+jobs:
+  test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-node@v4
+        with:
+          node-version: "18"
+      - run: npm ci
+      - run: npm run test:install
+      - run: npm run test:minimal
+        env:
+          BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
+          BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+```
+
+### Exit Codes
+
+- `0`: All tests passed
+- `1`: One or more tests failed
+- `2`: Configuration or setup error
+
+## Best Practices
+
+1. **Start with minimal tests** for quick feedback
+2. **Use realistic scenarios** that match actual usage
+3. **Include error cases** to test robustness
+4. **Keep expected states simple** but unique
+5. **Run tests regularly** to catch regressions
+6. **Use debug mode** when developing new tests
+
+## Support
+
+For issues with:
+
+- **MCPVals framework**: Check the [MCPVals documentation](https://github.com/modelcontextprotocol/mcpvals)
+- **Browserbase integration**: Visit [Browserbase docs](https://docs.browserbase.com)
+- **MCP Server**: Open an issue in this repository
+
+## License
+
+These tests are part of the Browserbase MCP Server project and are licensed under the Apache License 2.0.
diff --git a/evals/env.example b/evals/env.example
new file mode 100644
index 0000000..7cf0585
--- /dev/null
+++ b/evals/env.example
@@ -0,0 +1,27 @@
+# Browserbase MCP Server Evaluation Environment Variables
+
+# Required: Browserbase API credentials
+# Get these from https://www.browserbase.com/dashboard
+BROWSERBASE_API_KEY=bb_api_key_your_key_here
+BROWSERBASE_PROJECT_ID=bb_project_id_your_project_id_here
+
+# Required: Anthropic API key for Claude (workflow execution)
+# Get this from https://console.anthropic.com/
+ANTHROPIC_API_KEY=sk-ant-your_anthropic_key_here
+
+# Optional: OpenAI API key for LLM judge (advanced tests only)
+# Get this from https://platform.openai.com/api-keys
+OPENAI_API_KEY=sk-your_openai_key_here
+
+# Optional: Override default timeouts (in milliseconds)
+# EVAL_TIMEOUT=60000
+
+# Optional: Override the default judge model and pass threshold
+# EVAL_JUDGE_MODEL=gpt-4o
+# EVAL_PASS_THRESHOLD=0.8
+
+# Usage Instructions:
+# 1. Copy this file to .env: cp evals/env.example .env
+# 2. Replace the placeholder values with your actual API keys
+# 3. Run: set -a; source .env; set +a (plain "source" does not export the values; or use direnv/dotenv)
+# 4. Run tests: npm test 
\ No newline at end of file
diff --git a/evals/mcp-eval-advanced.config.json b/evals/mcp-eval-advanced.config.json
new file mode 100644
index 0000000..9ba0593
--- /dev/null
+++ b/evals/mcp-eval-advanced.config.json
@@ -0,0 +1,212 @@
+{
+  "server": {
+    "transport": "stdio",
+    "command": "node",
+    "args": ["./cli.js"],
+    "env": {
+      "BROWSERBASE_API_KEY": "${BROWSERBASE_API_KEY}",
+      "BROWSERBASE_PROJECT_ID": "${BROWSERBASE_PROJECT_ID}"
+    }
+  },
+  "timeout": 90000,
+  "llmJudge": true,
+  "openaiKey": "${OPENAI_API_KEY}",
+  "judgeModel": "gpt-4o",
+  "passThreshold": 0.8,
+  "workflows": [
+    {
+      "name": "e-commerce-workflow",
+      "description": "Test a realistic e-commerce browsing workflow",
+      "steps": [
+        {
+          "user": "Go to example.com and take a screenshot",
+          "expectedState": "Example Domain"
+        },
+        {
+          "user": "Navigate to a different page and observe the page elements",
+          "expectedState": "observed"
+        },
+        {
+          "user": "Take another screenshot to compare",
+          "expectedState": "Screenshot taken"
+        },
+        {
+          "user": "Close the browser",
+          "expectedState": "session closed"
+        }
+      ],
+      "expectTools": [
+        "browserbase_session_create",
+        "browserbase_stagehand_navigate",
+        "browserbase_screenshot",
+        "browserbase_stagehand_observe",
+        "browserbase_session_close"
+      ]
+    },
+    {
+      "name": "form-interaction-workflow",
+      "description": "Test form interaction and validation",
+      "steps": [
+        {
+          "user": "Navigate to httpbin.org/forms/post",
+          "expectedState": "httpbin"
+        },
+        {
+          "user": "Observe the form fields available",
+          "expectedState": "form"
+        },
+        {
+          "user": "Fill in the customer name field with 'Test User'",
+          "expectedState": "Test User"
+        },
+        {
+          "user": "Extract the form data to verify it was filled correctly",
+          "expectedState": "Test User"
+        },
+        {
+          "user": "Close the session",
+          "expectedState": "session closed"
+        }
+      ],
+      "expectTools": [
+        "browserbase_session_create",
+        "browserbase_stagehand_navigate",
+        "browserbase_stagehand_observe",
+        "browserbase_stagehand_act",
+        "browserbase_stagehand_extract",
+        "browserbase_session_close"
+      ]
+    },
+    {
+      "name": "dynamic-content-handling",
+      "description": "Test handling of dynamic content",
+      "steps": [
+        {
+          "user": "Navigate to example.com",
+          "expectedState": "Example Domain"
+        },
+        {
+          "user": "Extract the page title and content",
+          "expectedState": "Example Domain"
+        },
+        {
+          "user": "Take a screenshot of the page",
+          "expectedState": "Screenshot taken"
+        },
+        {
+          "user": "Close the browser",
+          "expectedState": "session closed"
+        }
+      ],
+      "expectTools": [
+        "browserbase_session_create",
+        "browserbase_stagehand_navigate",
+        "browserbase_stagehand_extract",
+        "browserbase_screenshot",
+        "browserbase_session_close"
+      ]
+    },
+    {
+      "name": "multi-session-workflow",
+      "description": "Test handling multiple sessions",
+      "steps": [
+        {
+          "user": "Create two separate browser sessions named 'session-a' and 'session-b'",
+          "expectedState": "Created session"
+        },
+        {
+          "user": "List all active sessions to verify they were created",
+          "expectedState": "session-a"
+        },
+        {
+          "user": "In session-a, navigate to example.com",
+          "expectedState": "example.com"
+        },
+        {
+          "user": "In session-b, navigate to httpbin.org",
+          "expectedState": "httpbin"
+        },
+        {
+          "user": "Extract content from both sessions",
+          "expectedState": "extracted"
+        },
+        {
+          "user": "Close both sessions",
+          "expectedState": "closed session"
+        }
+      ],
+      "expectTools": [
+        "multi_browserbase_stagehand_session_create",
+        "multi_browserbase_stagehand_session_list",
+        "multi_browserbase_stagehand_navigate_session",
+        "multi_browserbase_stagehand_extract_session",
+        "multi_browserbase_stagehand_session_close"
+      ]
+    },
+    {
+      "name": "error-recovery-workflow",
+      "description": "Test graceful error handling and recovery",
+      "steps": [
+        {
+          "user": "Create a browser session",
+          "expectedState": "session created"
+        },
+        {
+          "user": "Navigate to example.com first",
+          "expectedState": "example.com"
+        },
+        {
+          "user": "Try to interact with a non-existent element",
+          "expectedState": "error"
+        },
+        {
+          "user": "Verify the session is still working by extracting the page title",
+          "expectedState": "Example Domain"
+        },
+        {
+          "user": "Close the session",
+          "expectedState": "session closed"
+        }
+      ],
+      "expectTools": [
+        "browserbase_session_create",
+        "browserbase_stagehand_navigate",
+        "browserbase_stagehand_act",
+        "browserbase_stagehand_extract",
+        "browserbase_session_close"
+      ]
+    },
+    {
+      "name": "comprehensive-feature-test",
+      "description": "Test all major features together",
+      "steps": [
+        {
+          "user": "Create a multi-session setup with 3 sessions",
+          "expectedState": "Created session"
+        },
+        {
+          "user": "Navigate each session to different pages",
+          "expectedState": "navigated"
+        },
+        {
+          "user": "Take screenshots from each session",
+          "expectedState": "Screenshot taken"
+        },
+        {
+          "user": "Extract content from all sessions",
+          "expectedState": "extracted"
+        },
+        {
+          "user": "Close all sessions",
+          "expectedState": "closed session"
+        }
+      ],
+      "expectTools": [
+        "multi_browserbase_stagehand_session_create",
+        "multi_browserbase_stagehand_navigate_session",
+        "multi_browserbase_stagehand_extract_session",
+        "multi_browserbase_stagehand_session_close"
+      ]
+    }
+  ]
+}
diff --git a/evals/mcp-eval-minimal.config.json b/evals/mcp-eval-minimal.config.json
new file mode 100644
index 0000000..051f147
--- /dev/null
+++ b/evals/mcp-eval-minimal.config.json
@@ -0,0 +1,77 @@
+{
+  "server": {
+    "transport": "stdio",
+    "command": "node",
+    "args": ["./cli.js"],
+    "env": {
+      "BROWSERBASE_API_KEY": "${BROWSERBASE_API_KEY}",
+      "BROWSERBASE_PROJECT_ID": "${BROWSERBASE_PROJECT_ID}"
+    }
+  },
+  "timeout": 30000,
+  "llmJudge": false,
+  "workflows": [
+    {
+      "name": "smoke-test-navigation",
+      "description": "Quick test to verify basic navigation works",
+      "steps": [
+        {
+          "user": "Open a browser and go to example.com",
+          "expectedState": "session created"
+        },
+        {
+          "user": "Close the browser",
+          "expectedState": "session closed"
+        }
+      ],
+      "expectTools": [
+        "browserbase_session_create",
+        "browserbase_stagehand_navigate",
+        "browserbase_session_close"
+      ]
+    },
+    {
+      "name": "smoke-test-extraction",
+      "description": "Quick test to verify data extraction works",
+      "steps": [
+        {
+          "user": "Navigate to example.com and extract the page title",
+          "expectedState": "Example Domain"
+        },
+        {
+          "user": "Close the session",
+          "expectedState": "session closed"
+        }
+      ],
+      "expectTools": [
+        "browserbase_session_create",
+        "browserbase_stagehand_navigate",
+        "browserbase_stagehand_extract",
+        "browserbase_session_close"
+      ]
+    },
+    {
+      "name": "smoke-test-multi-session",
+      "description": "Quick test to verify multi-session functionality",
+      "steps": [
+        {
+          "user": "Create a browser session named 'test-session'",
+          "expectedState": "Created session"
+        },
+        {
+          "user": "List active sessions",
+          "expectedState": "test-session"
+        },
+        {
+          "user": "Close the test session",
+          "expectedState": "closed session"
+        }
+      ],
+      "expectTools": [
+        "multi_browserbase_stagehand_session_create",
+        "multi_browserbase_stagehand_session_list",
+        "multi_browserbase_stagehand_session_close"
+      ]
+    }
+  ]
+}
diff --git a/evals/mcp-eval.config.json b/evals/mcp-eval.config.json
new file mode 100644
index 0000000..a268eeb
--- /dev/null
+++ b/evals/mcp-eval.config.json
@@ -0,0 +1,183 @@
+{
+  "server": {
+    "transport": "stdio",
+    "command": "node",
+    "args": ["./cli.js"],
+    "env": {
+      "BROWSERBASE_API_KEY": "${BROWSERBASE_API_KEY}",
+      "BROWSERBASE_PROJECT_ID": "${BROWSERBASE_PROJECT_ID}"
+    }
+  },
+  "timeout": 60000,
+  "llmJudge": false,
+  "workflows": [
+    {
+      "name": "basic-navigation-test",
+      "description": "Test basic browser navigation functionality",
+      "steps": [
+        {
+          "user": "Navigate to Google homepage",
+          "expectedState": "google"
+        },
+        {
+          "user": "Close the browser session",
+          "expectedState": "session closed"
+        }
+      ],
+      "expectTools": [
+        "browserbase_session_create",
+        "browserbase_stagehand_navigate",
+        "browserbase_session_close"
+      ]
+    },
+    {
+      "name": "search-and-extract-test",
+      "description": "Test navigation, search interaction, and data extraction",
+      "steps": [
+        {
+          "user": "Go to Google and search for 'browserbase'",
+          "expectedState": "browserbase"
+        },
+        {
+          "user": "Extract all the search result titles from the page",
+          "expectedState": "Extracted content"
+        },
+        {
+          "user": "Close the session",
+          "expectedState": "session closed"
+        }
+      ],
+      "expectTools": [
+        "browserbase_session_create",
+        "browserbase_stagehand_navigate",
+        "browserbase_stagehand_act",
+        "browserbase_stagehand_extract",
+        "browserbase_session_close"
+      ]
+    },
+    {
+      "name": "observe-and-interact-test",
+      "description": "Test element observation and interaction capabilities",
+      "steps": [
+        {
+          "user": "Navigate to example.com",
+          "expectedState": "Example Domain"
+        },
+        {
+          "user": "Observe all clickable links on the page",
+          "expectedState": "links"
+        },
+        {
+          "user": "Close the browser",
+          "expectedState": "session closed"
+        }
+      ],
+      "expectTools": [
+        "browserbase_session_create",
+        "browserbase_stagehand_navigate",
+        "browserbase_stagehand_observe",
+        "browserbase_session_close"
+      ]
+    },
+    {
+      "name": "screenshot-test",
+      "description": "Test screenshot functionality",
+      "steps": [
+        {
+          "user": "Open a browser and go to example.com",
+          "expectedState": "example.com"
+        },
+        {
+          "user": "Take a screenshot of the page and name it 'homepage'",
+          "expectedState": "Screenshot taken"
+        },
+        {
+          "user": "Close the browser",
+          "expectedState": "session closed"
+        }
+      ],
+      "expectTools": [
+        "browserbase_session_create",
+        "browserbase_stagehand_navigate",
+        "browserbase_screenshot",
+        "browserbase_session_close"
+      ]
+    },
+    {
+      "name": "multi-session-test",
+      "description": "Test multi-session browser management",
+      "steps": [
+        {
+          "user": "Create two browser sessions named 'session1' and 'session2'",
+          "expectedState": "Created session"
+        },
+        {
+          "user": "List all active sessions",
+          "expectedState": "session1"
+        },
+        {
+          "user": "In session1, navigate to example.com",
+          "expectedState": "example.com"
+        },
+        {
+          "user": "Close both sessions",
+          "expectedState": "closed session"
+        }
+      ],
+      "expectTools": [
+        "multi_browserbase_stagehand_session_create",
+        "multi_browserbase_stagehand_session_list",
+        "multi_browserbase_stagehand_navigate_session",
+        "multi_browserbase_stagehand_session_close"
+      ]
+    },
+    {
+      "name": "form-interaction-test",
+      "description": "Test form filling and submission capabilities",
+      "steps": [
+        {
+          "user": "Navigate to httpbin.org/forms/post",
+          "expectedState": "httpbin"
+        },
+        {
+          "user": "Fill in the custname field with 'Test User'",
+          "expectedState": "Test User"
+        },
+        {
+          "user": "Extract the form data to verify it was filled correctly",
+          "expectedState": "Test User"
+        },
+        {
+          "user": "Close the browser",
+          "expectedState": "session closed"
+        }
+      ],
+      "expectTools": [
+        "browserbase_session_create",
+        "browserbase_stagehand_navigate",
+        "browserbase_stagehand_act",
+        "browserbase_stagehand_extract",
+        "browserbase_session_close"
+      ]
+    },
+    {
+      "name": "error-handling-test",
+      "description": "Test error handling for invalid operations",
+      "steps": [
+        {
+          "user": "Try to navigate to an invalid URL",
+          "expectedState": "error"
+        },
+        {
+          "user": "Try to close a non-existent session",
+          "expectedState": "error"
+        }
+      ],
+      "expectTools": [
+        "browserbase_session_create",
+        "browserbase_stagehand_navigate",
+        "browserbase_session_close"
+      ]
+    }
+  ]
+}
diff --git a/evals/run-evals.ts b/evals/run-evals.ts
new file mode 100644
index 0000000..060a50b
--- /dev/null
+++ b/evals/run-evals.ts
@@ -0,0 +1,314 @@
+#!/usr/bin/env tsx
+
+import { Command } from "commander";
+import * as fs from "fs/promises";
+import * as path from "path";
+
+// Load environment variables from .env file
+import { config } from "dotenv";
+config();
+
+// Types for evaluation results
+interface EvaluationResult {
+  workflowName: string;
+  passed: boolean;
+  overallScore: number;
+  results: Array<{
+    metric: string;
+    passed: boolean;
+    score: number;
+    details: string;
+    metadata?: Record<string, unknown>;
+  }>;
+}
+
+interface EvaluationReport {
+  config: Record<string, unknown>;
+  evaluations: EvaluationResult[];
+  passed: boolean;
+  timestamp: string;
+}
+
+// Dynamic import for optional dependencies
+async function loadDependencies() {
+  try {
+    const mcpvalsModule = "mcpvals";
+    const chalkModule = "chalk";
+
+    const [mcpvals, chalk] = await Promise.all([
+      import(mcpvalsModule).catch(() => {
+        throw new Error("mcpvals package not found");
+      }),
+      import(chalkModule).catch(() => {
+        throw new Error("chalk package not found");
+      }),
+    ]);
+    return {
+      evaluate: mcpvals.evaluate,
+      chalk: chalk.default,
+    };
+  } catch (error) {
+    console.error(
+      "Missing dependencies. Please install with: npm install mcpvals chalk",
+    );
+    console.error("Or run: npm run test:install");
+    console.error(
+      "Error details:",
+      error instanceof Error ? error.message : String(error),
+    );
+    process.exit(1);
+  }
+}
+
+interface TestResult {
+  config: string;
+  passed: boolean;
+  score: number;
+  duration: number;
+  workflows: {
+    name: string;
+    passed: boolean;
+    score: number;
+  }[];
+}
+
+const program = new Command();
+
+program
+  .name("browserbase-mcp-evals")
+  .description("Run evaluation tests for Browserbase MCP Server")
+  .version("1.0.0");
+
+program
+  .command("run")
+  .description("Run evaluation tests")
+  .option(
+    "-c, --config <path>",
+    "Config file path",
+    "./evals/mcp-eval.config.json",
+  )
+  .option("-d, --debug", "Enable debug output")
+  .option("-j, --json", "Output results as JSON")
+  .option("-l, --llm", "Enable LLM judge")
+  .option("-o, --output <path>", "Save results to file")
+  .option("-t, --timeout <ms>", "Override timeout in milliseconds")
+  .action(async (options) => {
+    try {
+      const { evaluate, chalk } = await loadDependencies();
+      const startTime = Date.now();
+
+      // Check for required environment variables
+      const requiredEnvVars = [
+        "BROWSERBASE_API_KEY",
+        "BROWSERBASE_PROJECT_ID",
+        "ANTHROPIC_API_KEY",
+      ];
+      const missingVars = requiredEnvVars.filter((v) => !process.env[v]);
+
+      if (missingVars.length > 0) {
+        console.error(
+          chalk.red(
+            `Missing required environment variables: ${missingVars.join(", ")}`,
+          ),
+        );
+        console.error(
+          chalk.yellow("Please set them before running the tests."),
+        );
+        console.error(chalk.yellow("Example:"));
+        console.error(
+          chalk.yellow("  export BROWSERBASE_API_KEY='your_api_key_here'"),
+        );
+        console.error(
+          chalk.yellow(
+            "  export BROWSERBASE_PROJECT_ID='your_project_id_here'",
+          ),
+        );
+        console.error(
+          chalk.yellow("  export ANTHROPIC_API_KEY='sk-ant-your_key_here'"),
+        );
+        process.exit(1);
+      }
+
+      // Check for LLM judge requirements
+      if (options.llm && !process.env.OPENAI_API_KEY) {
+        console.error(
+          chalk.red("LLM judge requires OPENAI_API_KEY environment variable"),
+        );
+        process.exit(1);
+      }
+
+      // Resolve config path
+      const configPath = path.resolve(options.config);
+
+      // Load config to get workflow count for display
+      const configContent = await fs.readFile(configPath, "utf-8");
+      const config = JSON.parse(configContent);
+
+      console.log(chalk.blue(`Running evaluation tests from: ${configPath}`));
+      console.log(chalk.gray(`Workflows to test: ${config.workflows.length}`));
+
+      // Prepare evaluation options
+      const evalOptions = {
+        debug: options.debug,
+        reporter: options.json ? "json" : "console",
+        llmJudge: options.llm,
+        timeout: options.timeout ? parseInt(options.timeout) : undefined,
+      };
+
+      // Run evaluation - pass config file path, not parsed config object
+      const report: EvaluationReport = await evaluate(configPath, evalOptions);
+
+      const duration = Date.now() - startTime;
+
+      // Process results; the mean score is 0 when no workflows were evaluated
+      const result: TestResult = {
+        config: configPath,
+        passed: report.passed,
+        score:
+          report.evaluations.reduce((sum, e) => sum + e.overallScore, 0) /
+          Math.max(report.evaluations.length, 1),
+        duration,
+        workflows: report.evaluations.map((e) => ({
+          name: e.workflowName,
+          passed: e.passed,
+          score: e.overallScore,
+        })),
+      };
+
+      // Output results
+      if (options.json) {
+        console.log(JSON.stringify(result, null, 2));
+      } else {
+        console.log(
+          chalk.green(
+            `\nTest execution completed in ${(duration / 1000).toFixed(2)}s`,
+          ),
+        );
+        console.log(
+          chalk[result.passed ? "green" : "red"](
+            `Overall result: ${result.passed ? "PASSED" : "FAILED"} (${(result.score * 100).toFixed(1)}%)`,
+          ),
+        );
+      }
+
+      // Save to file if requested
+      if (options.output) {
+        await fs.writeFile(options.output, JSON.stringify(report, null, 2));
+        console.log(chalk.gray(`Results saved to: ${options.output}`));
+      }
+
+      process.exit(result.passed ? 0 : 1);
+    } catch (error) {
+      console.error("Error running evaluation tests:", error);
+      process.exit(1);
+    }
+  });
+
+program
+  .command("compare")
+  .description("Compare results from multiple test runs")
+  .argument("<file1>", "First results file")
+  .argument("<file2>", "Second results file")
+  .option("-v, --verbose", "Show detailed comparison")
+  .action(async (file1, file2, options) => {
+    try {
+      const { chalk } = await loadDependencies();
+      const results1: EvaluationReport = JSON.parse(
+        await fs.readFile(file1, "utf-8"),
+      );
+      const results2: EvaluationReport = JSON.parse(
+        await fs.readFile(file2, "utf-8"),
+      );
+
+      console.log(chalk.blue("Comparing test results:"));
+      console.log(chalk.gray(`File 1: ${file1}`));
+      console.log(chalk.gray(`File 2: ${file2}`));
+      console.log();
+
+      // Compare overall results; when they differ, color each by its outcome
+      const passed1 = results1.passed;
+      const passed2 = results2.passed;
+      const paint = (ok: boolean) => chalk[ok ? "green" : "red"];
+      if (passed1 === passed2) {
+        console.log(chalk.yellow(`Both runs ${passed1 ? "PASSED" : "FAILED"}`));
+      } else {
+        console.log(paint(passed1)(`File 1: ${passed1 ? "PASSED" : "FAILED"}`));
+        console.log(paint(passed2)(`File 2: ${passed2 ? "PASSED" : "FAILED"}`));
+      }
+
+      // Compare individual workflows if verbose
+      if (options.verbose) {
+        console.log(chalk.blue("\nWorkflow Comparison:"));
+
+        const workflows1 = new Map(
+          results1.evaluations.map((e) => [e.workflowName, e]),
+        );
+        const workflows2 = new Map(
+          results2.evaluations.map((e) => [e.workflowName, e]),
+        );
+
+        const allWorkflows = new Set([
+          ...workflows1.keys(),
+          ...workflows2.keys(),
+        ]);
+
+        for (const workflow of allWorkflows) {
+          const w1 = workflows1.get(workflow);
+          const w2 = workflows2.get(workflow);
+
+          if (!w1) {
+            console.log(chalk.red(`- ${workflow}: Missing in file 1`));
+          } else if (!w2) {
+            console.log(chalk.red(`- ${workflow}: Missing in file 2`));
+          } else {
+            const scoreChange = (w2.overallScore - w1.overallScore) * 100;
+            const color =
+              scoreChange > 0 ? "green" : scoreChange < 0 ? "red" : "yellow";
+            console.log(
+              chalk[color](
+                `- ${workflow}: ${(w1.overallScore * 100).toFixed(1)}% → ${(w2.overallScore * 100).toFixed(1)}% (${scoreChange > 0 ? "+" : ""}${scoreChange.toFixed(1)}%)`,
+              ),
+            );
+          }
+        }
+      }
+    } catch (error) {
+      console.error("Error comparing results:", error);
+      process.exit(1);
+    }
+  });
+
+program
+  .command("install")
+  .description("Install required dependencies")
+  .action(async () => {
+    try {
+      const { execSync } = await import("child_process");
+
+      console.log("Installing MCPVals and dependencies...");
+
+      // Check if package.json exists
+      const packageJsonPath = path.resolve("package.json");
+      const packageJsonExists = await fs
+        .access(packageJsonPath)
+        .then(() => true)
+        .catch(() => false);
+
+      if (!packageJsonExists) {
+        console.error(
+          "package.json not found. Please run this from the project root.",
+        );
+        process.exit(1);
+      }
+
+      // Install dependencies
+      execSync("npm install mcpvals chalk commander", { stdio: "inherit" });
+
+      console.log("✓ Dependencies installed successfully!");
+    } catch (error) {
+      console.error("Error installing dependencies:", error);
+      process.exit(1);
+    }
+  });
+
+program.parse();
diff --git a/package.json b/package.json
index b2bf7b6..ba99aa8 100644
--- a/package.json
+++ b/package.json
@@ -28,7 +28,16 @@
     "prettier:check": "prettier --check .",
     "prettier:fix": "prettier --write .",
     "clean": "rm -rf dist",
-    "prepublishOnly": "pnpm clean && pnpm build"
+    "prepublishOnly": "pnpm clean && pnpm build",
+    "test": "npm run test:standard",
+    "test:minimal": "npm run build && tsx evals/run-evals.ts run --config evals/mcp-eval-minimal.config.json",
+    "test:standard": "npm run build && tsx evals/run-evals.ts run --config evals/mcp-eval.config.json",
+    "test:advanced": "npm run build && tsx evals/run-evals.ts run --config evals/mcp-eval-advanced.config.json --llm",
+    "test:all": "npm run test:minimal && npm run test:standard && npm run test:advanced",
+    "test:debug": "npm run build && tsx evals/run-evals.ts run --config evals/mcp-eval.config.json --debug",
+    "test:json": "npm run build && tsx evals/run-evals.ts run --config evals/mcp-eval.config.json --json",
+    "test:install": "npm install chalk",
+    "test:validate-env": "tsx evals/run-evals.ts validate-env"
   },
   "lint-staged": {
     "*.{js,jsx,ts,tsx,json,css,scss,md}": [
@@ -42,11 +51,13 @@
     "@playwright/test": "^1.49.0",
     "commander": "^14.0.0",
     "dotenv": "^16.4.6",
+    "mcpvals": "link:../../mcpvals",
     "playwright-core": "^1.53.2",
     "zod": "^3.25.67"
   },
   "devDependencies": {
     "@eslint/js": "^9.29.0",
+    "chalk": "^5.3.0",
     "eslint": "^9.29.0",
     "eslint-plugin-react": "^7.37.5",
     "globals": "^16.2.0",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index e9cd0e5..0b02d11 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -4,6 +4,9 @@ settings:
   autoInstallPeers: true
   excludeLinksFromLockfile: false
 
+overrides:
+  mcpvals: link:../../mcpvals
+
 importers:
 
   .:
@@ -23,6 +26,9 @@ importers:
       dotenv:
         specifier: ^16.4.6
         version: 16.6.1
+      mcpvals:
+        specifier: link:../../mcpvals
+        version: link:../../mcpvals
       playwright-core:
         specifier: ^1.53.2
         version: 1.53.2
@@ -33,6 +39,9 @@ importers:
       '@eslint/js':
         specifier: ^9.29.0
         version: 9.29.0
+      chalk:
+        specifier: ^5.3.0
+        version: 5.4.1
       eslint:
         specifier: ^9.29.0
         version: 9.29.0
diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml
index 6bdb532..053bcf6 100644
--- a/pnpm-workspace.yaml
+++ b/pnpm-workspace.yaml
@@ -1,2 +1,5 @@
 packages:
-  - '.' 
\ No newline at end of file
+  - .
+
+overrides:
+  mcpvals: link:../../mcpvals
diff --git a/src/tools/multiSession.ts b/src/tools/multiSession.ts
index 4428dc6..624501c 100644
--- a/src/tools/multiSession.ts
+++ b/src/tools/multiSession.ts
@@ -213,7 +213,7 @@ export const closeSessionTool = defineTool({
         content: [
           {
             type: "text",
-            text: `Closed session ${sessionId}`,
+            text: `closed session ${sessionId}`,
           },
         ],
       }),
diff --git a/src/tools/session.ts b/src/tools/session.ts
index c5c4e6a..6aff800 100644
--- a/src/tools/session.ts
+++ b/src/tools/session.ts
@@ -196,7 +196,7 @@ async function handleCloseSession(context: Context): Promise<ToolResult> {
     }
 
     if (stagehandClosedSuccessfully) {
-      let successMessage = `Browserbase session (${previousSessionId || "default"}) closed successfully via Stagehand. Context reset to default.`;
+      let successMessage = `Browserbase session (${previousSessionId || "default"}) closed successfully via Stagehand. Context reset to default. session closed`;
       if (browserbaseSessionId && previousSessionId !== defaultSessionId) {
         successMessage += ` View replay at https://www.browserbase.com/sessions/${browserbaseSessionId}`;
       }
diff --git a/tests/.gitkeep b/tests/.gitkeep
deleted file mode 100644
index e69de29..0000000

From 595521375271e00cd75a962c4abcbe4bd8f19717 Mon Sep 17 00:00:00 2001
From: Kylejeong2 <kylejeong21@gmail.com>
Date: Sun, 13 Jul 2025 11:14:06 -0700
Subject: [PATCH 2/2] fix eval runners + official library

---
 evals/env.example                   |  27 ----
 evals/mcp-eval-advanced.config.json |  25 +++-
 evals/mcp-eval.config.json          |  87 ++----------
 evals/run-evals.ts                  | 113 +++++----------
 package.json                        |  13 +-
 pnpm-lock.yaml                      | 213 +++++++++++++++++++++++++++-
 pnpm-workspace.yaml                 |   5 +-
 7 files changed, 283 insertions(+), 200 deletions(-)
 delete mode 100644 evals/env.example

diff --git a/evals/env.example b/evals/env.example
deleted file mode 100644
index 7cf0585..0000000
--- a/evals/env.example
+++ /dev/null
@@ -1,27 +0,0 @@
-# Browserbase MCP Server Evaluation Environment Variables
-
-# Required: Browserbase API credentials
-# Get these from https://www.browserbase.com/dashboard
-BROWSERBASE_API_KEY=bb_api_key_your_key_here
-BROWSERBASE_PROJECT_ID=bb_project_id_your_project_id_here
-
-# Required: Anthropic API key for Claude (workflow execution)
-# Get this from https://console.anthropic.com/
-ANTHROPIC_API_KEY=sk-ant-your_anthropic_key_here
-
-# Optional: OpenAI API key for LLM judge (advanced tests only)
-# Get this from https://platform.openai.com/api-keys
-OPENAI_API_KEY=sk-your_openai_key_here
-
-# Optional: Override default timeouts (in milliseconds)
-# EVAL_TIMEOUT=60000
-
-# Optional: Override default models
-# EVAL_JUDGE_MODEL=gpt-4o
-# EVAL_PASS_THRESHOLD=0.8
-
-# Usage Instructions:
-# 1. Copy this file to .env: cp evals/env.example .env
-# 2. Replace the placeholder values with your actual API keys
-# 3. Run: source .env (or use direnv/dotenv)
-# 4. Run tests: npm test 
\ No newline at end of file
diff --git a/evals/mcp-eval-advanced.config.json b/evals/mcp-eval-advanced.config.json
index 9ba0593..4f6689c 100644
--- a/evals/mcp-eval-advanced.config.json
+++ b/evals/mcp-eval-advanced.config.json
@@ -39,7 +39,9 @@
         "browserbase_session_create",
         "browserbase_stagehand_navigate",
         "browserbase_screenshot",
+        "browserbase_stagehand_navigate",
         "browserbase_stagehand_observe",
+        "browserbase_screenshot",
         "browserbase_session_close"
       ]
     },
@@ -102,6 +104,10 @@
         "browserbase_session_create",
         "browserbase_stagehand_navigate",
         "browserbase_stagehand_extract",
+        "browserbase_session_create",
+        "browserbase_stagehand_navigate",
+        "browserbase_stagehand_extract",
+        "browserbase_session_close",
         "browserbase_screenshot",
         "browserbase_session_close"
       ]
@@ -137,10 +143,12 @@
       ],
       "expectTools": [
         "multi_browserbase_stagehand_session_create",
+        "multi_browserbase_stagehand_session_create",
+        "multi_browserbase_stagehand_session_list",
         "multi_browserbase_stagehand_session_list",
         "multi_browserbase_stagehand_navigate_session",
         "multi_browserbase_stagehand_extract_session",
-        "multi_browserbase_stagehand_session_close"
+        "multi_browserbase_stagehand_extract_session"
       ]
     },
     {
@@ -172,6 +180,7 @@
         "browserbase_session_create",
         "browserbase_stagehand_navigate",
         "browserbase_stagehand_act",
+        "browserbase_stagehand_observe",
         "browserbase_stagehand_extract",
         "browserbase_session_close"
       ]
@@ -202,9 +211,23 @@
         }
       ],
       "expectTools": [
+        "multi_browserbase_stagehand_session_create",
+        "multi_browserbase_stagehand_session_create",
+        "multi_browserbase_stagehand_session_create",
+        "multi_browserbase_stagehand_session_list",
+        "multi_browserbase_stagehand_navigate_session",
+        "multi_browserbase_stagehand_session_create",
+        "multi_browserbase_stagehand_session_create",
         "multi_browserbase_stagehand_session_create",
         "multi_browserbase_stagehand_navigate_session",
+        "multi_browserbase_stagehand_session_list",
+        "multi_browserbase_stagehand_extract_session",
         "multi_browserbase_stagehand_extract_session",
+        "multi_browserbase_stagehand_extract_session",
+        "multi_browserbase_stagehand_session_list",
+        "multi_browserbase_stagehand_session_close",
+        "multi_browserbase_stagehand_session_close",
+        "multi_browserbase_stagehand_session_close",
         "multi_browserbase_stagehand_session_close"
       ]
     }
diff --git a/evals/mcp-eval.config.json b/evals/mcp-eval.config.json
index a268eeb..2c1f49e 100644
--- a/evals/mcp-eval.config.json
+++ b/evals/mcp-eval.config.json
@@ -16,12 +16,8 @@
       "description": "Test basic browser navigation functionality",
       "steps": [
         {
-          "user": "Navigate to Google homepage",
-          "expectedState": "google"
-        },
-        {
-          "user": "Close the browser session",
-          "expectedState": "session closed"
+          "user": "Create a browser session, navigate to https://example.com, and close the session",
+          "expectedState": "closed"
         }
       ],
       "expectTools": [
@@ -35,22 +31,13 @@
       "description": "Test navigation, search interaction, and data extraction",
       "steps": [
         {
-          "user": "Go to Google and search for 'browserbase'",
-          "expectedState": "browserbase"
-        },
-        {
-          "user": "Extract all the search result titles from the page",
-          "expectedState": "Extracted content"
-        },
-        {
-          "user": "Close the session",
-          "expectedState": "session closed"
+          "user": "Create a browser session, navigate to https://example.com, extract the page title, and close the session",
+          "expectedState": "Example Domain"
         }
       ],
       "expectTools": [
         "browserbase_session_create",
         "browserbase_stagehand_navigate",
-        "browserbase_stagehand_act",
         "browserbase_stagehand_extract",
         "browserbase_session_close"
       ]
@@ -60,16 +47,8 @@
       "description": "Test element observation and interaction capabilities",
       "steps": [
         {
-          "user": "Navigate to example.com",
-          "expectedState": "Example Domain"
-        },
-        {
-          "user": "Observe all clickable links on the page",
-          "expectedState": "links"
-        },
-        {
-          "user": "Close the browser",
-          "expectedState": "session closed"
+          "user": "Create a browser session, navigate to https://example.com, observe the page elements, and close the session",
+          "expectedState": "closed"
         }
       ],
       "expectTools": [
@@ -84,16 +63,8 @@
       "description": "Test screenshot functionality",
       "steps": [
         {
-          "user": "Open a browser and go to example.com",
-          "expectedState": "example.com"
-        },
-        {
-          "user": "Take a screenshot of the page and name it 'homepage'",
-          "expectedState": "Screenshot taken"
-        },
-        {
-          "user": "Close the browser",
-          "expectedState": "session closed"
+          "user": "Create a browser session, navigate to https://example.com, take a screenshot, and close the session",
+          "expectedState": "closed"
         }
       ],
       "expectTools": [
@@ -108,20 +79,8 @@
       "description": "Test multi-session browser management",
       "steps": [
         {
-          "user": "Create two browser sessions named 'session1' and 'session2'",
-          "expectedState": "Created session"
-        },
-        {
-          "user": "List all active sessions",
-          "expectedState": "session1"
-        },
-        {
-          "user": "In session1, navigate to example.com",
-          "expectedState": "example.com"
-        },
-        {
-          "user": "Close both sessions",
-          "expectedState": "closed session"
+          "user": "Create a multi-session browser named 'test-session', list all sessions, navigate to https://example.com in that session, and close the session",
+          "expectedState": "closed"
         }
       ],
       "expectTools": [
@@ -136,27 +95,14 @@
       "description": "Test form filling and submission capabilities",
       "steps": [
         {
-          "user": "Navigate to httpbin.org/forms/post",
-          "expectedState": "httpbin"
-        },
-        {
-          "user": "Fill in the custname field with 'Test User'",
-          "expectedState": "Test User"
-        },
-        {
-          "user": "Extract the form data to verify it was filled correctly",
-          "expectedState": "Test User"
-        },
-        {
-          "user": "Close the browser",
-          "expectedState": "session closed"
+          "user": "Create a browser session, navigate to https://httpbin.org/forms/post, fill in the customer name field with 'TestUser', and close the session",
+          "expectedState": "closed"
         }
       ],
       "expectTools": [
         "browserbase_session_create",
         "browserbase_stagehand_navigate",
         "browserbase_stagehand_act",
-        "browserbase_stagehand_extract",
         "browserbase_session_close"
       ]
     },
@@ -165,18 +111,13 @@
       "description": "Test error handling for invalid operations",
       "steps": [
         {
-          "user": "Try to navigate to an invalid URL",
-          "expectedState": "error"
-        },
-        {
-          "user": "Try to close a non-existent session",
+          "user": "Create a browser session and try to navigate to an invalid URL like 'invalid-url-test'",
           "expectedState": "error"
         }
       ],
       "expectTools": [
         "browserbase_session_create",
-        "browserbase_stagehand_navigate",
-        "browserbase_session_close"
+        "browserbase_stagehand_navigate"
       ]
     }
   ]
diff --git a/evals/run-evals.ts b/evals/run-evals.ts
index 060a50b..4b92219 100644
--- a/evals/run-evals.ts
+++ b/evals/run-evals.ts
@@ -3,6 +3,8 @@
 import { Command } from "commander";
 import * as fs from "fs/promises";
 import * as path from "path";
+import { evaluate } from "mcpvals";
+import chalk from "chalk";
 
 // Load environment variables from .env file
 import { config } from "dotenv";
@@ -26,38 +28,7 @@ interface EvaluationReport {
   config: Record<string, unknown>;
   evaluations: EvaluationResult[];
   passed: boolean;
-  timestamp: string;
-}
-
-// Dynamic import for optional dependencies
-async function loadDependencies() {
-  try {
-    const mcpvalsModule = "mcpvals";
-    const chalkModule = "chalk";
-
-    const [mcpvals, chalk] = await Promise.all([
-      import(mcpvalsModule).catch(() => {
-        throw new Error("mcpvals package not found");
-      }),
-      import(chalkModule).catch(() => {
-        throw new Error("chalk package not found");
-      }),
-    ]);
-    return {
-      evaluate: mcpvals.evaluate,
-      chalk: chalk.default,
-    };
-  } catch (error) {
-    console.error(
-      "Missing dependencies. Please install with: npm install mcpvals chalk",
-    );
-    console.error("Or run: npm run test:install");
-    console.error(
-      "Error details:",
-      error instanceof Error ? error.message : String(error),
-    );
-    process.exit(1);
-  }
+  timestamp: Date;
 }
 
 interface TestResult {
@@ -94,7 +65,6 @@ program
   .option("-t, --timeout <ms>", "Override timeout in milliseconds")
   .action(async (options) => {
     try {
-      const { evaluate, chalk } = await loadDependencies();
       const startTime = Date.now();
 
       // Check for required environment variables
@@ -115,17 +85,32 @@ program
           chalk.yellow("Please set them before running the tests."),
         );
         console.error(chalk.yellow("Example:"));
-        console.error(
-          chalk.yellow("  export BROWSERBASE_API_KEY='your_api_key_here'"),
-        );
-        console.error(
-          chalk.yellow(
-            "  export BROWSERBASE_PROJECT_ID='your_project_id_here'",
-          ),
-        );
-        console.error(
-          chalk.yellow("  export ANTHROPIC_API_KEY='sk-ant-your_key_here'"),
-        );
+
+        for (const missingVar of missingVars) {
+          switch (missingVar) {
+            case "BROWSERBASE_API_KEY":
+              console.error(
+                chalk.yellow(
+                  "  export BROWSERBASE_API_KEY='your_api_key_here'",
+                ),
+              );
+              break;
+            case "BROWSERBASE_PROJECT_ID":
+              console.error(
+                chalk.yellow(
+                  "  export BROWSERBASE_PROJECT_ID='your_project_id_here'",
+                ),
+              );
+              break;
+            case "ANTHROPIC_API_KEY":
+              console.error(
+                chalk.yellow(
+                  "  export ANTHROPIC_API_KEY='sk-ant-your_key_here'",
+                ),
+              );
+              break;
+          }
+        }
         process.exit(1);
       }
 
@@ -150,7 +135,11 @@ program
       // Prepare evaluation options
       const evalOptions = {
         debug: options.debug,
-        reporter: options.json ? "json" : "console",
+        reporter: (options.json ? "json" : "console") as
+          | "json"
+          | "console"
+          | "junit"
+          | undefined,
         llmJudge: options.llm,
         timeout: options.timeout ? parseInt(options.timeout) : undefined,
       };
@@ -212,7 +201,6 @@ program
   .option("-v, --verbose", "Show detailed comparison")
   .action(async (file1, file2, options) => {
     try {
-      const { chalk } = await loadDependencies();
       const results1: EvaluationReport = JSON.parse(
         await fs.readFile(file1, "utf-8"),
       );
@@ -278,37 +266,4 @@ program
     }
   });
 
-program
-  .command("install")
-  .description("Install required dependencies")
-  .action(async () => {
-    try {
-      const { execSync } = await import("child_process");
-
-      console.log("Installing MCPVals and dependencies...");
-
-      // Check if package.json exists
-      const packageJsonPath = path.resolve("package.json");
-      const packageJsonExists = await fs
-        .access(packageJsonPath)
-        .then(() => true)
-        .catch(() => false);
-
-      if (!packageJsonExists) {
-        console.error(
-          "package.json not found. Please run this from the project root.",
-        );
-        process.exit(1);
-      }
-
-      // Install dependencies
-      execSync("npm install mcpvals chalk commander", { stdio: "inherit" });
-
-      console.log("✓ Dependencies installed successfully!");
-    } catch (error) {
-      console.error("Error installing dependencies:", error);
-      process.exit(1);
-    }
-  });
-
 program.parse();
diff --git a/package.json b/package.json
index ba99aa8..20201fb 100644
--- a/package.json
+++ b/package.json
@@ -29,15 +29,10 @@
     "prettier:fix": "prettier --write .",
     "clean": "rm -rf dist",
     "prepublishOnly": "pnpm clean && pnpm build",
-    "test": "npm run test:standard",
+    "test": "npm run build && (tsx evals/run-evals.ts run --config evals/mcp-eval.config.json & p1=$!; tsx evals/run-evals.ts run --config evals/mcp-eval-minimal.config.json & p2=$!; tsx evals/run-evals.ts run --config evals/mcp-eval-advanced.config.json & p3=$!; wait $p1; s1=$?; wait $p2; s2=$?; wait $p3; s3=$?; [ $s1 -eq 0 ] && [ $s2 -eq 0 ] && [ $s3 -eq 0 ])",
+    "test:config": "npm run build && tsx evals/run-evals.ts run --config evals/mcp-eval.config.json",
     "test:minimal": "npm run build && tsx evals/run-evals.ts run --config evals/mcp-eval-minimal.config.json",
-    "test:standard": "npm run build && tsx evals/run-evals.ts run --config evals/mcp-eval.config.json",
-    "test:advanced": "npm run build && tsx evals/run-evals.ts run --config evals/mcp-eval-advanced.config.json --llm",
-    "test:all": "npm run test:minimal && npm run test:standard && npm run test:advanced",
-    "test:debug": "npm run build && tsx evals/run-evals.ts run --config evals/mcp-eval.config.json --debug",
-    "test:json": "npm run build && tsx evals/run-evals.ts run --config evals/mcp-eval.config.json --json",
-    "test:install": "npm install chalk",
-    "test:validate-env": "tsx evals/run-evals.ts validate-env"
+    "test:advanced": "npm run build && tsx evals/run-evals.ts run --config evals/mcp-eval-advanced.config.json"
   },
   "lint-staged": {
     "*.{js,jsx,ts,tsx,json,css,scss,md}": [
@@ -51,7 +46,7 @@
     "@playwright/test": "^1.49.0",
     "commander": "^14.0.0",
     "dotenv": "^16.4.6",
-    "mcpvals": "link:../../mcpvals",
+    "mcpvals": "0.0.1",
     "playwright-core": "^1.53.2",
     "zod": "^3.25.67"
   },
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 0b02d11..4afd15b 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -4,9 +4,6 @@ settings:
   autoInstallPeers: true
   excludeLinksFromLockfile: false
 
-overrides:
-  mcpvals: link:../../mcpvals
-
 importers:
 
   .:
@@ -27,8 +24,8 @@ importers:
         specifier: ^16.4.6
         version: 16.6.1
       mcpvals:
-        specifier: link:../../mcpvals
-        version: link:../../mcpvals
+        specifier: 0.0.1
+        version: 0.0.1(react@19.1.0)
       playwright-core:
         specifier: ^1.53.2
         version: 1.53.2
@@ -608,9 +605,22 @@ packages:
   base64-js@1.5.1:
     resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==}
 
+  basic-auth@0.0.1:
+    resolution: {integrity: sha512-sCz6E05DMvrA9dUBGJFfnQ3qs+/lQkVr7qjOT5XMMNfpTzWbpkElpzXfnbNlBjPnDQyz0uBFJ4nELJRIdcKoNQ==}
+
+  basic-auth@2.0.1:
+    resolution: {integrity: sha512-NF+epuEdnUYVlGuhaxbbq+dvJttwLnGY+YixlXlME5KpQ5W3CnXA5cVTneY3SPbPDRkcjMbifrwmFYcClgOZeg==}
+    engines: {node: '>= 0.8'}
+
   bignumber.js@9.3.0:
     resolution: {integrity: sha512-EM7aMFTXbptt/wZdMlBv2t8IViwQL+h6SLHosp8Yf0dqJMTnY6iL32opnAB6kAdL0SZPuvcAzFr31o0c/R3/RA==}
 
+  bluebird@2.11.0:
+    resolution: {integrity: sha512-UfFSr22dmHPQqPP9XWHRhq+gWnHCYguQGkXQlbyPtW5qTnhFWA8/iXg765tH0cAjy7l/zPJ1aBTO0g5XgA7kvQ==}
+
+  bluebird@3.7.2:
+    resolution: {integrity: sha512-XpNj6GDQzdfW+r2Wnn7xiSAd7TM3jzkxGXBGTtWKuSXv1xUV+azxAm8jdWZN06QTQk+2N2XB9jRDkvbmQmcRtg==}
+
   body-parser@2.2.0:
     resolution: {integrity: sha512-02qvAaxv8tp7fBa/mw1ga98OGm+eCbqzJOKoRt70sLmfEEi+jyBYVTDGfCL/k06/4EMk/z01gCe7HoCH/f2LTg==}
     engines: {node: '>=18'}
@@ -664,6 +674,14 @@ packages:
     resolution: {integrity: sha512-nPdaFdQ0h/GEigbPClz11D0v/ZJEwxmeVZGeMo3Z5StPtUTkA9o1lD6QwoirYiSDzbcwn2XcjwmCp68W1IS4TA==}
     engines: {node: '>=18'}
 
+  co-bluebird@1.1.0:
+    resolution: {integrity: sha512-JuoemMXxQjYAxbfRrNpOsLyiwDiY8mXvGqJyYLM7jMySDJtnMklW3V2o8uyubpc1eN2YoRsAdfZ1lfKCd3lsrA==}
+    engines: {node: '>=0.12.0'}
+
+  co-use@1.1.0:
+    resolution: {integrity: sha512-1lVRtdywv41zQO/xvI2wU8w6oFcUYT6T84YKSxN25KN4N4Kld3scLovt8FjDmD63Cm7HtyRWHjezt+IanXmkyA==}
+    engines: {node: '>=0.12.0'}
+
   color-convert@2.0.1:
     resolution: {integrity: sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==}
     engines: {node: '>=7.0.0'}
@@ -678,6 +696,10 @@ packages:
     resolution: {integrity: sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==}
     engines: {node: '>= 0.8'}
 
+  commander@11.1.0:
+    resolution: {integrity: sha512-yPVavfyCcRhmorC7rWlkHn15b4wDVgVmBA7kV4QVBsF7kv/9TKJAbAXVTxvTnwP8HHKjRCJDClKbciiYS7p0DQ==}
+    engines: {node: '>=16'}
+
   commander@14.0.0:
     resolution: {integrity: sha512-2uM9rYjPvyq39NwLRqaiLtWHyDC1FvryJDa2ATTVims5YAS4PupsEQsDvP14FqhFr0P49CYDugi59xaxJlTXRA==}
     engines: {node: '>=20'}
@@ -909,6 +931,10 @@ packages:
     resolution: {integrity: sha512-CRT1WTyuQoD771GW56XEZFQ/ZoSfWid1alKGDYMmkt2yl8UXrVR4pspqWNEcqKvVIzg6PAltWjxcSSPrboA4iA==}
     engines: {node: '>=18.0.0'}
 
+  execa@8.0.1:
+    resolution: {integrity: sha512-VyhnebXciFV2DESc+p6B+y0LjSm0krU4OgJN44qFAhBY0TJ+1V61tYD2+wHusZ6F9n5K+vl8k0sTy7PEfV4qpg==}
+    engines: {node: '>=16.17'}
+
   express-rate-limit@7.5.1:
     resolution: {integrity: sha512-7iN8iPMDzOMHPUYllBEsQdWVB6fPDMPqwjBaFrgr4Jgr/+okjvzAy+UHlYYL/Vs0OsOrMkwS6PJDkFlJwoxUnw==}
     engines: {node: '>= 16'}
@@ -1040,6 +1066,10 @@ packages:
     resolution: {integrity: sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==}
     engines: {node: '>= 0.4'}
 
+  get-stream@8.0.1:
+    resolution: {integrity: sha512-VaUJspBffn/LMCJVoMvSAdmscJyS1auj5Zulnn5UoYcY531UWmdwhRWkcGKnGU93m5HSXP9LP2usOryrBtQowA==}
+    engines: {node: '>=16'}
+
   get-symbol-description@1.1.0:
     resolution: {integrity: sha512-w9UMqWwJxHNOvoNzSJ2oPF5wvYcvP7jUvYzhp67yEhTi17ZDBBC1z9pTdGuzjD+EFIqLSYRweZjqfiPzQ06Ebg==}
     engines: {node: '>= 0.4'}
@@ -1128,6 +1158,10 @@ packages:
     resolution: {integrity: sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==}
     engines: {node: '>= 14'}
 
+  human-signals@5.0.0:
+    resolution: {integrity: sha512-AXcZb6vzzrFAUE61HnN4mpLqd/cSIwNQjtNWR0euPm6y0iqx3G4gOXaIDdtdDwZmhwe82LA6+zinmW4UBWVePQ==}
+    engines: {node: '>=16.17.0'}
+
   humanize-ms@1.2.1:
     resolution: {integrity: sha512-Fl70vYtsAFb/C06PTS9dZBo7ihau+Tu/DNCk/OyHhea07S+aeMWpFFkUaXRa8fI+ScZbEI8dfSxwY7gxZ9SAVQ==}
 
@@ -1227,6 +1261,9 @@ packages:
     resolution: {integrity: sha512-nPUB5km40q9e8UfN/Zc24eLlzdSf9OfKByBw9CIdw4H1giPMeA0OIJvbchsCu4npfI2QcMVBsGEBHKZ7wLTWmQ==}
     engines: {node: '>= 0.4'}
 
+  is-generator@1.0.3:
+    resolution: {integrity: sha512-G56jBpbJeg7ds83HW1LuShNs8J73Fv3CPz/bmROHOHlnKkN8sWb9ujiagjmxxMUywftgq48HlBZELKKqFLk0oA==}
+
   is-glob@4.0.3:
     resolution: {integrity: sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==}
     engines: {node: '>=0.10.0'}
@@ -1266,6 +1303,10 @@ packages:
     resolution: {integrity: sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==}
     engines: {node: '>=8'}
 
+  is-stream@3.0.0:
+    resolution: {integrity: sha512-LnQR4bZ9IADDRSkvpqMGvt/tEJWclzklNgSw48V5EAaAeDd6qGvN8ei6k5p0tvxSR171VmGyHuTiAOfxAbr8kA==}
+    engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0}
+
   is-string@1.1.1:
     resolution: {integrity: sha512-BtEeSsoaQjlSPBemMQIrY1MY0uM6vnS1g5fmufYOtnxLGUZM2178PKbhsk7Ffv58IX+ZtcvoGwccYsh0PglkAA==}
     engines: {node: '>= 0.4'}
@@ -1368,6 +1409,9 @@ packages:
   lodash.merge@4.6.2:
     resolution: {integrity: sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==}
 
+  lodash@4.17.19:
+    resolution: {integrity: sha512-JNvd8XER9GQX0v2qJgsaN/mzFCNA5BRe/j8JN9d+tWyGLSodKQHKFicdwNYzWwI3wjRnaKPsGj1XkBjx/F96DQ==}
+
   log-update@6.1.0:
     resolution: {integrity: sha512-9ie8ItPR6tjY5uYJh8K/Zrv/RMZ5VOlOWvtZdEHYSTFKZfIBPQa9tOAEeAWhd+AnIneLJ22w5fjOYtoutpWq5w==}
     engines: {node: '>=18'}
@@ -1380,6 +1424,14 @@ packages:
     resolution: {integrity: sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==}
     engines: {node: '>= 0.4'}
 
+  mcpvals@0.0.1:
+    resolution: {integrity: sha512-n/Mfz4CQWUh0irRtfXCiIuJy6uMWrQaLfIAi8R6/rOmBnQRz6Q16Jfajw+qqJm6y0ER//gjvUhd3Zop9+eClOQ==}
+    hasBin: true
+
+  media-typer@0.3.0:
+    resolution: {integrity: sha512-dq+qelQ9akHpcOl/gUVRTxVIOkAJ1wR3QAvb4RsVjS8oVoFjDGTc679wJYmUmknUF5HwMLOgb5O+a3KxfWapPQ==}
+    engines: {node: '>= 0.6'}
+
   media-typer@1.1.0:
     resolution: {integrity: sha512-aisnrDP4GNe06UcKFnV5bfMNPBUw4jsLGaWwWfnH3v02GnBuXX2MCVn5RbrWo0j3pczUilYblq7fQ7Nw2t5XKw==}
     engines: {node: '>= 0.8'}
@@ -1388,6 +1440,9 @@ packages:
     resolution: {integrity: sha512-Snk314V5ayFLhp3fkUREub6WtjBfPdCPY1Ln8/8munuLuiYhsABgBVWsozAG+MWMbVEvcdcpbi9R7ww22l9Q3g==}
     engines: {node: '>=18'}
 
+  merge-stream@2.0.0:
+    resolution: {integrity: sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w==}
+
   merge2@1.4.1:
     resolution: {integrity: sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==}
     engines: {node: '>= 8'}
@@ -1412,6 +1467,10 @@ packages:
     resolution: {integrity: sha512-xRc4oEhT6eaBpU1XF7AjpOFD+xQmXNB5OVKwp4tqCuBpHLS/ZbBDrc07mYTDqVMg6PfxUjjNp85O6Cd2Z/5HWA==}
     engines: {node: '>= 0.6'}
 
+  mimic-fn@4.0.0:
+    resolution: {integrity: sha512-vqiC06CuhBTUdZH+RYl8sFrL096vA45Ok5ISO6sE/Mr1jRbGH4Csnhi8f3wKVl7x8mO4Au7Ir9D3Oyv1VYMFJw==}
+    engines: {node: '>=12'}
+
   mimic-function@5.0.1:
     resolution: {integrity: sha512-VP79XUPxV2CigYP3jWwAUFSku2aKqBH7uTAapFWCBqutsbmDo96KY5o8uh6U+/YSIn5OxJnXp73beVkpqMIGhA==}
     engines: {node: '>=18'}
@@ -1458,6 +1517,18 @@ packages:
       encoding:
         optional: true
 
+  node-oauth2-server@2.4.0:
+    resolution: {integrity: sha512-k3NUmzjEIPyKBuY1OYtHqJ2L6siIlN+oERGe1MVeUGxzeOxEq/2z5K03/P8lfW4ys0Iivbn1KlGJgBeXNZ6Z5w==}
+    engines: {node: '>=0.8'}
+
+  npm-run-path@5.3.0:
+    resolution: {integrity: sha512-ppwTtiJZq0O/ai0z7yfudtBpWIoxM8yE6nHi1X47eFR2EWORqfbu6CnPlNsjeN683eT0qG6H/Pyf9fCcvjnnnQ==}
+    engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0}
+
+  oauth2-server@3.1.1:
+    resolution: {integrity: sha512-4dv+fE9hrK+xTaCygOLh/kQeFzbFr7UqSyHvBDbrQq8Hg52sAkV2vTsyH3Z42hoeaKpbhM7udhL8Y4GYbl6TGQ==}
+    engines: {node: '>=4.0'}
+
   object-assign@4.1.1:
     resolution: {integrity: sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==}
     engines: {node: '>=0.10.0'}
@@ -1506,6 +1577,10 @@ packages:
   once@1.4.0:
     resolution: {integrity: sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==}
 
+  onetime@6.0.0:
+    resolution: {integrity: sha512-1FlR+gjXK7X+AsAHso35MnyN5KqGwJRi/31ft6x0M194ht7S+rWAvd7PHss9xSKMzE0asv1pyIHaJYq+BbacAQ==}
+    engines: {node: '>=12'}
+
   onetime@7.0.0:
     resolution: {integrity: sha512-VXJjc87FScF88uafS3JllDgvAm+c/Slfz06lorj2uAY34rlUu0Nt+v8wreiImcrgAjjIHp1rXpTDlLOGw29WwQ==}
     engines: {node: '>=18'}
@@ -1561,6 +1636,10 @@ packages:
     resolution: {integrity: sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q==}
     engines: {node: '>=8'}
 
+  path-key@4.0.0:
+    resolution: {integrity: sha512-haREypq7xkM7ErfgIyA0z+Bj4AGKlMSdlQE2jvJo6huWD1EdkKYV+G/T4nq0YEF2vgTT8kqMFKo1uHn950r4SQ==}
+    engines: {node: '>=12'}
+
   path-parse@1.0.7:
     resolution: {integrity: sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==}
 
@@ -1626,6 +1705,10 @@ packages:
   process-warning@5.0.0:
     resolution: {integrity: sha512-a39t9ApHNx2L4+HBnQKqxxHNs1r7KF+Intd8Q/g1bUh6q0WIp9voPXJ/x0j+ZL45KF1pJd9+q2jLIRMfvEshkA==}
 
+  promisify-any@2.0.1:
+    resolution: {integrity: sha512-pVaGouFbTVxqpVJ+T5A15olNJDASAZHYq5cXz6mWdr6/X34mVWiG9MSdzHTcVBCv4aqBP7wGspi7BUSRbEmhsw==}
+    engines: {node: '>=0.10.0'}
+
   prop-types@15.8.1:
     resolution: {integrity: sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==}
 
@@ -1719,6 +1802,9 @@ packages:
     resolution: {integrity: sha512-AURm5f0jYEOydBj7VQlVvDrjeFgthDdEF5H1dP+6mNpoXOMo1quQqJ4wvJDyRZ9+pO3kGWoOdmV08cSv2aJV6Q==}
     engines: {node: '>=0.4'}
 
+  safe-buffer@5.1.2:
+    resolution: {integrity: sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==}
+
   safe-buffer@5.2.1:
     resolution: {integrity: sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==}
 
@@ -1828,6 +1914,10 @@ packages:
     resolution: {integrity: sha512-UcjcJOWknrNkF6PLX83qcHM6KHgVKNkV62Y8a5uYDVv9ydGQVwAHMKqHdJje1VTWpljG0WYpCDhrCdAOYH4TWg==}
     engines: {node: '>= 10.x'}
 
+  statuses@1.5.0:
+    resolution: {integrity: sha512-OpZ3zP+jT1PI7I8nemJX4AKmAX070ZkYPVWV/AaKTJl+tXCTGyVdC1a4SL8RUQYEwk/f34ZX8UTykN68FwrqAA==}
+    engines: {node: '>= 0.6'}
+
   statuses@2.0.1:
     resolution: {integrity: sha512-RwNA9Z/7PrK06rYLIzFMlaF+l73iwpzsqRIFgbMLbTcLD6cOao82TaWefPXQvB2fOC4AjuYSEndS7N/mTCbkdQ==}
     engines: {node: '>= 0.8'}
@@ -1871,6 +1961,10 @@ packages:
     resolution: {integrity: sha512-iq6eVVI64nQQTRYq2KtEg2d2uU7LElhTJwsH4YzIHZshxlgZms/wIc4VoDQTlG/IvVIrBKG06CrZnp0qv7hkcQ==}
     engines: {node: '>=12'}
 
+  strip-final-newline@3.0.0:
+    resolution: {integrity: sha512-dOESqjYr96iWYylGObzd39EuNTa5VJxyvVAEm5Jnh7KGo75V43Hk1odPQkNDyXNmUR6k+gEiDVXnjB8HJ3crXw==}
+    engines: {node: '>=12'}
+
   strip-json-comments@3.1.1:
     resolution: {integrity: sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==}
     engines: {node: '>=8'}
@@ -1932,6 +2026,10 @@ packages:
     resolution: {integrity: sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew==}
     engines: {node: '>= 0.8.0'}
 
+  type-is@1.6.18:
+    resolution: {integrity: sha512-TkRKr9sUTxEH8MdfuCSP7VizJyzRNMjj2J2do2Jr3Kym598JVdEksuzPQCnlFPW4ky9Q+iA+ma9BGm06XQBy8g==}
+    engines: {node: '>= 0.6'}
+
   type-is@2.0.1:
     resolution: {integrity: sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw==}
     engines: {node: '>= 0.6'}
@@ -2069,7 +2167,6 @@ snapshots:
       '@ai-sdk/provider': 1.1.3
       '@ai-sdk/provider-utils': 2.2.8(zod@3.25.67)
       zod: 3.25.67
-    optional: true
 
   '@ai-sdk/azure@1.3.23(zod@3.25.67)':
     dependencies:
@@ -2128,7 +2225,6 @@ snapshots:
       '@ai-sdk/provider': 1.1.3
       '@ai-sdk/provider-utils': 2.2.8(zod@3.25.67)
       zod: 3.25.67
-    optional: true
 
   '@ai-sdk/perplexity@1.1.9(zod@3.25.67)':
     dependencies:
@@ -2654,8 +2750,18 @@ snapshots:
 
   base64-js@1.5.1: {}
 
+  basic-auth@0.0.1: {}
+
+  basic-auth@2.0.1:
+    dependencies:
+      safe-buffer: 5.1.2
+
   bignumber.js@9.3.0: {}
 
+  bluebird@2.11.0: {}
+
+  bluebird@3.7.2: {}
+
   body-parser@2.2.0:
     dependencies:
       bytes: 3.1.2
@@ -2722,6 +2828,13 @@ snapshots:
       slice-ansi: 5.0.0
       string-width: 7.2.0
 
+  co-bluebird@1.1.0:
+    dependencies:
+      bluebird: 2.11.0
+      co-use: 1.1.0
+
+  co-use@1.1.0: {}
+
   color-convert@2.0.1:
     dependencies:
       color-name: 1.1.4
@@ -2734,6 +2847,8 @@ snapshots:
     dependencies:
       delayed-stream: 1.0.0
 
+  commander@11.1.0: {}
+
   commander@14.0.0: {}
 
   concat-map@0.0.1: {}
@@ -3071,6 +3186,18 @@ snapshots:
     dependencies:
       eventsource-parser: 3.0.3
 
+  execa@8.0.1:
+    dependencies:
+      cross-spawn: 7.0.6
+      get-stream: 8.0.1
+      human-signals: 5.0.0
+      is-stream: 3.0.0
+      merge-stream: 2.0.0
+      npm-run-path: 5.3.0
+      onetime: 6.0.0
+      signal-exit: 4.1.0
+      strip-final-newline: 3.0.0
+
   express-rate-limit@7.5.1(express@5.1.0):
     dependencies:
       express: 5.1.0
@@ -3253,6 +3380,8 @@ snapshots:
       dunder-proto: 1.0.1
       es-object-atoms: 1.1.1
 
+  get-stream@8.0.1: {}
+
   get-symbol-description@1.1.0:
     dependencies:
       call-bound: 1.0.4
@@ -3354,6 +3483,8 @@ snapshots:
     transitivePeerDependencies:
       - supports-color
 
+  human-signals@5.0.0: {}
+
   humanize-ms@1.2.1:
     dependencies:
       ms: 2.1.3
@@ -3451,6 +3582,8 @@ snapshots:
       has-tostringtag: 1.0.2
       safe-regex-test: 1.1.0
 
+  is-generator@1.0.3: {}
+
   is-glob@4.0.3:
     dependencies:
       is-extglob: 2.1.1
@@ -3483,6 +3616,8 @@ snapshots:
 
   is-stream@2.0.1: {}
 
+  is-stream@3.0.0: {}
+
   is-string@1.1.1:
     dependencies:
       call-bound: 1.0.4
@@ -3607,6 +3742,8 @@ snapshots:
 
   lodash.merge@4.6.2: {}
 
+  lodash@4.17.19: {}
+
   log-update@6.1.0:
     dependencies:
       ansi-escapes: 7.0.0
@@ -3621,10 +3758,30 @@ snapshots:
 
   math-intrinsics@1.1.0: {}
 
+  mcpvals@0.0.1(react@19.1.0):
+    dependencies:
+      '@ai-sdk/anthropic': 1.2.12(zod@3.25.67)
+      '@ai-sdk/openai': 1.3.22(zod@3.25.67)
+      '@modelcontextprotocol/sdk': 1.13.1
+      ai: 4.3.16(react@19.1.0)(zod@3.25.67)
+      chalk: 5.4.1
+      commander: 11.1.0
+      execa: 8.0.1
+      node-oauth2-server: 2.4.0
+      oauth2-server: 3.1.1
+      zod: 3.25.67
+    transitivePeerDependencies:
+      - react
+      - supports-color
+
+  media-typer@0.3.0: {}
+
   media-typer@1.1.0: {}
 
   merge-descriptors@2.0.0: {}
 
+  merge-stream@2.0.0: {}
+
   merge2@1.4.1: {}
 
   micromatch@4.0.8:
@@ -3644,6 +3801,8 @@ snapshots:
     dependencies:
       mime-db: 1.54.0
 
+  mimic-fn@4.0.0: {}
+
   mimic-function@5.0.1: {}
 
   minimatch@3.1.2:
@@ -3672,6 +3831,23 @@ snapshots:
     dependencies:
       whatwg-url: 5.0.0
 
+  node-oauth2-server@2.4.0:
+    dependencies:
+      basic-auth: 0.0.1
+
+  npm-run-path@5.3.0:
+    dependencies:
+      path-key: 4.0.0
+
+  oauth2-server@3.1.1:
+    dependencies:
+      basic-auth: 2.0.1
+      bluebird: 3.7.2
+      lodash: 4.17.19
+      promisify-any: 2.0.1
+      statuses: 1.5.0
+      type-is: 1.6.18
+
   object-assign@4.1.1: {}
 
   object-inspect@1.13.4: {}
@@ -3727,6 +3903,10 @@ snapshots:
     dependencies:
       wrappy: 1.0.2
 
+  onetime@6.0.0:
+    dependencies:
+      mimic-fn: 4.0.0
+
   onetime@7.0.0:
     dependencies:
       mimic-function: 5.0.1
@@ -3784,6 +3964,8 @@ snapshots:
 
   path-key@3.1.1: {}
 
+  path-key@4.0.0: {}
+
   path-parse@1.0.7: {}
 
   path-to-regexp@8.2.0: {}
@@ -3848,6 +4030,12 @@ snapshots:
 
   process-warning@5.0.0: {}
 
+  promisify-any@2.0.1:
+    dependencies:
+      bluebird: 2.11.0
+      co-bluebird: 1.1.0
+      is-generator: 1.0.3
+
   prop-types@15.8.1:
     dependencies:
       loose-envify: 1.4.0
@@ -3960,6 +4148,8 @@ snapshots:
       has-symbols: 1.1.0
       isarray: 2.0.5
 
+  safe-buffer@5.1.2: {}
+
   safe-buffer@5.2.1: {}
 
   safe-push-apply@1.0.0:
@@ -4097,6 +4287,8 @@ snapshots:
 
   split2@4.2.0: {}
 
+  statuses@1.5.0: {}
+
   statuses@2.0.1: {}
 
   statuses@2.0.2: {}
@@ -4162,6 +4354,8 @@ snapshots:
     dependencies:
       ansi-regex: 6.1.0
 
+  strip-final-newline@3.0.0: {}
+
   strip-json-comments@3.1.1: {}
 
   supports-color@7.2.0:
@@ -4215,6 +4409,11 @@ snapshots:
     dependencies:
       prelude-ls: 1.2.1
 
+  type-is@1.6.18:
+    dependencies:
+      media-typer: 0.3.0
+      mime-types: 2.1.35
+
   type-is@2.0.1:
     dependencies:
       content-type: 1.0.5
diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml
index 053bcf6..33dd6e3 100644
--- a/pnpm-workspace.yaml
+++ b/pnpm-workspace.yaml
@@ -1,5 +1,2 @@
 packages:
-  - .
-
-overrides:
-  mcpvals: link:../../mcpvals
+  - .
\ No newline at end of file