1"""MCP Server Evaluation Harness
2
3This script evaluates MCP servers by running test questions against them using Claude.
4"""
5
6import argparse
7import asyncio
8import json
9import re
10import sys
11import time
12import traceback
13import xml.etree.ElementTree as ET
14from pathlib import Path
15from typing import Any
16
17from anthropic import Anthropic
18
19from connections import create_connection
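# `create_connection` comes from the local `connections` module (not shown in this
# file). As used below, the returned connection is assumed to be an async context
# manager that exposes `await connection.list_tools()` (returning tool definitions
# in the shape the Anthropic Messages API accepts) and
# `await connection.call_tool(name, input)`.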

EVALUATION_PROMPT = """You are an AI assistant with access to tools.

When given a task, you MUST:
1. Use the available tools to complete the task
2. Provide a summary of each step in your approach, wrapped in <summary> tags
3. Provide feedback on the tools provided, wrapped in <feedback> tags
4. Provide your final response, wrapped in <response> tags

Summary Requirements:
- In your <summary> tags, you must explain:
  - The steps you took to complete the task
  - Which tools you used, in what order, and why
  - The inputs you provided to each tool
  - The outputs you received from each tool
  - A summary of how you arrived at the response

Feedback Requirements:
- In your <feedback> tags, provide constructive feedback on the tools:
  - Comment on tool names: Are they clear and descriptive?
  - Comment on input parameters: Are they well-documented? Are required vs optional parameters clear?
  - Comment on descriptions: Do they accurately describe what the tool does?
  - Comment on any errors encountered during tool usage: Did the tool fail to execute? Did the tool return too many tokens?
  - Identify specific areas for improvement and explain WHY they would help
  - Be specific and actionable in your suggestions

Response Requirements:
- Your response should be concise and directly address what was asked
- Always wrap your final response in <response> tags
- If you cannot solve the task, return <response>NOT_FOUND</response>
- For numeric responses, provide just the number
- For IDs, provide just the ID
- For names or text, provide the exact text requested
- Your response should go last"""


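# Sketch of the evaluation file shape parsed below (element text is illustrative;
# the root element name is arbitrary because the parser searches for qa_pair
# elements anywhere in the tree):
#
# <evaluation>
#   <qa_pair>
#     <question>How many open tickets are assigned to alice?</question>
#     <answer>42</answer>
#   </qa_pair>
# </evaluation>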
def parse_evaluation_file(file_path: Path) -> list[dict[str, Any]]:
    """Parse XML evaluation file with qa_pair elements."""
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()
        evaluations = []

        for qa_pair in root.findall(".//qa_pair"):
            question_elem = qa_pair.find("question")
            answer_elem = qa_pair.find("answer")

            if question_elem is not None and answer_elem is not None:
                evaluations.append({
                    "question": (question_elem.text or "").strip(),
                    "answer": (answer_elem.text or "").strip(),
                })

        return evaluations
    except Exception as e:
        print(f"Error parsing evaluation file {file_path}: {e}")
        return []


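# For example, for model output ending in "<response>42</response>", calling
# extract_xml_content(text, "response") returns "42"; when a tag appears more
# than once, the last occurrence wins.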
def extract_xml_content(text: str, tag: str) -> str | None:
    """Extract content from XML tags."""
    pattern = rf"<{tag}>(.*?)</{tag}>"
    matches = re.findall(pattern, text, re.DOTALL)
    return matches[-1].strip() if matches else None


async def agent_loop(
    client: Anthropic,
    model: str,
    question: str,
    tools: list[dict[str, Any]],
    connection: Any,
) -> tuple[str, dict[str, Any]]:
    """Run the agent loop with MCP tools."""
    messages = [{"role": "user", "content": question}]

    response = await asyncio.to_thread(
        client.messages.create,
        model=model,
        max_tokens=4096,
        system=EVALUATION_PROMPT,
        messages=messages,
        tools=tools,
    )

    messages.append({"role": "assistant", "content": response.content})

    tool_metrics = {}

    while response.stop_reason == "tool_use":
        # The Messages API requires a tool_result for every tool_use block in the
        # assistant turn, so handle each one (the model may call tools in parallel).
        tool_results = []
        for tool_use in (block for block in response.content if block.type == "tool_use"):
            tool_name = tool_use.name
            tool_input = tool_use.input

            tool_start_ts = time.time()
            try:
                tool_result = await connection.call_tool(tool_name, tool_input)
                tool_response = json.dumps(tool_result) if isinstance(tool_result, (dict, list)) else str(tool_result)
            except Exception as e:
                tool_response = f"Error executing tool {tool_name}: {str(e)}\n"
                tool_response += traceback.format_exc()
            tool_duration = time.time() - tool_start_ts

            if tool_name not in tool_metrics:
                tool_metrics[tool_name] = {"count": 0, "durations": []}
            tool_metrics[tool_name]["count"] += 1
            tool_metrics[tool_name]["durations"].append(tool_duration)

            tool_results.append({
                "type": "tool_result",
                "tool_use_id": tool_use.id,
                "content": tool_response,
            })

        messages.append({"role": "user", "content": tool_results})

        response = await asyncio.to_thread(
            client.messages.create,
            model=model,
            max_tokens=4096,
            system=EVALUATION_PROMPT,
            messages=messages,
            tools=tools,
        )
        messages.append({"role": "assistant", "content": response.content})

    # Default to an empty string so downstream XML extraction never receives None.
    response_text = next(
        (block.text for block in response.content if hasattr(block, "text")),
        "",
    )
    return response_text, tool_metrics


async def evaluate_single_task(
    client: Anthropic,
    model: str,
    qa_pair: dict[str, Any],
    tools: list[dict[str, Any]],
    connection: Any,
    task_index: int,
) -> dict[str, Any]:
    """Evaluate a single QA pair with the given tools."""
    start_time = time.time()

    print(f"Task {task_index + 1}: Running task with question: {qa_pair['question']}")
    response, tool_metrics = await agent_loop(client, model, qa_pair["question"], tools, connection)

    response_value = extract_xml_content(response, "response")
    summary = extract_xml_content(response, "summary")
    feedback = extract_xml_content(response, "feedback")

    duration_seconds = time.time() - start_time

    return {
        "question": qa_pair["question"],
        "expected": qa_pair["answer"],
        "actual": response_value,
        "score": int(response_value == qa_pair["answer"]) if response_value else 0,
        "total_duration": duration_seconds,
        "tool_calls": tool_metrics,
        "num_tool_calls": sum(len(metrics["durations"]) for metrics in tool_metrics.values()),
        "summary": summary,
        "feedback": feedback,
    }


REPORT_HEADER = """
# Evaluation Report

## Summary

- **Accuracy**: {correct}/{total} ({accuracy:.1f}%)
- **Average Task Duration**: {average_duration_s:.2f}s
- **Average Tool Calls per Task**: {average_tool_calls:.2f}
- **Total Tool Calls**: {total_tool_calls}

---
"""

TASK_TEMPLATE = """
### Task {task_num}

**Question**: {question}
**Ground Truth Answer**: `{expected_answer}`
**Actual Answer**: `{actual_answer}`
**Correct**: {correct_indicator}
**Duration**: {total_duration:.2f}s
**Tool Calls**: {tool_calls}

**Summary**
{summary}

**Feedback**
{feedback}

---
"""


async def run_evaluation(
    eval_path: Path,
    connection: Any,
    model: str = "claude-3-7-sonnet-20250219",
) -> str:
    """Run evaluation with MCP server tools."""
    print("🚀 Starting Evaluation")

    client = Anthropic()

    tools = await connection.list_tools()
    print(f"📋 Loaded {len(tools)} tools from MCP server")

    qa_pairs = parse_evaluation_file(eval_path)
    print(f"📋 Loaded {len(qa_pairs)} evaluation tasks")

    results = []
    for i, qa_pair in enumerate(qa_pairs):
        print(f"Processing task {i + 1}/{len(qa_pairs)}")
        result = await evaluate_single_task(client, model, qa_pair, tools, connection, i)
        results.append(result)

    correct = sum(r["score"] for r in results)
    accuracy = (correct / len(results)) * 100 if results else 0
    average_duration_s = sum(r["total_duration"] for r in results) / len(results) if results else 0
    average_tool_calls = sum(r["num_tool_calls"] for r in results) / len(results) if results else 0
    total_tool_calls = sum(r["num_tool_calls"] for r in results)

    report = REPORT_HEADER.format(
        correct=correct,
        total=len(results),
        accuracy=accuracy,
        average_duration_s=average_duration_s,
        average_tool_calls=average_tool_calls,
        total_tool_calls=total_tool_calls,
    )

    report += "".join([
        TASK_TEMPLATE.format(
            task_num=i + 1,
            question=qa_pair["question"],
            expected_answer=qa_pair["answer"],
            actual_answer=result["actual"] or "N/A",
            correct_indicator="✅" if result["score"] else "❌",
            total_duration=result["total_duration"],
            tool_calls=json.dumps(result["tool_calls"], indent=2),
            summary=result["summary"] or "N/A",
            feedback=result["feedback"] or "N/A",
        )
        for i, (qa_pair, result) in enumerate(zip(qa_pairs, results))
    ])

    return report


def parse_headers(header_list: list[str]) -> dict[str, str]:
    """Parse header strings in format 'Key: Value' into a dictionary."""
    headers = {}
    if not header_list:
        return headers

    for header in header_list:
        if ":" in header:
            key, value = header.split(":", 1)
            headers[key.strip()] = value.strip()
        else:
            print(f"Warning: Ignoring malformed header: {header}")
    return headers


def parse_env_vars(env_list: list[str]) -> dict[str, str]:
    """Parse environment variable strings in format 'KEY=VALUE' into a dictionary."""
    env = {}
    if not env_list:
        return env

    for env_var in env_list:
        if "=" in env_var:
            key, value = env_var.split("=", 1)
            env[key.strip()] = value.strip()
        else:
            print(f"Warning: Ignoring malformed environment variable: {env_var}")
    return env

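# For example, parse_headers(["Authorization: Bearer abc"]) returns
# {"Authorization": "Bearer abc"} and parse_env_vars(["API_KEY=xyz"]) returns
# {"API_KEY": "xyz"}; malformed entries are skipped with a warning.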

async def main():
    parser = argparse.ArgumentParser(
        description="Evaluate MCP servers using test questions",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Evaluate a local stdio MCP server
  python evaluation.py -t stdio -c python -a my_server.py eval.xml

  # Evaluate an SSE MCP server
  python evaluation.py -t sse -u https://example.com/mcp -H "Authorization: Bearer token" eval.xml

  # Evaluate an HTTP MCP server with custom model
  python evaluation.py -t http -u https://example.com/mcp -m claude-3-5-sonnet-20241022 eval.xml
        """,
    )

    parser.add_argument("eval_file", type=Path, help="Path to evaluation XML file")
    parser.add_argument("-t", "--transport", choices=["stdio", "sse", "http"], default="stdio", help="Transport type (default: stdio)")
    parser.add_argument("-m", "--model", default="claude-3-7-sonnet-20250219", help="Claude model to use (default: claude-3-7-sonnet-20250219)")

    stdio_group = parser.add_argument_group("stdio options")
    stdio_group.add_argument("-c", "--command", help="Command to run MCP server (stdio only)")
    stdio_group.add_argument("-a", "--args", nargs="+", help="Arguments for the command (stdio only)")
    stdio_group.add_argument("-e", "--env", nargs="+", help="Environment variables in KEY=VALUE format (stdio only)")

    remote_group = parser.add_argument_group("sse/http options")
    remote_group.add_argument("-u", "--url", help="MCP server URL (sse/http only)")
    remote_group.add_argument("-H", "--header", nargs="+", dest="headers", help="HTTP headers in 'Key: Value' format (sse/http only)")

    parser.add_argument("-o", "--output", type=Path, help="Output file for evaluation report (default: stdout)")

    args = parser.parse_args()

    if not args.eval_file.exists():
        print(f"Error: Evaluation file not found: {args.eval_file}")
        sys.exit(1)

    headers = parse_headers(args.headers) if args.headers else None
    env_vars = parse_env_vars(args.env) if args.env else None

    try:
        connection = create_connection(
            transport=args.transport,
            command=args.command,
            args=args.args,
            env=env_vars,
            url=args.url,
            headers=headers,
        )
    except ValueError as e:
        print(f"Error: {e}")
        sys.exit(1)

    print(f"🔗 Connecting to MCP server via {args.transport}...")

    async with connection:
        print("✅ Connected successfully")
        report = await run_evaluation(args.eval_file, connection, args.model)

        if args.output:
            args.output.write_text(report)
            print(f"\n✅ Report saved to {args.output}")
        else:
            print("\n" + report)


if __name__ == "__main__":
    asyncio.run(main())