1"""MCP Server Evaluation Harness
2
3This script evaluates MCP servers by running test questions against them using Claude.
4"""
5
6import argparse
7import asyncio
8import json
9import re
10import sys
11import time
12import traceback
13import xml.etree.ElementTree as ET
14from pathlib import Path
15from typing import Any
16
17from anthropic import Anthropic
18
19from connections import create_connection
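# `create_connection` comes from the local `connections` module (not shown in this
# file). As used below, the returned connection is assumed to be an async context
# manager that exposes `await connection.list_tools()` (returning tool definitions
# in the shape the Anthropic Messages API accepts) and
# `await connection.call_tool(name, input)`.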

EVALUATION_PROMPT = """You are an AI assistant with access to tools.

When given a task, you MUST:
1. Use the available tools to complete the task
2. Provide a summary of each step in your approach, wrapped in <summary> tags
3. Provide feedback on the tools provided, wrapped in <feedback> tags
4. Provide your final response, wrapped in <response> tags

Summary Requirements:
- In your <summary> tags, you must explain:
  - The steps you took to complete the task
  - Which tools you used, in what order, and why
  - The inputs you provided to each tool
  - The outputs you received from each tool
  - A summary of how you arrived at the response

Feedback Requirements:
- In your <feedback> tags, provide constructive feedback on the tools:
  - Comment on tool names: Are they clear and descriptive?
  - Comment on input parameters: Are they well-documented? Are required vs optional parameters clear?
  - Comment on descriptions: Do they accurately describe what the tool does?
  - Comment on any errors encountered during tool usage: Did the tool fail to execute? Did the tool return too many tokens?
  - Identify specific areas for improvement and explain WHY they would help
  - Be specific and actionable in your suggestions

Response Requirements:
- Your response should be concise and directly address what was asked
- Always wrap your final response in <response> tags
- If you cannot solve the task, return <response>NOT_FOUND</response>
- For numeric responses, provide just the number
- For IDs, provide just the ID
- For names or text, provide the exact text requested
- Your response should go last"""


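# Sketch of the evaluation file shape parsed below (element text is illustrative;
# the root element name is arbitrary because the parser searches for qa_pair
# elements anywhere in the tree):
#
# <evaluation>
#   <qa_pair>
#     <question>How many open tickets are assigned to alice?</question>
#     <answer>42</answer>
#   </qa_pair>
# </evaluation>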
def parse_evaluation_file(file_path: Path) -> list[dict[str, Any]]:
    """Parse XML evaluation file with qa_pair elements."""
    try:
        tree = ET.parse(file_path)
        root = tree.getroot()
        evaluations = []

        for qa_pair in root.findall(".//qa_pair"):
            question_elem = qa_pair.find("question")
            answer_elem = qa_pair.find("answer")

            if question_elem is not None and answer_elem is not None:
                evaluations.append({
                    "question": (question_elem.text or "").strip(),
                    "answer": (answer_elem.text or "").strip(),
                })

        return evaluations
    except Exception as e:
        print(f"Error parsing evaluation file {file_path}: {e}")
        return []


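# For example, for model output ending in "<response>42</response>", calling
# extract_xml_content(text, "response") returns "42"; when a tag appears more
# than once, the last occurrence wins.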
def extract_xml_content(text: str, tag: str) -> str | None:
    """Extract content from XML tags."""
    pattern = rf"<{tag}>(.*?)</{tag}>"
    matches = re.findall(pattern, text, re.DOTALL)
    return matches[-1].strip() if matches else None


async def agent_loop(
    client: Anthropic,
    model: str,
    question: str,
    tools: list[dict[str, Any]],
    connection: Any,
) -> tuple[str, dict[str, Any]]:
    """Run the agent loop with MCP tools."""
    messages = [{"role": "user", "content": question}]

    response = await asyncio.to_thread(
        client.messages.create,
        model=model,
        max_tokens=4096,
        system=EVALUATION_PROMPT,
        messages=messages,
        tools=tools,
    )

    messages.append({"role": "assistant", "content": response.content})

    tool_metrics = {}

    while response.stop_reason == "tool_use":
        # The Messages API requires a tool_result for every tool_use block in the
        # assistant turn, so handle each one (the model may call tools in parallel).
        tool_results = []
        for tool_use in (block for block in response.content if block.type == "tool_use"):
            tool_name = tool_use.name
            tool_input = tool_use.input

            tool_start_ts = time.time()
            try:
                tool_result = await connection.call_tool(tool_name, tool_input)
                tool_response = json.dumps(tool_result) if isinstance(tool_result, (dict, list)) else str(tool_result)
            except Exception as e:
                tool_response = f"Error executing tool {tool_name}: {str(e)}\n"
                tool_response += traceback.format_exc()
            tool_duration = time.time() - tool_start_ts

            if tool_name not in tool_metrics:
                tool_metrics[tool_name] = {"count": 0, "durations": []}
            tool_metrics[tool_name]["count"] += 1
            tool_metrics[tool_name]["durations"].append(tool_duration)

            tool_results.append({
                "type": "tool_result",
                "tool_use_id": tool_use.id,
                "content": tool_response,
            })

        messages.append({"role": "user", "content": tool_results})

        response = await asyncio.to_thread(
            client.messages.create,
            model=model,
            max_tokens=4096,
            system=EVALUATION_PROMPT,
            messages=messages,
            tools=tools,
        )
        messages.append({"role": "assistant", "content": response.content})

    # Default to an empty string so downstream XML extraction never receives None.
    response_text = next(
        (block.text for block in response.content if hasattr(block, "text")),
        "",
    )
    return response_text, tool_metrics


async def evaluate_single_task(
    client: Anthropic,
    model: str,
    qa_pair: dict[str, Any],
    tools: list[dict[str, Any]],
    connection: Any,
    task_index: int,
) -> dict[str, Any]:
    """Evaluate a single QA pair with the given tools."""
    start_time = time.time()

    print(f"Task {task_index + 1}: Running task with question: {qa_pair['question']}")
    response, tool_metrics = await agent_loop(client, model, qa_pair["question"], tools, connection)

    response_value = extract_xml_content(response, "response")
    summary = extract_xml_content(response, "summary")
    feedback = extract_xml_content(response, "feedback")

    duration_seconds = time.time() - start_time

    return {
        "question": qa_pair["question"],
        "expected": qa_pair["answer"],
        "actual": response_value,
        "score": int(response_value == qa_pair["answer"]) if response_value else 0,
        "total_duration": duration_seconds,
        "tool_calls": tool_metrics,
        "num_tool_calls": sum(len(metrics["durations"]) for metrics in tool_metrics.values()),
        "summary": summary,
        "feedback": feedback,
    }


REPORT_HEADER = """
# Evaluation Report

## Summary

- **Accuracy**: {correct}/{total} ({accuracy:.1f}%)
- **Average Task Duration**: {average_duration_s:.2f}s
- **Average Tool Calls per Task**: {average_tool_calls:.2f}
- **Total Tool Calls**: {total_tool_calls}

---
"""

TASK_TEMPLATE = """
### Task {task_num}

**Question**: {question}
**Ground Truth Answer**: `{expected_answer}`
**Actual Answer**: `{actual_answer}`
**Correct**: {correct_indicator}
**Duration**: {total_duration:.2f}s
**Tool Calls**: {tool_calls}

**Summary**
{summary}

**Feedback**
{feedback}

---
"""


async def run_evaluation(
    eval_path: Path,
    connection: Any,
    model: str = "claude-3-7-sonnet-20250219",
) -> str:
    """Run evaluation with MCP server tools."""
    print("🚀 Starting Evaluation")

    client = Anthropic()

    tools = await connection.list_tools()
    print(f"📋 Loaded {len(tools)} tools from MCP server")

    qa_pairs = parse_evaluation_file(eval_path)
    print(f"📋 Loaded {len(qa_pairs)} evaluation tasks")

    results = []
    for i, qa_pair in enumerate(qa_pairs):
        print(f"Processing task {i + 1}/{len(qa_pairs)}")
        result = await evaluate_single_task(client, model, qa_pair, tools, connection, i)
        results.append(result)

    correct = sum(r["score"] for r in results)
    accuracy = (correct / len(results)) * 100 if results else 0
    average_duration_s = sum(r["total_duration"] for r in results) / len(results) if results else 0
    average_tool_calls = sum(r["num_tool_calls"] for r in results) / len(results) if results else 0
    total_tool_calls = sum(r["num_tool_calls"] for r in results)

    report = REPORT_HEADER.format(
        correct=correct,
        total=len(results),
        accuracy=accuracy,
        average_duration_s=average_duration_s,
        average_tool_calls=average_tool_calls,
        total_tool_calls=total_tool_calls,
    )

    report += "".join([
        TASK_TEMPLATE.format(
            task_num=i + 1,
            question=qa_pair["question"],
            expected_answer=qa_pair["answer"],
            actual_answer=result["actual"] or "N/A",
            correct_indicator="✅" if result["score"] else "❌",
            total_duration=result["total_duration"],
            tool_calls=json.dumps(result["tool_calls"], indent=2),
            summary=result["summary"] or "N/A",
            feedback=result["feedback"] or "N/A",
        )
        for i, (qa_pair, result) in enumerate(zip(qa_pairs, results))
    ])

    return report


def parse_headers(header_list: list[str]) -> dict[str, str]:
    """Parse header strings in format 'Key: Value' into a dictionary."""
    headers = {}
    if not header_list:
        return headers

    for header in header_list:
        if ":" in header:
            key, value = header.split(":", 1)
            headers[key.strip()] = value.strip()
        else:
            print(f"Warning: Ignoring malformed header: {header}")
    return headers


def parse_env_vars(env_list: list[str]) -> dict[str, str]:
    """Parse environment variable strings in format 'KEY=VALUE' into a dictionary."""
    env = {}
    if not env_list:
        return env

    for env_var in env_list:
        if "=" in env_var:
            key, value = env_var.split("=", 1)
            env[key.strip()] = value.strip()
        else:
            print(f"Warning: Ignoring malformed environment variable: {env_var}")
    return env

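# For example, parse_headers(["Authorization: Bearer abc"]) returns
# {"Authorization": "Bearer abc"} and parse_env_vars(["API_KEY=xyz"]) returns
# {"API_KEY": "xyz"}; malformed entries are skipped with a warning.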

async def main():
    parser = argparse.ArgumentParser(
        description="Evaluate MCP servers using test questions",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Evaluate a local stdio MCP server
  python evaluation.py -t stdio -c python -a my_server.py eval.xml

  # Evaluate an SSE MCP server
  python evaluation.py -t sse -u https://example.com/mcp -H "Authorization: Bearer token" eval.xml

  # Evaluate an HTTP MCP server with custom model
  python evaluation.py -t http -u https://example.com/mcp -m claude-3-5-sonnet-20241022 eval.xml
        """,
    )

    parser.add_argument("eval_file", type=Path, help="Path to evaluation XML file")
    parser.add_argument("-t", "--transport", choices=["stdio", "sse", "http"], default="stdio", help="Transport type (default: stdio)")
    parser.add_argument("-m", "--model", default="claude-3-7-sonnet-20250219", help="Claude model to use (default: claude-3-7-sonnet-20250219)")

    stdio_group = parser.add_argument_group("stdio options")
    stdio_group.add_argument("-c", "--command", help="Command to run MCP server (stdio only)")
    stdio_group.add_argument("-a", "--args", nargs="+", help="Arguments for the command (stdio only)")
    stdio_group.add_argument("-e", "--env", nargs="+", help="Environment variables in KEY=VALUE format (stdio only)")

    remote_group = parser.add_argument_group("sse/http options")
    remote_group.add_argument("-u", "--url", help="MCP server URL (sse/http only)")
    remote_group.add_argument("-H", "--header", nargs="+", dest="headers", help="HTTP headers in 'Key: Value' format (sse/http only)")

    parser.add_argument("-o", "--output", type=Path, help="Output file for evaluation report (default: stdout)")

    args = parser.parse_args()

    if not args.eval_file.exists():
        print(f"Error: Evaluation file not found: {args.eval_file}")
        sys.exit(1)

    headers = parse_headers(args.headers) if args.headers else None
    env_vars = parse_env_vars(args.env) if args.env else None

    try:
        connection = create_connection(
            transport=args.transport,
            command=args.command,
            args=args.args,
            env=env_vars,
            url=args.url,
            headers=headers,
        )
    except ValueError as e:
        print(f"Error: {e}")
        sys.exit(1)

    print(f"🔗 Connecting to MCP server via {args.transport}...")

    async with connection:
        print("✅ Connected successfully")
        report = await run_evaluation(args.eval_file, connection, args.model)

        if args.output:
            args.output.write_text(report)
            print(f"\n✅ Report saved to {args.output}")
        else:
            print("\n" + report)


if __name__ == "__main__":
    asyncio.run(main())