Skip to content

Commit 7ac0867

Browse files
committed
A2A lint
Signed-off-by: Mihai Criveti <[email protected]>
1 parent df217a1 commit 7ac0867

File tree

10 files changed

+217
-35
lines changed

10 files changed

+217
-35
lines changed

mcp-servers/python/mcp_eval_server/mcp_eval_server/judges/anthropic_judge.py

Lines changed: 30 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -308,7 +308,16 @@ async def rank_responses(
308308
raise ValueError(f"Unknown ranking method: {ranking_method}")
309309

310310
async def _rank_by_scoring(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
311-
"""Rank by scoring each response individually."""
311+
"""Rank by scoring each response individually.
312+
313+
Args:
314+
responses: List of response strings to rank
315+
criteria: Evaluation criteria to use for scoring
316+
context: Optional context for evaluation
317+
318+
Returns:
319+
RankingResult containing ranked responses with scores and reasoning
320+
"""
312321
rubric = EvaluationRubric(criteria=criteria, scale_description={"1": "Poor", "2": "Below Average", "3": "Average", "4": "Good", "5": "Excellent"})
313322

314323
# Evaluate each response
@@ -325,7 +334,16 @@ async def _rank_by_scoring(self, responses: List[str], criteria: List[Evaluation
325334
return RankingResult(rankings=ranked_results, consistency_score=1.0, reasoning="Ranked by individual scoring of each response")
326335

327336
async def _rank_by_tournament(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
328-
"""Rank using tournament-style pairwise comparisons."""
337+
"""Rank using tournament-style pairwise comparisons.
338+
339+
Args:
340+
responses: List of response strings to rank
341+
criteria: Evaluation criteria to use for comparisons
342+
context: Optional context for evaluation
343+
344+
Returns:
345+
RankingResult containing ranked responses based on tournament wins
346+
"""
329347
n = len(responses)
330348
wins = [0] * n
331349

@@ -363,7 +381,16 @@ async def _rank_by_tournament(self, responses: List[str], criteria: List[Evaluat
363381
return RankingResult(rankings=ranked_results, consistency_score=max(0.0, consistency), reasoning="Ranked by tournament-style pairwise comparisons")
364382

365383
async def _rank_by_round_robin(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
366-
"""Rank using round-robin pairwise comparisons."""
384+
"""Rank using round-robin pairwise comparisons.
385+
386+
Args:
387+
responses: List of response strings to rank
388+
criteria: Evaluation criteria to use for comparisons
389+
context: Optional context for evaluation
390+
391+
Returns:
392+
RankingResult containing ranked responses based on round-robin wins
393+
"""
367394
# For now, implement same as tournament
368395
return await self._rank_by_tournament(responses, criteria, context)
369396

mcp-servers/python/mcp_eval_server/mcp_eval_server/judges/bedrock_judge.py

Lines changed: 33 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -77,6 +77,9 @@ async def _make_api_call(self, messages: List[Dict[str, str]], temperature: Opti
7777
7878
Returns:
7979
Response content from the API
80+
81+
Raises:
82+
Exception: If Bedrock API call fails
8083
"""
8184
# Format for Anthropic models on Bedrock
8285
system_message = ""
@@ -338,7 +341,16 @@ async def rank_responses(
338341
raise ValueError(f"Unknown ranking method: {ranking_method}")
339342

340343
async def _rank_by_scoring(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
341-
"""Rank by scoring each response individually."""
344+
"""Rank by scoring each response individually.
345+
346+
Args:
347+
responses: List of response strings to rank
348+
criteria: Evaluation criteria to use for scoring
349+
context: Optional context for evaluation
350+
351+
Returns:
352+
RankingResult containing ranked responses with scores and reasoning
353+
"""
342354
rubric = EvaluationRubric(criteria=criteria, scale_description={"1": "Poor", "2": "Below Average", "3": "Average", "4": "Good", "5": "Excellent"})
343355

344356
# Evaluate each response
@@ -355,7 +367,16 @@ async def _rank_by_scoring(self, responses: List[str], criteria: List[Evaluation
355367
return RankingResult(rankings=ranked_results, consistency_score=1.0, reasoning="Ranked by individual scoring of each response")
356368

357369
async def _rank_by_tournament(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
358-
"""Rank using tournament-style pairwise comparisons."""
370+
"""Rank using tournament-style pairwise comparisons.
371+
372+
Args:
373+
responses: List of response strings to rank
374+
criteria: Evaluation criteria to use for comparisons
375+
context: Optional context for evaluation
376+
377+
Returns:
378+
RankingResult containing ranked responses based on tournament wins
379+
"""
359380
n = len(responses)
360381
wins = [0] * n
361382

@@ -393,7 +414,16 @@ async def _rank_by_tournament(self, responses: List[str], criteria: List[Evaluat
393414
return RankingResult(rankings=ranked_results, consistency_score=max(0.0, consistency), reasoning="Ranked by tournament-style pairwise comparisons")
394415

395416
async def _rank_by_round_robin(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
396-
"""Rank using round-robin pairwise comparisons."""
417+
"""Rank using round-robin pairwise comparisons.
418+
419+
Args:
420+
responses: List of response strings to rank
421+
criteria: Evaluation criteria to use for comparisons
422+
context: Optional context for evaluation
423+
424+
Returns:
425+
RankingResult containing ranked responses based on round-robin wins
426+
"""
397427
# For now, implement same as tournament
398428
return await self._rank_by_tournament(responses, criteria, context)
399429

mcp-servers/python/mcp_eval_server/mcp_eval_server/judges/gemini_judge.py

Lines changed: 33 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -80,6 +80,9 @@ async def _make_api_call(self, messages: List[Dict[str, str]], temperature: Opti
8080
8181
Returns:
8282
Response content from the API
83+
84+
Raises:
85+
Exception: If Gemini API call fails
8386
"""
8487
self.logger.debug(f"🔗 Making Gemini API call to {self.model_name}")
8588
self.logger.debug(f" Messages: {len(messages)}, Temperature: {temperature or self.temperature}, Max tokens: {max_tokens or self.max_tokens}")
@@ -342,7 +345,16 @@ async def rank_responses(
342345
raise ValueError(f"Unknown ranking method: {ranking_method}")
343346

344347
async def _rank_by_scoring(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
345-
"""Rank by scoring each response individually."""
348+
"""Rank by scoring each response individually.
349+
350+
Args:
351+
responses: List of response strings to rank
352+
criteria: Evaluation criteria to use for scoring
353+
context: Optional context for evaluation
354+
355+
Returns:
356+
RankingResult containing ranked responses with scores and reasoning
357+
"""
346358
rubric = EvaluationRubric(criteria=criteria, scale_description={"1": "Poor", "2": "Below Average", "3": "Average", "4": "Good", "5": "Excellent"})
347359

348360
# Evaluate each response
@@ -359,7 +371,16 @@ async def _rank_by_scoring(self, responses: List[str], criteria: List[Evaluation
359371
return RankingResult(rankings=ranked_results, consistency_score=1.0, reasoning="Ranked by individual scoring of each response")
360372

361373
async def _rank_by_tournament(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
362-
"""Rank using tournament-style pairwise comparisons."""
374+
"""Rank using tournament-style pairwise comparisons.
375+
376+
Args:
377+
responses: List of response strings to rank
378+
criteria: Evaluation criteria to use for comparisons
379+
context: Optional context for evaluation
380+
381+
Returns:
382+
RankingResult containing ranked responses based on tournament wins
383+
"""
363384
n = len(responses)
364385
wins = [0] * n
365386

@@ -397,7 +418,16 @@ async def _rank_by_tournament(self, responses: List[str], criteria: List[Evaluat
397418
return RankingResult(rankings=ranked_results, consistency_score=max(0.0, consistency), reasoning="Ranked by tournament-style pairwise comparisons")
398419

399420
async def _rank_by_round_robin(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
400-
"""Rank using round-robin pairwise comparisons."""
421+
"""Rank using round-robin pairwise comparisons.
422+
423+
Args:
424+
responses: List of response strings to rank
425+
criteria: Evaluation criteria to use for comparisons
426+
context: Optional context for evaluation
427+
428+
Returns:
429+
RankingResult containing ranked responses based on round-robin wins
430+
"""
401431
# For now, implement same as tournament
402432
return await self._rank_by_tournament(responses, criteria, context)
403433

mcp-servers/python/mcp_eval_server/mcp_eval_server/judges/ollama_judge.py

Lines changed: 43 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,11 @@ def __init__(self, config: Dict[str, Any]) -> None:
5555
self._is_healthy = None
5656

5757
async def _get_session(self):
58-
"""Get or create HTTP session."""
58+
"""Get or create HTTP session.
59+
60+
Returns:
61+
aiohttp.ClientSession: HTTP session for making requests
62+
"""
5963
if self.session is None:
6064
timeout = aiohttp.ClientTimeout(total=self.request_timeout)
6165
self.session = aiohttp.ClientSession(timeout=timeout)
@@ -77,7 +81,11 @@ def __del__(self):
7781
pass
7882

7983
async def is_healthy(self) -> bool:
80-
"""Check if OLLAMA server is healthy and model is available."""
84+
"""Check if OLLAMA server is healthy and model is available.
85+
86+
Returns:
87+
bool: True if OLLAMA server is healthy and model is available, False otherwise
88+
"""
8189
if self._is_healthy is not None:
8290
return self._is_healthy
8391

@@ -112,6 +120,9 @@ async def _make_api_call(self, messages: List[Dict[str, str]], temperature: Opti
112120
113121
Returns:
114122
Response content from the API
123+
124+
Raises:
125+
Exception: If OLLAMA API call fails
115126
"""
116127
session = await self._get_session()
117128

@@ -354,7 +365,16 @@ async def rank_responses(
354365
raise ValueError(f"Unknown ranking method: {ranking_method}")
355366

356367
async def _rank_by_scoring(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
357-
"""Rank by scoring each response individually."""
368+
"""Rank by scoring each response individually.
369+
370+
Args:
371+
responses: List of response strings to rank
372+
criteria: Evaluation criteria to use for scoring
373+
context: Optional context for evaluation
374+
375+
Returns:
376+
RankingResult containing ranked responses with scores and reasoning
377+
"""
358378
rubric = EvaluationRubric(criteria=criteria, scale_description={"1": "Poor", "2": "Below Average", "3": "Average", "4": "Good", "5": "Excellent"})
359379

360380
# Evaluate each response
@@ -371,7 +391,16 @@ async def _rank_by_scoring(self, responses: List[str], criteria: List[Evaluation
371391
return RankingResult(rankings=ranked_results, consistency_score=1.0, reasoning="Ranked by individual scoring of each response")
372392

373393
async def _rank_by_tournament(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
374-
"""Rank using tournament-style pairwise comparisons."""
394+
"""Rank using tournament-style pairwise comparisons.
395+
396+
Args:
397+
responses: List of response strings to rank
398+
criteria: Evaluation criteria to use for comparisons
399+
context: Optional context for evaluation
400+
401+
Returns:
402+
RankingResult containing ranked responses based on tournament wins
403+
"""
375404
n = len(responses)
376405
wins = [0] * n
377406

@@ -409,7 +438,16 @@ async def _rank_by_tournament(self, responses: List[str], criteria: List[Evaluat
409438
return RankingResult(rankings=ranked_results, consistency_score=max(0.0, consistency), reasoning="Ranked by tournament-style pairwise comparisons")
410439

411440
async def _rank_by_round_robin(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
412-
"""Rank using round-robin pairwise comparisons."""
441+
"""Rank using round-robin pairwise comparisons.
442+
443+
Args:
444+
responses: List of response strings to rank
445+
criteria: Evaluation criteria to use for comparisons
446+
context: Optional context for evaluation
447+
448+
Returns:
449+
RankingResult containing ranked responses based on round-robin wins
450+
"""
413451
# For now, implement same as tournament
414452
return await self._rank_by_tournament(responses, criteria, context)
415453

mcp-servers/python/mcp_eval_server/mcp_eval_server/judges/watsonx_judge.py

Lines changed: 33 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -95,6 +95,9 @@ async def _make_api_call(self, messages: List[Dict[str, str]], temperature: Opti
9595
9696
Returns:
9797
Response content from the API
98+
99+
Raises:
100+
Exception: If Watsonx.ai API call fails
98101
"""
99102
self.logger.debug(f"🔗 Making Watsonx.ai API call to {self.model}")
100103
self.logger.debug(f" Messages: {len(messages)}, Temperature: {temperature or self.temperature}, Max tokens: {max_tokens or self.max_tokens}")
@@ -359,7 +362,16 @@ async def rank_responses(
359362
raise ValueError(f"Unknown ranking method: {ranking_method}")
360363

361364
async def _rank_by_scoring(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
362-
"""Rank by scoring each response individually."""
365+
"""Rank by scoring each response individually.
366+
367+
Args:
368+
responses: List of response strings to rank
369+
criteria: Evaluation criteria to use for scoring
370+
context: Optional context for evaluation
371+
372+
Returns:
373+
RankingResult containing ranked responses with scores and reasoning
374+
"""
363375
rubric = EvaluationRubric(criteria=criteria, scale_description={"1": "Poor", "2": "Below Average", "3": "Average", "4": "Good", "5": "Excellent"})
364376

365377
# Evaluate each response
@@ -376,7 +388,16 @@ async def _rank_by_scoring(self, responses: List[str], criteria: List[Evaluation
376388
return RankingResult(rankings=ranked_results, consistency_score=1.0, reasoning="Ranked by individual scoring of each response")
377389

378390
async def _rank_by_tournament(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
379-
"""Rank using tournament-style pairwise comparisons."""
391+
"""Rank using tournament-style pairwise comparisons.
392+
393+
Args:
394+
responses: List of response strings to rank
395+
criteria: Evaluation criteria to use for comparisons
396+
context: Optional context for evaluation
397+
398+
Returns:
399+
RankingResult containing ranked responses based on tournament wins
400+
"""
380401
n = len(responses)
381402
wins = [0] * n
382403

@@ -414,7 +435,16 @@ async def _rank_by_tournament(self, responses: List[str], criteria: List[Evaluat
414435
return RankingResult(rankings=ranked_results, consistency_score=max(0.0, consistency), reasoning="Ranked by tournament-style pairwise comparisons")
415436

416437
async def _rank_by_round_robin(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
417-
"""Rank using round-robin pairwise comparisons."""
438+
"""Rank using round-robin pairwise comparisons.
439+
440+
Args:
441+
responses: List of response strings to rank
442+
criteria: Evaluation criteria to use for comparisons
443+
context: Optional context for evaluation
444+
445+
Returns:
446+
RankingResult containing ranked responses based on round-robin wins
447+
"""
418448
# For now, implement same as tournament
419449
return await self._rank_by_tournament(responses, criteria, context)
420450

mcp-servers/python/mcp_eval_server/mcp_eval_server/server.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -548,7 +548,7 @@ async def main():
548548
deployment = os.getenv("AZURE_DEPLOYMENT_NAME", "not configured")
549549
endpoint_info = f" → {endpoint} (deployment: {deployment})"
550550
elif provider == "anthropic":
551-
endpoint_info = f" → https://api.anthropic.com"
551+
endpoint_info = " → https://api.anthropic.com"
552552
elif provider == "ollama":
553553
base_url = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
554554
# Test OLLAMA connectivity for status display
@@ -562,19 +562,19 @@ async def test_ollama():
562562
async with aiohttp.ClientSession(timeout=timeout) as session:
563563
async with session.get(f"{base_url}/api/tags") as response:
564564
return response.status == 200
565-
except:
565+
except Exception:
566566
return False
567567

568568
is_connected = await test_ollama()
569569
status = "🟢 connected" if is_connected else "🔴 not reachable"
570570
endpoint_info = f" → {base_url} ({status})"
571-
except:
571+
except Exception:
572572
endpoint_info = f" → {base_url} (🔴 not reachable)"
573573
elif provider == "bedrock":
574574
region = os.getenv("AWS_REGION", "us-east-1")
575575
endpoint_info = f" → AWS Bedrock ({region})"
576576
elif provider == "gemini":
577-
endpoint_info = f" → Google AI Studio"
577+
endpoint_info = " → Google AI Studio"
578578
elif provider == "watsonx":
579579
watsonx_url = os.getenv("WATSONX_URL", "https://us-south.ml.cloud.ibm.com")
580580
project_id = os.getenv("WATSONX_PROJECT_ID", "not configured")

mcp-servers/python/mcp_eval_server/mcp_eval_server/tools/judge_tools.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -130,7 +130,7 @@ def _load_judges(self) -> None:
130130
if GeminiJudge:
131131
google_api_key = os.getenv("GOOGLE_API_KEY")
132132
if google_api_key:
133-
self.logger.debug(f"Loading Google Gemini judges")
133+
self.logger.debug("Loading Google Gemini judges")
134134
for model_name, model_config in config.get("models", {}).get("gemini", {}).items():
135135
try:
136136
self.judges[model_name] = GeminiJudge(model_config)
@@ -146,7 +146,7 @@ def _load_judges(self) -> None:
146146
watsonx_api_key = os.getenv("WATSONX_API_KEY")
147147
watsonx_project_id = os.getenv("WATSONX_PROJECT_ID")
148148
if watsonx_api_key and watsonx_project_id:
149-
self.logger.debug(f"Loading IBM Watsonx.ai judges")
149+
self.logger.debug("Loading IBM Watsonx.ai judges")
150150
for model_name, model_config in config.get("models", {}).get("watsonx", {}).items():
151151
try:
152152
self.judges[model_name] = WatsonxJudge(model_config)

0 commit comments

Comments
 (0)