A2A lint

crivetimihai · crivetimihai · commit 7ac08671c2bf · 2025-08-20T21:52:26.000+01:00
Signed-off-by: Mihai Criveti <crivetimihai@gmail.com>
diff --git a/mcp-servers/python/mcp_eval_server/mcp_eval_server/judges/anthropic_judge.py b/mcp-servers/python/mcp_eval_server/mcp_eval_server/judges/anthropic_judge.py
@@ -308,7 +308,16 @@ async def rank_responses(
         raise ValueError(f"Unknown ranking method: {ranking_method}")
 
     async def _rank_by_scoring(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
-        """Rank by scoring each response individually."""
+        """Rank by scoring each response individually.
+
+        Args:
+            responses: List of response strings to rank
+            criteria: Evaluation criteria to use for scoring
+            context: Optional context for evaluation
+
+        Returns:
+            RankingResult containing ranked responses with scores and reasoning
+        """
         rubric = EvaluationRubric(criteria=criteria, scale_description={"1": "Poor", "2": "Below Average", "3": "Average", "4": "Good", "5": "Excellent"})
 
         # Evaluate each response
@@ -325,7 +334,16 @@ async def _rank_by_scoring(self, responses: List[str], criteria: List[Evaluation
         return RankingResult(rankings=ranked_results, consistency_score=1.0, reasoning="Ranked by individual scoring of each response")
 
     async def _rank_by_tournament(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
-        """Rank using tournament-style pairwise comparisons."""
+        """Rank using tournament-style pairwise comparisons.
+
+        Args:
+            responses: List of response strings to rank
+            criteria: Evaluation criteria to use for comparisons
+            context: Optional context for evaluation
+
+        Returns:
+            RankingResult containing ranked responses based on tournament wins
+        """
         n = len(responses)
         wins = [0] * n
 
@@ -363,7 +381,16 @@ async def _rank_by_tournament(self, responses: List[str], criteria: List[Evaluat
         return RankingResult(rankings=ranked_results, consistency_score=max(0.0, consistency), reasoning="Ranked by tournament-style pairwise comparisons")
 
     async def _rank_by_round_robin(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
-        """Rank using round-robin pairwise comparisons."""
+        """Rank using round-robin pairwise comparisons.
+
+        Args:
+            responses: List of response strings to rank
+            criteria: Evaluation criteria to use for comparisons
+            context: Optional context for evaluation
+
+        Returns:
+            RankingResult containing ranked responses based on round-robin wins
+        """
         # For now, implement same as tournament
         return await self._rank_by_tournament(responses, criteria, context)
 
diff --git a/mcp-servers/python/mcp_eval_server/mcp_eval_server/judges/bedrock_judge.py b/mcp-servers/python/mcp_eval_server/mcp_eval_server/judges/bedrock_judge.py
@@ -77,6 +77,9 @@ async def _make_api_call(self, messages: List[Dict[str, str]], temperature: Opti
 
         Returns:
             Response content from the API
+
+        Raises:
+            Exception: If Bedrock API call fails
         """
         # Format for Anthropic models on Bedrock
         system_message = ""
@@ -338,7 +341,16 @@ async def rank_responses(
         raise ValueError(f"Unknown ranking method: {ranking_method}")
 
     async def _rank_by_scoring(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
-        """Rank by scoring each response individually."""
+        """Rank by scoring each response individually.
+
+        Args:
+            responses: List of response strings to rank
+            criteria: Evaluation criteria to use for scoring
+            context: Optional context for evaluation
+
+        Returns:
+            RankingResult containing ranked responses with scores and reasoning
+        """
         rubric = EvaluationRubric(criteria=criteria, scale_description={"1": "Poor", "2": "Below Average", "3": "Average", "4": "Good", "5": "Excellent"})
 
         # Evaluate each response
@@ -355,7 +367,16 @@ async def _rank_by_scoring(self, responses: List[str], criteria: List[Evaluation
         return RankingResult(rankings=ranked_results, consistency_score=1.0, reasoning="Ranked by individual scoring of each response")
 
     async def _rank_by_tournament(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
-        """Rank using tournament-style pairwise comparisons."""
+        """Rank using tournament-style pairwise comparisons.
+
+        Args:
+            responses: List of response strings to rank
+            criteria: Evaluation criteria to use for comparisons
+            context: Optional context for evaluation
+
+        Returns:
+            RankingResult containing ranked responses based on tournament wins
+        """
         n = len(responses)
         wins = [0] * n
 
@@ -393,7 +414,16 @@ async def _rank_by_tournament(self, responses: List[str], criteria: List[Evaluat
         return RankingResult(rankings=ranked_results, consistency_score=max(0.0, consistency), reasoning="Ranked by tournament-style pairwise comparisons")
 
     async def _rank_by_round_robin(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
-        """Rank using round-robin pairwise comparisons."""
+        """Rank using round-robin pairwise comparisons.
+
+        Args:
+            responses: List of response strings to rank
+            criteria: Evaluation criteria to use for comparisons
+            context: Optional context for evaluation
+
+        Returns:
+            RankingResult containing ranked responses based on round-robin wins
+        """
         # For now, implement same as tournament
         return await self._rank_by_tournament(responses, criteria, context)
 
diff --git a/mcp-servers/python/mcp_eval_server/mcp_eval_server/judges/gemini_judge.py b/mcp-servers/python/mcp_eval_server/mcp_eval_server/judges/gemini_judge.py
@@ -80,6 +80,9 @@ async def _make_api_call(self, messages: List[Dict[str, str]], temperature: Opti
 
         Returns:
             Response content from the API
+
+        Raises:
+            Exception: If Gemini API call fails
         """
         self.logger.debug(f"🔗 Making Gemini API call to {self.model_name}")
         self.logger.debug(f"   Messages: {len(messages)}, Temperature: {temperature or self.temperature}, Max tokens: {max_tokens or self.max_tokens}")
@@ -342,7 +345,16 @@ async def rank_responses(
         raise ValueError(f"Unknown ranking method: {ranking_method}")
 
     async def _rank_by_scoring(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
-        """Rank by scoring each response individually."""
+        """Rank by scoring each response individually.
+
+        Args:
+            responses: List of response strings to rank
+            criteria: Evaluation criteria to use for scoring
+            context: Optional context for evaluation
+
+        Returns:
+            RankingResult containing ranked responses with scores and reasoning
+        """
         rubric = EvaluationRubric(criteria=criteria, scale_description={"1": "Poor", "2": "Below Average", "3": "Average", "4": "Good", "5": "Excellent"})
 
         # Evaluate each response
@@ -359,7 +371,16 @@ async def _rank_by_scoring(self, responses: List[str], criteria: List[Evaluation
         return RankingResult(rankings=ranked_results, consistency_score=1.0, reasoning="Ranked by individual scoring of each response")
 
     async def _rank_by_tournament(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
-        """Rank using tournament-style pairwise comparisons."""
+        """Rank using tournament-style pairwise comparisons.
+
+        Args:
+            responses: List of response strings to rank
+            criteria: Evaluation criteria to use for comparisons
+            context: Optional context for evaluation
+
+        Returns:
+            RankingResult containing ranked responses based on tournament wins
+        """
         n = len(responses)
         wins = [0] * n
 
@@ -397,7 +418,16 @@ async def _rank_by_tournament(self, responses: List[str], criteria: List[Evaluat
         return RankingResult(rankings=ranked_results, consistency_score=max(0.0, consistency), reasoning="Ranked by tournament-style pairwise comparisons")
 
     async def _rank_by_round_robin(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
-        """Rank using round-robin pairwise comparisons."""
+        """Rank using round-robin pairwise comparisons.
+
+        Args:
+            responses: List of response strings to rank
+            criteria: Evaluation criteria to use for comparisons
+            context: Optional context for evaluation
+
+        Returns:
+            RankingResult containing ranked responses based on round-robin wins
+        """
         # For now, implement same as tournament
         return await self._rank_by_tournament(responses, criteria, context)
 
diff --git a/mcp-servers/python/mcp_eval_server/mcp_eval_server/judges/ollama_judge.py b/mcp-servers/python/mcp_eval_server/mcp_eval_server/judges/ollama_judge.py
@@ -55,7 +55,11 @@ def __init__(self, config: Dict[str, Any]) -> None:
         self._is_healthy = None
 
     async def _get_session(self):
-        """Get or create HTTP session."""
+        """Get or create HTTP session.
+
+        Returns:
+            aiohttp.ClientSession: HTTP session for making requests
+        """
         if self.session is None:
             timeout = aiohttp.ClientTimeout(total=self.request_timeout)
             self.session = aiohttp.ClientSession(timeout=timeout)
@@ -77,7 +81,11 @@ def __del__(self):
                 pass
 
     async def is_healthy(self) -> bool:
-        """Check if OLLAMA server is healthy and model is available."""
+        """Check if OLLAMA server is healthy and model is available.
+
+        Returns:
+            bool: True if OLLAMA server is healthy and model is available, False otherwise
+        """
         if self._is_healthy is not None:
             return self._is_healthy
 
@@ -112,6 +120,9 @@ async def _make_api_call(self, messages: List[Dict[str, str]], temperature: Opti
 
         Returns:
             Response content from the API
+
+        Raises:
+            Exception: If OLLAMA API call fails
         """
         session = await self._get_session()
 
@@ -354,7 +365,16 @@ async def rank_responses(
         raise ValueError(f"Unknown ranking method: {ranking_method}")
 
     async def _rank_by_scoring(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
-        """Rank by scoring each response individually."""
+        """Rank by scoring each response individually.
+
+        Args:
+            responses: List of response strings to rank
+            criteria: Evaluation criteria to use for scoring
+            context: Optional context for evaluation
+
+        Returns:
+            RankingResult containing ranked responses with scores and reasoning
+        """
         rubric = EvaluationRubric(criteria=criteria, scale_description={"1": "Poor", "2": "Below Average", "3": "Average", "4": "Good", "5": "Excellent"})
 
         # Evaluate each response
@@ -371,7 +391,16 @@ async def _rank_by_scoring(self, responses: List[str], criteria: List[Evaluation
         return RankingResult(rankings=ranked_results, consistency_score=1.0, reasoning="Ranked by individual scoring of each response")
 
     async def _rank_by_tournament(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
-        """Rank using tournament-style pairwise comparisons."""
+        """Rank using tournament-style pairwise comparisons.
+
+        Args:
+            responses: List of response strings to rank
+            criteria: Evaluation criteria to use for comparisons
+            context: Optional context for evaluation
+
+        Returns:
+            RankingResult containing ranked responses based on tournament wins
+        """
         n = len(responses)
         wins = [0] * n
 
@@ -409,7 +438,16 @@ async def _rank_by_tournament(self, responses: List[str], criteria: List[Evaluat
         return RankingResult(rankings=ranked_results, consistency_score=max(0.0, consistency), reasoning="Ranked by tournament-style pairwise comparisons")
 
     async def _rank_by_round_robin(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
-        """Rank using round-robin pairwise comparisons."""
+        """Rank using round-robin pairwise comparisons.
+
+        Args:
+            responses: List of response strings to rank
+            criteria: Evaluation criteria to use for comparisons
+            context: Optional context for evaluation
+
+        Returns:
+            RankingResult containing ranked responses based on round-robin wins
+        """
         # For now, implement same as tournament
         return await self._rank_by_tournament(responses, criteria, context)
 
diff --git a/mcp-servers/python/mcp_eval_server/mcp_eval_server/judges/watsonx_judge.py b/mcp-servers/python/mcp_eval_server/mcp_eval_server/judges/watsonx_judge.py
@@ -95,6 +95,9 @@ async def _make_api_call(self, messages: List[Dict[str, str]], temperature: Opti
 
         Returns:
             Response content from the API
+
+        Raises:
+            Exception: If Watsonx.ai API call fails
         """
         self.logger.debug(f"🔗 Making Watsonx.ai API call to {self.model}")
         self.logger.debug(f"   Messages: {len(messages)}, Temperature: {temperature or self.temperature}, Max tokens: {max_tokens or self.max_tokens}")
@@ -359,7 +362,16 @@ async def rank_responses(
         raise ValueError(f"Unknown ranking method: {ranking_method}")
 
     async def _rank_by_scoring(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
-        """Rank by scoring each response individually."""
+        """Rank by scoring each response individually.
+
+        Args:
+            responses: List of response strings to rank
+            criteria: Evaluation criteria to use for scoring
+            context: Optional context for evaluation
+
+        Returns:
+            RankingResult containing ranked responses with scores and reasoning
+        """
         rubric = EvaluationRubric(criteria=criteria, scale_description={"1": "Poor", "2": "Below Average", "3": "Average", "4": "Good", "5": "Excellent"})
 
         # Evaluate each response
@@ -376,7 +388,16 @@ async def _rank_by_scoring(self, responses: List[str], criteria: List[Evaluation
         return RankingResult(rankings=ranked_results, consistency_score=1.0, reasoning="Ranked by individual scoring of each response")
 
     async def _rank_by_tournament(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
-        """Rank using tournament-style pairwise comparisons."""
+        """Rank using tournament-style pairwise comparisons.
+
+        Args:
+            responses: List of response strings to rank
+            criteria: Evaluation criteria to use for comparisons
+            context: Optional context for evaluation
+
+        Returns:
+            RankingResult containing ranked responses based on tournament wins
+        """
         n = len(responses)
         wins = [0] * n
 
@@ -414,7 +435,16 @@ async def _rank_by_tournament(self, responses: List[str], criteria: List[Evaluat
         return RankingResult(rankings=ranked_results, consistency_score=max(0.0, consistency), reasoning="Ranked by tournament-style pairwise comparisons")
 
     async def _rank_by_round_robin(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
-        """Rank using round-robin pairwise comparisons."""
+        """Rank using round-robin pairwise comparisons.
+
+        Args:
+            responses: List of response strings to rank
+            criteria: Evaluation criteria to use for comparisons
+            context: Optional context for evaluation
+
+        Returns:
+            RankingResult containing ranked responses based on round-robin wins
+        """
         # For now, implement same as tournament
         return await self._rank_by_tournament(responses, criteria, context)
 
diff --git a/mcp-servers/python/mcp_eval_server/mcp_eval_server/server.py b/mcp-servers/python/mcp_eval_server/mcp_eval_server/server.py
@@ -548,7 +548,7 @@ async def main():
             deployment = os.getenv("AZURE_DEPLOYMENT_NAME", "not configured")
             endpoint_info = f" → {endpoint} (deployment: {deployment})"
         elif provider == "anthropic":
-            endpoint_info = f" → https://api.anthropic.com"
+            endpoint_info = " → https://api.anthropic.com"
         elif provider == "ollama":
             base_url = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
             # Test OLLAMA connectivity for status display
@@ -562,19 +562,19 @@ async def test_ollama():
                         async with aiohttp.ClientSession(timeout=timeout) as session:
                             async with session.get(f"{base_url}/api/tags") as response:
                                 return response.status == 200
-                    except:
+                    except Exception:
                         return False
 
                 is_connected = await test_ollama()
                 status = "🟢 connected" if is_connected else "🔴 not reachable"
                 endpoint_info = f" → {base_url} ({status})"
-            except:
+            except Exception:
                 endpoint_info = f" → {base_url} (🔴 not reachable)"
         elif provider == "bedrock":
             region = os.getenv("AWS_REGION", "us-east-1")
             endpoint_info = f" → AWS Bedrock ({region})"
         elif provider == "gemini":
-            endpoint_info = f" → Google AI Studio"
+            endpoint_info = " → Google AI Studio"
         elif provider == "watsonx":
             watsonx_url = os.getenv("WATSONX_URL", "https://us-south.ml.cloud.ibm.com")
             project_id = os.getenv("WATSONX_PROJECT_ID", "not configured")
diff --git a/mcp-servers/python/mcp_eval_server/mcp_eval_server/tools/judge_tools.py b/mcp-servers/python/mcp_eval_server/mcp_eval_server/tools/judge_tools.py
@@ -130,7 +130,7 @@ def _load_judges(self) -> None:
         if GeminiJudge:
             google_api_key = os.getenv("GOOGLE_API_KEY")
             if google_api_key:
-                self.logger.debug(f"Loading Google Gemini judges")
+                self.logger.debug("Loading Google Gemini judges")
                 for model_name, model_config in config.get("models", {}).get("gemini", {}).items():
                     try:
                         self.judges[model_name] = GeminiJudge(model_config)
@@ -146,7 +146,7 @@ def _load_judges(self) -> None:
             watsonx_api_key = os.getenv("WATSONX_API_KEY")
             watsonx_project_id = os.getenv("WATSONX_PROJECT_ID")
             if watsonx_api_key and watsonx_project_id:
-                self.logger.debug(f"Loading IBM Watsonx.ai judges")
+                self.logger.debug("Loading IBM Watsonx.ai judges")
                 for model_name, model_config in config.get("models", {}).get("watsonx", {}).items():
                     try:
                         self.judges[model_name] = WatsonxJudge(model_config)
diff --git a/mcp-servers/python/mcp_eval_server/test_all_providers.py b/mcp-servers/python/mcp_eval_server/test_all_providers.py
diff --git a/mcp-servers/python/mcp_eval_server/validate_models.py b/mcp-servers/python/mcp_eval_server/validate_models.py
diff --git a/mcpgateway/alembic/versions/1fc1795f6983_merge_a2a_and_custom_name_changes.py b/mcpgateway/alembic/versions/1fc1795f6983_merge_a2a_and_custom_name_changes.py