Skip to content

Commit df0ad5d

Browse files
authored
Pylint fixes (#797)
* Pylint Signed-off-by: Mihai Criveti <[email protected]> * Pylint Signed-off-by: Mihai Criveti <[email protected]> * Pylint Signed-off-by: Mihai Criveti <[email protected]> * Pylint Signed-off-by: Mihai Criveti <[email protected]> * Pylint Signed-off-by: Mihai Criveti <[email protected]> --------- Signed-off-by: Mihai Criveti <[email protected]>
1 parent 7ac0867 commit df0ad5d

File tree

16 files changed

+582
-1832
lines changed

16 files changed

+582
-1832
lines changed

mcp-servers/python/mcp_eval_server/mcp_eval_server/judges/anthropic_judge.py

Lines changed: 9 additions & 289 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,8 @@
22
"""Anthropic judge implementation for LLM-as-a-judge evaluation."""
33

44
# Standard
5-
import asyncio
6-
import json
75
import logging
86
import os
9-
import secrets
107
from typing import Any, Dict, List, Optional
118

129
try:
@@ -55,7 +52,7 @@ def __init__(self, config: Dict[str, Any]) -> None:
5552
self.client = AsyncAnthropic(api_key=api_key)
5653
self.model = config["model_name"]
5754

58-
self.logger.debug(f"🔧 Initialized Anthropic judge: {self.model}")
55+
self.logger.debug("🔧 Initialized Anthropic judge: %s", self.model)
5956

6057
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
6158
async def _make_api_call(self, messages: List[Dict[str, str]], temperature: Optional[float] = None, max_tokens: Optional[int] = None) -> str:
@@ -116,71 +113,13 @@ async def evaluate_response(
116113
criteria_text = self._format_criteria(criteria)
117114
rubric_text = self._format_rubric(rubric)
118115

119-
context_section = f"\n\nCONTEXT:\n{context}" if context else ""
120-
121-
cot_instruction = "Please think step by step and provide detailed reasoning for each score before giving your final scores." if use_cot else ""
122-
123-
prompt = f"""You are an expert evaluator. Assess the following response based on the given criteria.
124-
125-
{context_section}
126-
127-
RESPONSE TO EVALUATE:
128-
{response}
129-
130-
EVALUATION CRITERIA:
131-
{criteria_text}
132-
133-
SCORING RUBRIC:
134-
{rubric_text}
135-
136-
{cot_instruction}
137-
138-
Please provide your evaluation in the following JSON format:
139-
{{
140-
"reasoning": {{
141-
"criterion_name": "detailed reasoning for this criterion",
142-
...
143-
}},
144-
"scores": {{
145-
"criterion_name": score_value,
146-
...
147-
}},
148-
"confidence": confidence_level_0_to_1
149-
}}
150-
151-
Ensure all scores are within the specified scale for each criterion."""
116+
prompt = self._render_template("evaluation", context=context, response=response, criteria_text=criteria_text, rubric_text=rubric_text, use_cot=use_cot)
152117

153118
messages = [{"role": "system", "content": "You are a professional evaluation expert. Provide thorough, unbiased assessments."}, {"role": "user", "content": prompt}]
154119

155120
response_text = await self._make_api_call(messages)
156121

157-
try:
158-
# Extract JSON from response
159-
json_start = response_text.find("{")
160-
json_end = response_text.rfind("}") + 1
161-
json_text = response_text[json_start:json_end]
162-
result_data = json.loads(json_text)
163-
164-
# Calculate overall score
165-
overall_score = self._calculate_overall_score(result_data["scores"], criteria)
166-
167-
return EvaluationResult(
168-
scores=result_data["scores"],
169-
reasoning=result_data["reasoning"],
170-
overall_score=overall_score,
171-
confidence=result_data.get("confidence", 0.8),
172-
metadata={"model": self.model, "temperature": self.temperature, "use_cot": use_cot},
173-
)
174-
175-
except (json.JSONDecodeError, KeyError) as e:
176-
# Fallback parsing if JSON is malformed
177-
return EvaluationResult(
178-
scores={c.name: 3.0 for c in criteria}, # Default middle scores
179-
reasoning={c.name: "Error parsing judge response" for c in criteria},
180-
overall_score=3.0,
181-
confidence=0.3,
182-
metadata={"model": self.model, "error": str(e), "raw_response": response_text},
183-
)
122+
return self._parse_evaluation_response(response_text, criteria, model=self.model, temperature=self.temperature, use_cot=use_cot)
184123

185124
async def pairwise_comparison(
186125
self,
@@ -202,77 +141,7 @@ async def pairwise_comparison(
202141
Returns:
203142
Pairwise comparison result
204143
"""
205-
206-
# Position bias mitigation: randomly swap A and B
207-
original_order = True
208-
if position_bias_mitigation and secrets.randbelow(2) == 0:
209-
response_a, response_b = response_b, response_a
210-
original_order = False
211-
212-
criteria_text = self._format_criteria(criteria)
213-
context_section = f"\n\nCONTEXT:\n{context}" if context else ""
214-
215-
prompt = f"""You are an expert evaluator. Compare the following two responses and determine which is better.
216-
217-
{context_section}
218-
219-
RESPONSE A:
220-
{response_a}
221-
222-
RESPONSE B:
223-
{response_b}
224-
225-
COMPARISON CRITERIA:
226-
{criteria_text}
227-
228-
Please provide a detailed comparison and determine the winner. Consider each criterion carefully.
229-
230-
Provide your evaluation in the following JSON format:
231-
{{
232-
"winner": "A" | "B" | "tie",
233-
"confidence_score": confidence_level_0_to_1,
234-
"reasoning": "detailed comparison reasoning",
235-
"criterion_scores": {{
236-
"criterion_name": "A" | "B" | "tie",
237-
...
238-
}},
239-
"margin": strength_of_preference_0_to_1
240-
}}"""
241-
242-
messages = [{"role": "system", "content": "You are a professional evaluation expert. Provide fair, detailed comparisons."}, {"role": "user", "content": prompt}]
243-
244-
response_text = await self._make_api_call(messages)
245-
246-
try:
247-
json_start = response_text.find("{")
248-
json_end = response_text.rfind("}") + 1
249-
json_text = response_text[json_start:json_end]
250-
result_data = json.loads(json_text)
251-
252-
# Adjust winner if we swapped positions
253-
winner = result_data["winner"]
254-
if not original_order and winner in ["A", "B"]:
255-
winner = "B" if winner == "A" else "A"
256-
257-
# Adjust criterion scores
258-
criterion_scores = result_data.get("criterion_scores", {})
259-
if not original_order:
260-
for k, v in criterion_scores.items():
261-
if v == "A":
262-
criterion_scores[k] = "B"
263-
elif v == "B":
264-
criterion_scores[k] = "A"
265-
266-
return PairwiseResult(
267-
winner=winner,
268-
confidence_score=result_data.get("confidence_score", 0.8),
269-
reasoning=result_data.get("reasoning", ""),
270-
criterion_scores=criterion_scores,
271-
margin=result_data.get("margin", 0.5),
272-
)
273-
274-
except (json.JSONDecodeError, KeyError) as e:
275-
return PairwiseResult(winner="tie", confidence_score=0.3, reasoning=f"Error parsing judge response: {str(e)}", criterion_scores={}, margin=0.0)
144+
return await self._base_pairwise_comparison(response_a, response_b, criteria, context, position_bias_mitigation)
276145

277146
async def rank_responses(
278147
self,
@@ -287,112 +156,15 @@ async def rank_responses(
287156
responses: List of response strings to rank
288157
criteria: List of evaluation criteria for ranking
289158
context: Optional context for evaluation
290-
ranking_method: Method to use for ranking ("tournament", "scoring", "round_robin")
159+
ranking_method: Method to use for ranking
291160
292161
Returns:
293162
RankingResult containing ranked responses and consistency score
294163
295164
Raises:
296165
ValueError: If less than 2 responses provided or unknown ranking method
297166
"""
298-
299-
if len(responses) < 2:
300-
raise ValueError("Need at least 2 responses to rank")
301-
302-
if ranking_method == "scoring":
303-
return await self._rank_by_scoring(responses, criteria, context)
304-
if ranking_method == "tournament":
305-
return await self._rank_by_tournament(responses, criteria, context)
306-
if ranking_method == "round_robin":
307-
return await self._rank_by_round_robin(responses, criteria, context)
308-
raise ValueError(f"Unknown ranking method: {ranking_method}")
309-
310-
async def _rank_by_scoring(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
311-
"""Rank by scoring each response individually.
312-
313-
Args:
314-
responses: List of response strings to rank
315-
criteria: Evaluation criteria to use for scoring
316-
context: Optional context for evaluation
317-
318-
Returns:
319-
RankingResult containing ranked responses with scores and reasoning
320-
"""
321-
rubric = EvaluationRubric(criteria=criteria, scale_description={"1": "Poor", "2": "Below Average", "3": "Average", "4": "Good", "5": "Excellent"})
322-
323-
# Evaluate each response
324-
evaluation_tasks = [self.evaluate_response(response, criteria, rubric, context) for response in responses]
325-
evaluations = await asyncio.gather(*evaluation_tasks)
326-
327-
# Sort by overall score
328-
ranked_results = []
329-
for i, evaluation in enumerate(evaluations):
330-
ranked_results.append({"response_index": i, "response": responses[i], "score": evaluation.overall_score, "reasoning": evaluation.reasoning})
331-
332-
ranked_results.sort(key=lambda x: x["score"], reverse=True)
333-
334-
return RankingResult(rankings=ranked_results, consistency_score=1.0, reasoning="Ranked by individual scoring of each response")
335-
336-
async def _rank_by_tournament(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
337-
"""Rank using tournament-style pairwise comparisons.
338-
339-
Args:
340-
responses: List of response strings to rank
341-
criteria: Evaluation criteria to use for comparisons
342-
context: Optional context for evaluation
343-
344-
Returns:
345-
RankingResult containing ranked responses based on tournament wins
346-
"""
347-
n = len(responses)
348-
wins = [0] * n
349-
350-
# Perform all pairwise comparisons
351-
comparison_tasks = []
352-
pairs = []
353-
354-
for i in range(n):
355-
for j in range(i + 1, n):
356-
pairs.append((i, j))
357-
comparison_tasks.append(self.pairwise_comparison(responses[i], responses[j], criteria, context))
358-
359-
comparisons = await asyncio.gather(*comparison_tasks)
360-
361-
# Count wins
362-
for (i, j), comparison in zip(pairs, comparisons):
363-
if comparison.winner == "A":
364-
wins[i] += 1
365-
elif comparison.winner == "B":
366-
wins[j] += 1
367-
else: # tie
368-
wins[i] += 0.5
369-
wins[j] += 0.5
370-
371-
# Sort by wins
372-
ranked_indices = sorted(range(n), key=lambda i: wins[i], reverse=True)
373-
374-
ranked_results = []
375-
for rank, idx in enumerate(ranked_indices):
376-
ranked_results.append({"response_index": idx, "response": responses[idx], "score": wins[idx] / (n - 1), "wins": wins[idx], "rank": rank + 1})
377-
378-
# Calculate consistency (simplified)
379-
consistency = 1.0 - (sum(abs(wins[i] - wins[j]) for i in range(n) for j in range(i + 1, n)) / (n * (n - 1) / 2)) / n
380-
381-
return RankingResult(rankings=ranked_results, consistency_score=max(0.0, consistency), reasoning="Ranked by tournament-style pairwise comparisons")
382-
383-
async def _rank_by_round_robin(self, responses: List[str], criteria: List[EvaluationCriteria], context: Optional[str] = None) -> RankingResult:
384-
"""Rank using round-robin pairwise comparisons.
385-
386-
Args:
387-
responses: List of response strings to rank
388-
criteria: Evaluation criteria to use for comparisons
389-
context: Optional context for evaluation
390-
391-
Returns:
392-
RankingResult containing ranked responses based on round-robin wins
393-
"""
394-
# For now, implement same as tournament
395-
return await self._rank_by_tournament(responses, criteria, context)
167+
return await self._base_rank_responses(responses, criteria, context, ranking_method)
396168

397169
async def evaluate_with_reference(
398170
self,
@@ -406,62 +178,10 @@ async def evaluate_with_reference(
406178
Args:
407179
response: Response text to evaluate
408180
reference: Reference text to compare against
409-
evaluation_type: Type of evaluation ("factuality", "completeness", "style_match")
410-
tolerance: Tolerance level for evaluation ("strict", "moderate", "lenient")
181+
evaluation_type: Type of evaluation
182+
tolerance: Tolerance level for evaluation
411183
412184
Returns:
413185
ReferenceEvaluationResult containing score and analysis
414186
"""
415-
416-
type_descriptions = {
417-
"factuality": "Compare the factual accuracy and correctness of information",
418-
"completeness": "Assess how completely the response covers the reference content",
419-
"style_match": "Evaluate how well the writing style and tone match the reference",
420-
}
421-
422-
tolerance_descriptions = {
423-
"strict": "Require exact matches and perfect alignment",
424-
"moderate": "Allow reasonable variations while maintaining core accuracy",
425-
"loose": "Accept substantial variations as long as general meaning is preserved",
426-
}
427-
428-
prompt = f"""You are an expert evaluator. Compare the following response against the reference and evaluate based on {evaluation_type}.
429-
430-
REFERENCE (Gold Standard):
431-
{reference}
432-
433-
RESPONSE TO EVALUATE:
434-
{response}
435-
436-
EVALUATION TYPE: {type_descriptions.get(evaluation_type, evaluation_type)}
437-
TOLERANCE LEVEL: {tolerance_descriptions.get(tolerance, tolerance)}
438-
439-
Please provide your evaluation in the following JSON format:
440-
{{
441-
"similarity_score": overall_similarity_0_to_1,
442-
"missing_elements": ["element1", "element2", ...],
443-
"extra_elements": ["element1", "element2", ...],
444-
"factual_errors": ["error1", "error2", ...],
445-
"reasoning": "detailed comparison reasoning"
446-
}}"""
447-
448-
messages = [{"role": "system", "content": "You are a professional evaluation expert. Provide thorough, accurate assessments against reference standards."}, {"role": "user", "content": prompt}]
449-
450-
response_text = await self._make_api_call(messages)
451-
452-
try:
453-
json_start = response_text.find("{")
454-
json_end = response_text.rfind("}") + 1
455-
json_text = response_text[json_start:json_end]
456-
result_data = json.loads(json_text)
457-
458-
return ReferenceEvaluationResult(
459-
similarity_score=result_data.get("similarity_score", 0.5),
460-
missing_elements=result_data.get("missing_elements", []),
461-
extra_elements=result_data.get("extra_elements", []),
462-
factual_errors=result_data.get("factual_errors", []),
463-
reasoning=result_data.get("reasoning", ""),
464-
)
465-
466-
except (json.JSONDecodeError, KeyError) as e:
467-
return ReferenceEvaluationResult(similarity_score=0.5, missing_elements=[], extra_elements=[], factual_errors=[], reasoning=f"Error parsing judge response: {str(e)}")
187+
return await self._base_reference_evaluation(response, reference, evaluation_type, tolerance)

mcp-servers/python/mcp_eval_server/mcp_eval_server/judges/azure_judge.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from typing import Any, Dict
88

99
# Third-Party
10+
from jinja2 import Environment, FileSystemLoader
1011
from openai import AsyncAzureOpenAI
1112

1213
# Local
@@ -32,6 +33,10 @@ def __init__(self, config: Dict[str, Any]) -> None: # pylint: disable=super-ini
3233
self.max_tokens = config.get("max_tokens", 2000)
3334
self.logger = logging.getLogger(__name__)
3435

36+
# Set up Jinja2 template environment (from BaseJudge)
37+
template_dir = os.path.join(os.path.dirname(__file__), "templates")
38+
self.jinja_env = Environment(loader=FileSystemLoader(template_dir), trim_blocks=True, lstrip_blocks=True)
39+
3540
# Azure-specific client setup
3641
api_key = os.getenv(config["api_key_env"])
3742
if not api_key:

0 commit comments

Comments (0)