2
2
"""Anthropic judge implementation for LLM-as-a-judge evaluation."""
3
3
4
4
# Standard
5
- import asyncio
6
- import json
7
5
import logging
8
6
import os
9
- import secrets
10
7
from typing import Any , Dict , List , Optional
11
8
12
9
try :
@@ -55,7 +52,7 @@ def __init__(self, config: Dict[str, Any]) -> None:
55
52
self .client = AsyncAnthropic (api_key = api_key )
56
53
self .model = config ["model_name" ]
57
54
58
- self .logger .debug (f "🔧 Initialized Anthropic judge: { self .model } " )
55
+ self .logger .debug ("🔧 Initialized Anthropic judge: %s" , self .model )
59
56
60
57
@retry (stop = stop_after_attempt (3 ), wait = wait_exponential (multiplier = 1 , min = 4 , max = 10 ))
61
58
async def _make_api_call (self , messages : List [Dict [str , str ]], temperature : Optional [float ] = None , max_tokens : Optional [int ] = None ) -> str :
@@ -116,71 +113,13 @@ async def evaluate_response(
116
113
criteria_text = self ._format_criteria (criteria )
117
114
rubric_text = self ._format_rubric (rubric )
118
115
119
- context_section = f"\n \n CONTEXT:\n { context } " if context else ""
120
-
121
- cot_instruction = "Please think step by step and provide detailed reasoning for each score before giving your final scores." if use_cot else ""
122
-
123
- prompt = f"""You are an expert evaluator. Assess the following response based on the given criteria.
124
-
125
- { context_section }
126
-
127
- RESPONSE TO EVALUATE:
128
- { response }
129
-
130
- EVALUATION CRITERIA:
131
- { criteria_text }
132
-
133
- SCORING RUBRIC:
134
- { rubric_text }
135
-
136
- { cot_instruction }
137
-
138
- Please provide your evaluation in the following JSON format:
139
- {{
140
- "reasoning": {{
141
- "criterion_name": "detailed reasoning for this criterion",
142
- ...
143
- }},
144
- "scores": {{
145
- "criterion_name": score_value,
146
- ...
147
- }},
148
- "confidence": confidence_level_0_to_1
149
- }}
150
-
151
- Ensure all scores are within the specified scale for each criterion."""
116
+ prompt = self ._render_template ("evaluation" , context = context , response = response , criteria_text = criteria_text , rubric_text = rubric_text , use_cot = use_cot )
152
117
153
118
messages = [{"role" : "system" , "content" : "You are a professional evaluation expert. Provide thorough, unbiased assessments." }, {"role" : "user" , "content" : prompt }]
154
119
155
120
response_text = await self ._make_api_call (messages )
156
121
157
- try :
158
- # Extract JSON from response
159
- json_start = response_text .find ("{" )
160
- json_end = response_text .rfind ("}" ) + 1
161
- json_text = response_text [json_start :json_end ]
162
- result_data = json .loads (json_text )
163
-
164
- # Calculate overall score
165
- overall_score = self ._calculate_overall_score (result_data ["scores" ], criteria )
166
-
167
- return EvaluationResult (
168
- scores = result_data ["scores" ],
169
- reasoning = result_data ["reasoning" ],
170
- overall_score = overall_score ,
171
- confidence = result_data .get ("confidence" , 0.8 ),
172
- metadata = {"model" : self .model , "temperature" : self .temperature , "use_cot" : use_cot },
173
- )
174
-
175
- except (json .JSONDecodeError , KeyError ) as e :
176
- # Fallback parsing if JSON is malformed
177
- return EvaluationResult (
178
- scores = {c .name : 3.0 for c in criteria }, # Default middle scores
179
- reasoning = {c .name : "Error parsing judge response" for c in criteria },
180
- overall_score = 3.0 ,
181
- confidence = 0.3 ,
182
- metadata = {"model" : self .model , "error" : str (e ), "raw_response" : response_text },
183
- )
122
+ return self ._parse_evaluation_response (response_text , criteria , model = self .model , temperature = self .temperature , use_cot = use_cot )
184
123
185
124
async def pairwise_comparison (
186
125
self ,
@@ -202,77 +141,7 @@ async def pairwise_comparison(
202
141
Returns:
203
142
Pairwise comparison result
204
143
"""
205
-
206
- # Position bias mitigation: randomly swap A and B
207
- original_order = True
208
- if position_bias_mitigation and secrets .randbelow (2 ) == 0 :
209
- response_a , response_b = response_b , response_a
210
- original_order = False
211
-
212
- criteria_text = self ._format_criteria (criteria )
213
- context_section = f"\n \n CONTEXT:\n { context } " if context else ""
214
-
215
- prompt = f"""You are an expert evaluator. Compare the following two responses and determine which is better.
216
-
217
- { context_section }
218
-
219
- RESPONSE A:
220
- { response_a }
221
-
222
- RESPONSE B:
223
- { response_b }
224
-
225
- COMPARISON CRITERIA:
226
- { criteria_text }
227
-
228
- Please provide a detailed comparison and determine the winner. Consider each criterion carefully.
229
-
230
- Provide your evaluation in the following JSON format:
231
- {{
232
- "winner": "A" | "B" | "tie",
233
- "confidence_score": confidence_level_0_to_1,
234
- "reasoning": "detailed comparison reasoning",
235
- "criterion_scores": {{
236
- "criterion_name": "A" | "B" | "tie",
237
- ...
238
- }},
239
- "margin": strength_of_preference_0_to_1
240
- }}"""
241
-
242
- messages = [{"role" : "system" , "content" : "You are a professional evaluation expert. Provide fair, detailed comparisons." }, {"role" : "user" , "content" : prompt }]
243
-
244
- response_text = await self ._make_api_call (messages )
245
-
246
- try :
247
- json_start = response_text .find ("{" )
248
- json_end = response_text .rfind ("}" ) + 1
249
- json_text = response_text [json_start :json_end ]
250
- result_data = json .loads (json_text )
251
-
252
- # Adjust winner if we swapped positions
253
- winner = result_data ["winner" ]
254
- if not original_order and winner in ["A" , "B" ]:
255
- winner = "B" if winner == "A" else "A"
256
-
257
- # Adjust criterion scores
258
- criterion_scores = result_data .get ("criterion_scores" , {})
259
- if not original_order :
260
- for k , v in criterion_scores .items ():
261
- if v == "A" :
262
- criterion_scores [k ] = "B"
263
- elif v == "B" :
264
- criterion_scores [k ] = "A"
265
-
266
- return PairwiseResult (
267
- winner = winner ,
268
- confidence_score = result_data .get ("confidence_score" , 0.8 ),
269
- reasoning = result_data .get ("reasoning" , "" ),
270
- criterion_scores = criterion_scores ,
271
- margin = result_data .get ("margin" , 0.5 ),
272
- )
273
-
274
- except (json .JSONDecodeError , KeyError ) as e :
275
- return PairwiseResult (winner = "tie" , confidence_score = 0.3 , reasoning = f"Error parsing judge response: { str (e )} " , criterion_scores = {}, margin = 0.0 )
144
+ return await self ._base_pairwise_comparison (response_a , response_b , criteria , context , position_bias_mitigation )
276
145
277
146
async def rank_responses (
278
147
self ,
@@ -287,112 +156,15 @@ async def rank_responses(
287
156
responses: List of response strings to rank
288
157
criteria: List of evaluation criteria for ranking
289
158
context: Optional context for evaluation
290
- ranking_method: Method to use for ranking ("tournament", "scoring", "round_robin")
159
+ ranking_method: Method to use for ranking
291
160
292
161
Returns:
293
162
RankingResult containing ranked responses and consistency score
294
163
295
164
Raises:
296
165
ValueError: If less than 2 responses provided or unknown ranking method
297
166
"""
298
-
299
- if len (responses ) < 2 :
300
- raise ValueError ("Need at least 2 responses to rank" )
301
-
302
- if ranking_method == "scoring" :
303
- return await self ._rank_by_scoring (responses , criteria , context )
304
- if ranking_method == "tournament" :
305
- return await self ._rank_by_tournament (responses , criteria , context )
306
- if ranking_method == "round_robin" :
307
- return await self ._rank_by_round_robin (responses , criteria , context )
308
- raise ValueError (f"Unknown ranking method: { ranking_method } " )
309
-
310
- async def _rank_by_scoring (self , responses : List [str ], criteria : List [EvaluationCriteria ], context : Optional [str ] = None ) -> RankingResult :
311
- """Rank by scoring each response individually.
312
-
313
- Args:
314
- responses: List of response strings to rank
315
- criteria: Evaluation criteria to use for scoring
316
- context: Optional context for evaluation
317
-
318
- Returns:
319
- RankingResult containing ranked responses with scores and reasoning
320
- """
321
- rubric = EvaluationRubric (criteria = criteria , scale_description = {"1" : "Poor" , "2" : "Below Average" , "3" : "Average" , "4" : "Good" , "5" : "Excellent" })
322
-
323
- # Evaluate each response
324
- evaluation_tasks = [self .evaluate_response (response , criteria , rubric , context ) for response in responses ]
325
- evaluations = await asyncio .gather (* evaluation_tasks )
326
-
327
- # Sort by overall score
328
- ranked_results = []
329
- for i , evaluation in enumerate (evaluations ):
330
- ranked_results .append ({"response_index" : i , "response" : responses [i ], "score" : evaluation .overall_score , "reasoning" : evaluation .reasoning })
331
-
332
- ranked_results .sort (key = lambda x : x ["score" ], reverse = True )
333
-
334
- return RankingResult (rankings = ranked_results , consistency_score = 1.0 , reasoning = "Ranked by individual scoring of each response" )
335
-
336
- async def _rank_by_tournament (self , responses : List [str ], criteria : List [EvaluationCriteria ], context : Optional [str ] = None ) -> RankingResult :
337
- """Rank using tournament-style pairwise comparisons.
338
-
339
- Args:
340
- responses: List of response strings to rank
341
- criteria: Evaluation criteria to use for comparisons
342
- context: Optional context for evaluation
343
-
344
- Returns:
345
- RankingResult containing ranked responses based on tournament wins
346
- """
347
- n = len (responses )
348
- wins = [0 ] * n
349
-
350
- # Perform all pairwise comparisons
351
- comparison_tasks = []
352
- pairs = []
353
-
354
- for i in range (n ):
355
- for j in range (i + 1 , n ):
356
- pairs .append ((i , j ))
357
- comparison_tasks .append (self .pairwise_comparison (responses [i ], responses [j ], criteria , context ))
358
-
359
- comparisons = await asyncio .gather (* comparison_tasks )
360
-
361
- # Count wins
362
- for (i , j ), comparison in zip (pairs , comparisons ):
363
- if comparison .winner == "A" :
364
- wins [i ] += 1
365
- elif comparison .winner == "B" :
366
- wins [j ] += 1
367
- else : # tie
368
- wins [i ] += 0.5
369
- wins [j ] += 0.5
370
-
371
- # Sort by wins
372
- ranked_indices = sorted (range (n ), key = lambda i : wins [i ], reverse = True )
373
-
374
- ranked_results = []
375
- for rank , idx in enumerate (ranked_indices ):
376
- ranked_results .append ({"response_index" : idx , "response" : responses [idx ], "score" : wins [idx ] / (n - 1 ), "wins" : wins [idx ], "rank" : rank + 1 })
377
-
378
- # Calculate consistency (simplified)
379
- consistency = 1.0 - (sum (abs (wins [i ] - wins [j ]) for i in range (n ) for j in range (i + 1 , n )) / (n * (n - 1 ) / 2 )) / n
380
-
381
- return RankingResult (rankings = ranked_results , consistency_score = max (0.0 , consistency ), reasoning = "Ranked by tournament-style pairwise comparisons" )
382
-
383
- async def _rank_by_round_robin (self , responses : List [str ], criteria : List [EvaluationCriteria ], context : Optional [str ] = None ) -> RankingResult :
384
- """Rank using round-robin pairwise comparisons.
385
-
386
- Args:
387
- responses: List of response strings to rank
388
- criteria: Evaluation criteria to use for comparisons
389
- context: Optional context for evaluation
390
-
391
- Returns:
392
- RankingResult containing ranked responses based on round-robin wins
393
- """
394
- # For now, implement same as tournament
395
- return await self ._rank_by_tournament (responses , criteria , context )
167
+ return await self ._base_rank_responses (responses , criteria , context , ranking_method )
396
168
397
169
async def evaluate_with_reference (
398
170
self ,
@@ -406,62 +178,10 @@ async def evaluate_with_reference(
406
178
Args:
407
179
response: Response text to evaluate
408
180
reference: Reference text to compare against
409
- evaluation_type: Type of evaluation ("factuality", "completeness", "style_match")
410
- tolerance: Tolerance level for evaluation ("strict", "moderate", "lenient")
181
+ evaluation_type: Type of evaluation
182
+ tolerance: Tolerance level for evaluation
411
183
412
184
Returns:
413
185
ReferenceEvaluationResult containing score and analysis
414
186
"""
415
-
416
- type_descriptions = {
417
- "factuality" : "Compare the factual accuracy and correctness of information" ,
418
- "completeness" : "Assess how completely the response covers the reference content" ,
419
- "style_match" : "Evaluate how well the writing style and tone match the reference" ,
420
- }
421
-
422
- tolerance_descriptions = {
423
- "strict" : "Require exact matches and perfect alignment" ,
424
- "moderate" : "Allow reasonable variations while maintaining core accuracy" ,
425
- "loose" : "Accept substantial variations as long as general meaning is preserved" ,
426
- }
427
-
428
- prompt = f"""You are an expert evaluator. Compare the following response against the reference and evaluate based on { evaluation_type } .
429
-
430
- REFERENCE (Gold Standard):
431
- { reference }
432
-
433
- RESPONSE TO EVALUATE:
434
- { response }
435
-
436
- EVALUATION TYPE: { type_descriptions .get (evaluation_type , evaluation_type )}
437
- TOLERANCE LEVEL: { tolerance_descriptions .get (tolerance , tolerance )}
438
-
439
- Please provide your evaluation in the following JSON format:
440
- {{
441
- "similarity_score": overall_similarity_0_to_1,
442
- "missing_elements": ["element1", "element2", ...],
443
- "extra_elements": ["element1", "element2", ...],
444
- "factual_errors": ["error1", "error2", ...],
445
- "reasoning": "detailed comparison reasoning"
446
- }}"""
447
-
448
- messages = [{"role" : "system" , "content" : "You are a professional evaluation expert. Provide thorough, accurate assessments against reference standards." }, {"role" : "user" , "content" : prompt }]
449
-
450
- response_text = await self ._make_api_call (messages )
451
-
452
- try :
453
- json_start = response_text .find ("{" )
454
- json_end = response_text .rfind ("}" ) + 1
455
- json_text = response_text [json_start :json_end ]
456
- result_data = json .loads (json_text )
457
-
458
- return ReferenceEvaluationResult (
459
- similarity_score = result_data .get ("similarity_score" , 0.5 ),
460
- missing_elements = result_data .get ("missing_elements" , []),
461
- extra_elements = result_data .get ("extra_elements" , []),
462
- factual_errors = result_data .get ("factual_errors" , []),
463
- reasoning = result_data .get ("reasoning" , "" ),
464
- )
465
-
466
- except (json .JSONDecodeError , KeyError ) as e :
467
- return ReferenceEvaluationResult (similarity_score = 0.5 , missing_elements = [], extra_elements = [], factual_errors = [], reasoning = f"Error parsing judge response: { str (e )} " )
187
+ return await self ._base_reference_evaluation (response , reference , evaluation_type , tolerance )
0 commit comments