diff --git a/src/metacoder/coders/base_coder.py b/src/metacoder/coders/base_coder.py index b44c6ec..f2fa7d9 100644 --- a/src/metacoder/coders/base_coder.py +++ b/src/metacoder/coders/base_coder.py @@ -173,11 +173,15 @@ def run_process( """ if env is None: env = self.expand_env(self.env) + + # Decode the child process output as UTF-8 (instead of default encoding) process = subprocess.Popen( command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, + encoding="utf-8", + errors="replace", # avoid crashes on the occasional bad byte env=env, bufsize=1, universal_newlines=True, @@ -189,6 +193,15 @@ def run_process( # check verbosity level quiet_mode = logger.getEffectiveLevel() <= logging.INFO + # Ensure our own stdout/stderr won't choke on non-ASCII (Windows consoles often do). + for s in (sys.stdout, sys.stderr): + try: + s.reconfigure(encoding="utf-8", errors="replace") # Python 3.7+ + except Exception as e: + logger.info(f"{e}") + pass # OK if not available (e.g., redirected or older Python) + + # lines are already str decoded as UTF-8 def stream_output(pipe, output_lines, stream): for line in iter(pipe.readline, ""): if not quiet_mode: diff --git a/src/metacoder/coders/claude.py b/src/metacoder/coders/claude.py index ee31b74..1a43295 100644 --- a/src/metacoder/coders/claude.py +++ b/src/metacoder/coders/claude.py @@ -246,7 +246,7 @@ def parse_jsonl_line(text: str) -> dict[str, Any]: ao.tool_uses = tool_uses end_time = time.time() - logger.info(f"πŸ€– Command took {end_time - start_time} seconds") + logger.info(f"πŸ€– Command took {end_time - start_time:.2f} seconds") ao.total_cost_usd = total_cost_usd ao.success = not is_error if not ao.success: diff --git a/src/metacoder/coders/codex.py b/src/metacoder/coders/codex.py index 8e9169e..3451ebe 100644 --- a/src/metacoder/coders/codex.py +++ b/src/metacoder/coders/codex.py @@ -115,7 +115,7 @@ def run(self, input_text: str) -> CoderOutput: if "result" in message: ao.result_text = message["result"] end_time = time.time() - print(f"πŸ€– Command took {end_time - start_time} seconds") + print(f"πŸ€– Command took {end_time - start_time:.2f} seconds") ao.total_cost_usd = total_cost_usd ao.success = not is_error if not ao.success: diff --git a/src/metacoder/coders/gemini.py b/src/metacoder/coders/gemini.py index 20564a9..6af35c4 100644 --- a/src/metacoder/coders/gemini.py +++ b/src/metacoder/coders/gemini.py @@ -156,7 +156,7 @@ def run(self, input_text: str) -> CoderOutput: ) end_time = time.time() - logger.info(f"πŸ’Ž Command took {end_time - start_time} seconds") + logger.info(f"πŸ’Ž Command took {end_time - start_time:.2f} seconds") # Parse the output ao = CoderOutput(stdout=result.stdout, stderr=result.stderr) diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py index 514dc2b..6b0b5c0 100644 --- a/src/metacoder/coders/goose.py +++ b/src/metacoder/coders/goose.py @@ -156,7 +156,7 @@ def run(self, input_text: str) -> CoderOutput: result = self.run_process(command, env) end_time = time.time() ao = CoderOutput(stdout=result.stdout, stderr=result.stderr) - logger.info(f"πŸ¦† Command took {end_time - start_time} seconds") + logger.info(f"πŸ¦† Command took {end_time - start_time:.2f} seconds") # look in output text for a file like: logging to ./.local/share/goose/sessions/20250613_120403.jsonl session_file: Optional[Path] = None for line in result.stdout.split("\n"): @@ -165,7 +165,7 @@ def run(self, input_text: str) -> CoderOutput: session_file = Path(session_file_str) break if session_file and session_file.exists(): - 
with open(session_file, "r") as f: + with open(session_file, "r", encoding="utf-8") as f: ao.structured_messages = [ json.loads(line) for line in f if line.strip() ] diff --git a/src/metacoder/coders/qwen.py b/src/metacoder/coders/qwen.py index 43aefb6..b6f4080 100644 --- a/src/metacoder/coders/qwen.py +++ b/src/metacoder/coders/qwen.py @@ -90,7 +90,7 @@ def run(self, input_text: str) -> CoderOutput: ) end_time = time.time() - print(f"πŸ€– Command took {end_time - start_time} seconds") + print(f"πŸ€– Command took {end_time - start_time:.2f} seconds") # Create output - Qwen CLI doesn't provide structured output ao = CoderOutput( diff --git a/src/metacoder/evals/judges.py b/src/metacoder/evals/judges.py new file mode 100644 index 0000000..24b4277 --- /dev/null +++ b/src/metacoder/evals/judges.py @@ -0,0 +1,93 @@ +# metacoder/evals/judges.py +import logging +import os + +from anthropic import Anthropic +from anthropic.types import MessageParam, TextBlockParam, TextBlock + +from deepeval.models.base_model import DeepEvalBaseLLM + +logger = logging.getLogger(__name__) + + +class ClaudeJudge(DeepEvalBaseLLM): + """ + Wraps Anthropic's Claude models so they can be used as + the `model` parameter to DeepEval metrics like GEval. + """ + + # Note: Anthropic models can be listed via: + # curl https://api.anthropic.com/v1/models --header "x-api-key: %ANTHROPIC_API_KEY%" --header "anthropic-version: 2023-06-01" + # {"data": [{"type": "model", "id": "claude-opus-4-1-20250805", "display_name": "Claude Opus 4.1", "created_at": "2025-08-05T00:00:00Z"}, ... ]} + # Current list (September 3, 2025): + # claude-opus-4-1-20250805, claude-opus-4-20250514, claude-sonnet-4-20250514, claude-3-7-sonnet-20250219, + # claude-3-5-sonnet-20241022, claude-3-5-haiku-20241022, claude-3-5-sonnet-20240620, claude-3-haiku-20240307, + # claude-3-opus-20240229 + + def __init__( + self, + model_name: str = "claude-sonnet-4-20250514", + max_tokens: int = 1024, + temperature: float = 0.0, + ): + super().__init__() + api_key = os.getenv("ANTHROPIC_API_KEY") + if not api_key: + raise Exception("ANTHROPIC_API_KEY is not set in environment") + self.client = Anthropic(api_key=api_key) + self.model_name = model_name + self.max_tokens = max_tokens + self.temperature = temperature + + def load_model(self): + return self + + def generate(self, prompt: str) -> str: + # Build typed content blocks and messages to satisfy the SDK's type hints + content: list[TextBlockParam] = [{"type": "text", "text": prompt}] + messages: list[MessageParam] = [{"role": "user", "content": content}] + resp = self.client.messages.create( + model=self.model_name, + max_tokens=self.max_tokens, + temperature=self.temperature, + messages=messages, + ) + # anthropic returns a list of content blocks; collect only the text blocks. + parts: list[str] = [] + for block in resp.content: + if isinstance(block, TextBlock): + parts.append(block.text) + return "".join(parts) + + async def a_generate(self, prompt: str) -> str: + # for now just call the sync path + return self.generate(prompt) + + def get_model_name(self) -> str: + return self.model_name + + def has_available_quota(self) -> bool: + """ + Try a very lightweight request to check if quota is available. + Returns True if quota exists, False if Anthropic responds with + quota-related errors. 
+ """ + try: + # Use a minimal "ping" request + content: list[TextBlockParam] = [{"type": "text", "text": "ping"}] + messages: list[MessageParam] = [{"role": "user", "content": content}] + self.client.messages.create( + model=self.model_name, + max_tokens=1, # cheapest possible + temperature=0.0, + messages=messages, + ) + return True + except Exception as e: + msg = str(e).lower() + # Check for insufficient quota: + # 400 Bad Request. Message: Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits. + if "credit balance is too low" in msg or "400" in msg: + logger.warning(f"ClaudeJudge quota check failed: {e}") + return False + raise diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index 67a9619..f1c7126 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -5,27 +5,32 @@ """ import copy +import functools import importlib import logging +import os import time +import traceback from pathlib import Path -from typing import Any, Dict, List, Optional, Type +from typing import Any, Dict, List, Optional, Type, cast from pydantic import BaseModel import yaml + from deepeval import evaluate -from deepeval.metrics import BaseMetric -from deepeval.test_case import LLMTestCase -from deepeval.metrics import GEval -from deepeval.test_case import LLMTestCaseParams +from deepeval.evaluate import AsyncConfig, DisplayConfig, CacheConfig, ErrorConfig +from deepeval.models import DeepEvalBaseLLM +from deepeval.metrics import BaseMetric, GEval +from deepeval.test_case import LLMTestCase, LLMTestCaseParams +from openai import APIStatusError +from openai.types.chat import ChatCompletionMessageParam from metacoder.coders.base_coder import BaseCoder, CoderOutput from metacoder.registry import AVAILABLE_CODERS from metacoder.evals.eval_model import EvalCase, EvalDataset from metacoder.configuration import AIModelConfig, CoderConfig - logger = logging.getLogger(__name__) @@ -59,24 +64,34 @@ def is_successful(self) -> bool: return self.success -def get_default_metrics() -> Dict[str, BaseMetric]: - """Get default metrics. Creates instances lazily to avoid network calls during import.""" +def make_geval(model: Optional[DeepEvalBaseLLM] = None) -> GEval: + """Creates a GEval instance with the specified model.""" + return GEval( + name="Correctness", + criteria="Determine whether the actual output is factually correct based on the expected output.", + # NOTE: you can only provide either criteria or evaluation_steps, and not both + evaluation_steps=[ + "Check whether the facts in 'actual output' contradicts any facts in 'expected output'", + "You should also heavily penalize omission of detail", + "Vague language, or contradicting OPINIONS, are OK", + ], + threshold=0.8, + evaluation_params=[ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, + ], + model=model, # may be None (defaults to OpenAI) or a Claude judge + ) + + +def get_default_metrics( + model: Optional[DeepEvalBaseLLM] = None, +) -> Dict[str, BaseMetric]: + """Get default metrics with the specified model. 
Creates instances lazily to avoid network calls during import.""" return { - "CorrectnessMetric": GEval( - name="Correctness", - criteria="Determine whether the actual output is factually correct based on the expected output.", - # NOTE: you can only provide either criteria or evaluation_steps, and not both - evaluation_steps=[ - "Check whether the facts in 'actual output' contradicts any facts in 'expected output'", - "You should also heavily penalize omission of detail", - "Vague language, or contradicting OPINIONS, are OK", - ], - threshold=0.8, - evaluation_params=[ - LLMTestCaseParams.INPUT, - LLMTestCaseParams.ACTUAL_OUTPUT, - LLMTestCaseParams.EXPECTED_OUTPUT, - ], + "CorrectnessMetric": make_geval( + model=model # Note: GEval defaults to OpenAI if no model is specified. ), "DummyMetric": DummyMetric(threshold=0.5), } @@ -123,6 +138,8 @@ class EvalRunner: def __init__(self, verbose: bool = False): self.verbose = verbose + self.use_openai = True # GEval will default to OpenAI, avoid it and downgrade to another provider or metric if quota runs out. + if verbose: logging.basicConfig(level=logging.DEBUG) else: @@ -183,6 +200,48 @@ def create_test_case(self, case: EvalCase, actual_output: str) -> LLMTestCase: additional_metadata=case.additional_metadata, ) + @functools.lru_cache(maxsize=1) + def _openai_quota_ok(self, model: str = "gpt-4o-mini") -> bool: + if not os.getenv("OPENAI_API_KEY"): + logger.info("OPENAI_API_KEY is not set.") + return False + """ + Preflight: detect β€œno OpenAI quota” and skip/redirect before calling evaluate. + Fast probe of the /chat/completions endpoint (the one GEval uses). + Returns False on 429 (insufficient_quota) or any exception. + """ + try: + from openai import OpenAI + + # turn off SDK retries for the check so it returns fast + client = OpenAI(max_retries=0, timeout=8) # NO retries, quick fail + # messages = cast(List[ChatCompletionMessageParam], [{"role": "user", "content": "ping"}]) + raw = [{"role": "user", "content": "ping"}] + messages = cast(List[ChatCompletionMessageParam], raw) + client.chat.completions.create( + model=model, + messages=messages, + max_tokens=1, + temperature=0, + ) + return True + except APIStatusError as e: + # 429 insufficient quota or too many requests + if e.status_code == 429: + logger.warning(f"OpenAI API Key has insufficient quota: {e}") + return False + # 401 authentication problem, including invalid API key + if e.status_code == 401: + logger.warning(f"OpenAI API Authentication Error: {e}") + return False + # all other errors + logger.warning(f"OpenAI API Status Error; treating as no-quota: {e}") + return False + except Exception as e: + # includes network issues, etc. + logger.warning(f"OpenAI preflight failed; treating as no-quota: {e}") + return False + def run_single_eval( self, model_name: str, @@ -235,8 +294,65 @@ def run_single_eval( test_case = self.create_test_case(case, actual_output) # Evaluate - logger.info(f"Evaluating with {metric_name}") - eval_results = evaluate([test_case], [metric]) + logger.info( + f"Evaluating {metric_name} using model {metric.model.model_name}" + ) + + if isinstance(metric, GEval): + # Assume GEval will use OpenAI until is disabled. + if self.use_openai and not self._openai_quota_ok(): + logger.warning( + "OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval." + ) + self.use_openai = False + + # Note: This will downgrade a metric if needed each time it is about to be used without modifying the default metrics. 
+ if not self.use_openai: + claude_model = "claude-3-5-sonnet-20240620" + logger.warning( + f"Downgrading {metric_name} model from {metric.model.model_name} to {claude_model}." + ) + + try: + # Downgrade metric model to Claude judge. + from metacoder.evals.judges import ClaudeJudge + + judge = ClaudeJudge(claude_model) + + if not judge.has_available_quota(): + raise Exception( + "No Anthropic credits available for ClaudeJudge." + ) + + metric = make_geval(model=judge) + logger.info( + f"Successfully downgraded {metric_name} model to {metric.model.model_name}." + ) + except Exception as e: + # Fallback: if you can't use Claude, downgrade gracefully. + logging.debug(traceback.format_exc()) + logger.debug(e) + logger.warning( + f"Claude unavailable ({e}); downgrading {metric_name} to DummyMetric." + ) + metric = DummyMetric(threshold=0.5) + logger.warning(f"Downgraded {metric_name} to {metric.name}.") + + eval_results = evaluate( + [test_case], + [metric], + async_config=AsyncConfig(run_async=False), # disable async + display_config=DisplayConfig( + show_indicator=False, # hide the progress meter + print_results=False, + verbose_mode=self.verbose, + ), + cache_config=CacheConfig(use_cache=False, write_cache=False), + error_config=ErrorConfig( + ignore_errors=False, # actually fail on failure + skip_on_missing_params=True, + ), + ) # Extract results - the structure varies by deepeval version test_result = eval_results.test_results[0] @@ -408,7 +524,7 @@ def save_results(self, results: List[EvalResult], output_path: Path): results_data.append(result.model_dump()) # Save as YAML - with open(output_path, "w") as f: + with open(output_path, "w", encoding="utf-8") as f: yaml.dump( {"results": results_data, "summary": self.generate_summary(results)}, f, diff --git a/src/metacoder/metacoder.py b/src/metacoder/metacoder.py index f62d3df..5e1d616 100644 --- a/src/metacoder/metacoder.py +++ b/src/metacoder/metacoder.py @@ -1,4 +1,5 @@ import logging +import sys from pathlib import Path from typing import Optional, Union @@ -543,6 +544,17 @@ def eval_command(config: str, output: str, workdir: str, coders: tuple, verbose: output_path = Path(output) workdir_path = Path(workdir) + try: + # Create the output file only if it doesn't exist; fail if it does + with output_path.open("x", encoding="utf-8") as _: + pass + except FileExistsError: + print( + f"Error: '{output_path}' already exists. 
Please delete it or specify a different filename.", + file=sys.stderr, + ) + sys.exit(1) + # Convert coders tuple to list (empty tuple if not specified) coders_list = list(coders) if coders else None @@ -592,37 +604,43 @@ def eval_command(config: str, output: str, workdir: str, coders: tuple, verbose: # Print summary summary = runner.generate_summary(results) - click.echo("\nπŸ“ˆ Summary:") - click.echo(f" Total: {summary['total_evaluations']}") - click.echo( - f" Passed: {summary['passed']} ({summary['passed'] / summary['total_evaluations'] * 100:.1f}%)" + frac_passed = ( + summary["passed"] / summary["total_evaluations"] + if summary["total_evaluations"] + else 0 ) - click.echo( - f" Failed: {summary['failed']} ({summary['failed'] / summary['total_evaluations'] * 100:.1f}%)" + frac_failed = ( + summary["failed"] / summary["total_evaluations"] + if summary["total_evaluations"] + else 0 ) - if summary["errors"] > 0: - click.echo(f" Errors: {summary['errors']} ⚠️") + + click.echo("\nπŸ“ˆ Summary:") + click.echo(f" Total: {summary['total_evaluations']}") + click.echo(f" Passed: {summary['passed']} ({frac_passed:.1%})") + click.echo(f" Failed: {summary['failed']} ({frac_failed:.1%})") + click.echo(f" Errors: {summary['errors']} ⚠️") if summary["errors"] else None # Print by-coder summary if len(summary["by_coder"]) > 1: click.echo("\n By Coder:") for coder, stats in summary["by_coder"].items(): - pass_rate = ( - stats["passed"] / stats["total"] * 100 if stats["total"] > 0 else 0 + coder_frac_passed = ( + stats["passed"] / stats["total"] if stats["total"] else 0 ) click.echo( - f" {coder}: {stats['passed']}/{stats['total']} ({pass_rate:.1f}%)" + f" {coder}: {stats['passed']} / {stats['total']} ({coder_frac_passed:.1%})" ) # Print by-model summary if len(summary["by_model"]) > 1: click.echo("\n By Model:") for model, stats in summary["by_model"].items(): - pass_rate = ( - stats["passed"] / stats["total"] * 100 if stats["total"] > 0 else 0 + model_frac_passed = ( + stats["passed"] / stats["total"] if stats["total"] else 0 ) click.echo( - f" {model}: {stats['passed']}/{stats['total']} ({pass_rate:.1f}%)" + f" {model}: {stats['passed']} / {stats['total']} ({model_frac_passed:.1%})" ) click.echo("\nβœ… Evaluation complete!") diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..95f4c37 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,10 @@ +import logging +import sys + + +def pytest_configure(config): + logging.basicConfig( + level=logging.WARNING, + format="\n%(asctime)s [%(levelname)s] %(name)s: %(message)s", + stream=sys.stdout, + ) diff --git a/tests/input/goose_eval_claude_downgrade_test.yaml b/tests/input/goose_eval_claude_downgrade_test.yaml new file mode 100644 index 0000000..6f0eb31 --- /dev/null +++ b/tests/input/goose_eval_claude_downgrade_test.yaml @@ -0,0 +1,30 @@ +name: pubmed tools evals +description: | + Evaluations for multiple pubmed MCPs + +coders: + goose: {} + +models: + claude-sonnet: + provider: anthropic + name: claude-sonnet-4-20250514 + +servers: + mcp-simple-pubmed: + name: pubmed + command: uvx + args: [mcp-simple-pubmed] + env: + PUBMED_EMAIL: ctparker@lbl.gov + +server_combinations: + - [mcp-simple-pubmed] + +cases: +- name: PMID_28027860_Full_Text + metrics: [CorrectnessMetric] + input: "What is the first sentence of section 2 in PMID: 28027860?" + expected_output: | + Even though many of NFLE's core features have been clarified in the last two decades, some critical issues remain controversial." 
+ threshold: 0.9 diff --git a/tests/input/goose_eval_test.yaml b/tests/input/goose_eval_test.yaml index 1037215..f41e249 100644 --- a/tests/input/goose_eval_test.yaml +++ b/tests/input/goose_eval_test.yaml @@ -7,7 +7,7 @@ coders: goose: {} models: - gpt-4o: + claude-sonnet: provider: anthropic name: claude-sonnet-4-20250514 @@ -34,6 +34,9 @@ cases: MONDO:0011694 (spinocerebellar ataxia type 15/16, aka SCA15) MONDO:0007298 (spinocerebellar ataxia type 29, aka SCA29) MONDO:0008795 (aniridia-cerebellar ataxia-intellectual disability syndrome; aka Gillespie syndrome) - threshold: 0.7 - + - name: character_encoding_test + metrics: [CorrectnessMetric] + input: Based on PMID 33926573 do microbes from alkaline sulphidic tailings show oxidative stresses? + expected_output: 'The paper says No but it is retracted so the results should not be trusted.' + threshold: 0.9 diff --git a/tests/input/goose_no_server_test.yaml b/tests/input/goose_no_server_test.yaml new file mode 100644 index 0000000..2dc5551 --- /dev/null +++ b/tests/input/goose_no_server_test.yaml @@ -0,0 +1,30 @@ +name: pubmed tools evals +description: | + Evaluations for multiple pubmed MCPs + +coders: + goose: {} + +models: + claude-sonnet: + provider: anthropic + name: claude-sonnet-4-20250514 + +servers: + mcp-simple-pubmed: + name: pubmed + command: uvx + args: [mcp-simple-pubmed] + env: + PUBMED_EMAIL: ctparker@lbl.gov + +#server_combinations: +# - [mcp-simple-pubmed] + +cases: +- name: PMID_28027860_Full_Text + metrics: [CorrectnessMetric] + input: "What is the first sentence of section 2 in PMID: 28027860?" + expected_output: | + Even though many of NFLE's core features have been clarified in the last two decades, some critical issues remain controversial." + threshold: 0.9 diff --git a/tests/input/literature_mcp_encoding_test.yaml b/tests/input/literature_mcp_encoding_test.yaml new file mode 100644 index 0000000..d0fea1b --- /dev/null +++ b/tests/input/literature_mcp_encoding_test.yaml @@ -0,0 +1,29 @@ +name: pubmed tools evals +description: | + Evaluations for multiple pubmed MCPs + + +coders: + goose: {} + +models: + claude-sonnet: + provider: anthropic + name: claude-sonnet-4-20250514 + +servers: + ols: + name: ols + command: uvx + args: [mcp-ols] + +server_combinations: + - [simple-pubmed] + +cases: +- name: character_encoding_test + metrics: + - CorrectnessMetric + input: Based on PMID 33926573 do microbes from alkaline sulphidic tailings show oxidative stresses? + expected_output: 'The paper says No but it is retracted so the results should not be trusted.' + threshold: 0.9 diff --git a/tests/test_coders/test_coders_basic.py b/tests/test_coders/test_coders_basic.py index a9498b6..5d9daf1 100644 --- a/tests/test_coders/test_coders_basic.py +++ b/tests/test_coders/test_coders_basic.py @@ -3,6 +3,7 @@ These tests check that each coder can handle a simple arithmetic question. """ +import json import tempfile import pytest @@ -164,3 +165,16 @@ def test_dummy_coder_always_works(): assert result is not None assert result.result_text == "you said: Hello, world!" assert result.stdout == "you said: Hello, world!" 
+ + +@pytest.mark.integration +def test_goose_utf8_session_file(tmp_path): + """Test session files with UTF-8 content are read correctly.""" + session_content = '{"role": "assistant", "content": "ζ΅‹θ―• rΓ©sumΓ© πŸš€"}\n' + session_file = tmp_path / "test_session.jsonl" + session_file.write_text(session_content, encoding="utf-8") + + with open(session_file, "r", encoding="utf-8") as f: + messages = [json.loads(line) for line in f if line.strip()] + assert len(messages) == 1 + assert "ζ΅‹θ―•" in messages[0]["content"] diff --git a/tests/test_evals/test_claude_judge.py b/tests/test_evals/test_claude_judge.py new file mode 100644 index 0000000..9ed23a6 --- /dev/null +++ b/tests/test_evals/test_claude_judge.py @@ -0,0 +1,139 @@ +import logging +import traceback +from pathlib import Path + +from metacoder.evals.runner import EvalRunner + +logger = logging.getLogger(__name__) + + +def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): + """Test that ClaudeJudge is used when OpenAI is disabled.""" + # TODO: This test should avoid running the coder and only perform the eval step. + # Otherwise, it is impossible to get to the eval step if no valid API key is present or no quota is available (testing the wrong part of the process). + + runner = EvalRunner() + + try: + dataset = runner.load_dataset( + Path("tests/input/goose_eval_claude_downgrade_test.yaml") + ) + + # Unfortunately, there is nothing available in the eval results that indicate which model DeepEval used. + # One enhancement might be to introduce metric_model=claude-3-5-sonnet-20240620 to each result at eval time. + # Instead, resort to capturing the WARNING logs for assertions related to the downgrade. + with caplog.at_level(logging.WARNING): + # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail. + # (no need to reset, `monkeypatch` automatically reverts after the test) + monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key-for-testing") + + results = runner.run_all_evals( + dataset, workdir=tmp_path, coders=["goose", "dummy"] + ) + + # Test that the quota exhaustion fallback logic worked as expected. + assert ( + "OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval." + in caplog.text + ) + + # Test that the new evaluation judge was correctly selected for the metric model downgrade. + assert ( + "Downgrading CorrectnessMetric model from gpt-4.1 to claude-3-5-sonnet-20240620." + in caplog.text + ) + + # Test that the eval completed by checking for a non-zero score. + assert results[0].score > 0, ( + f"Expected a {results[0].metric_name} score for {results[0].case_name}." + ) + + except Exception as e: + # Test that fallback logic does not result in an Exception. + logger.error(f"An error occurred: {e}") + logging.error(traceback.format_exc()) + assert False # This assertion will fail if an Exception is caught here. + finally: + pass + + +def test_correctnessmetric_downgrade_success(tmp_path, caplog, monkeypatch): + """Test that the CorrectnessMatric is successfully downgraded to DummyMetric if no model is available.""" + + runner = EvalRunner() + + try: + dataset = runner.load_dataset( + Path("tests/input/goose_eval_claude_downgrade_test.yaml") + ) + + # Unfortunately, there is nothing available in the eval results that indicate which model DeepEval used. + # One enhancement might be to introduce metric_model=claude-3-5-sonnet-20240620 to each result at eval time. + # Instead, resort to capturing the WARNING logs for assertions related to the downgrade. 
+        with caplog.at_level(logging.WARNING):
+            # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail.
+            # (no need to reset, `monkeypatch` automatically reverts after the test)
+            monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key-for-testing")
+
+            # Delete the Anthropic API key from the environment to force ClaudeJudge instantiation to fail.
+            # (no need to reset, `monkeypatch` automatically reverts after the test)
+            monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
+
+            # One more OpenAI API test case still needs to be handled (401 errors):
+            # temporarily remove OPENAI_API_KEY so the OpenAI preflight fails.
+            # monkeypatch.delenv("OPENAI_API_KEY", raising=False)
+
+            # One more Anthropic API test case still needs to be handled (401 errors):
+            # temporarily set an invalid ANTHROPIC_API_KEY in order to force ClaudeJudge to fail.
+            # monkeypatch.setenv("ANTHROPIC_API_KEY", "fake-api-key-for-testing")
+
+            # TODO: Also cover the Anthropic "insufficient credits" case, which surfaces as:
+            #   Request failed with status: 400 Bad Request. Message: Your credit balance is too low
+            #   to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits.
+            #   (type: invalid_request_error, request_id: req_011CSeQZTjJvmcxzrhXuPES4)
+
+            results = runner.run_all_evals(dataset, workdir=tmp_path, coders=["dummy"])
+
+            # Test that the quota exhaustion fallback logic worked as expected.
+            assert (
+                "OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval."
+                in caplog.text
+            )
+
+            # Test that the new evaluation judge was correctly selected for the metric model downgrade.
+            assert (
+                "Downgrading CorrectnessMetric model from gpt-4.1 to claude-3-5-sonnet-20240620."
+                in caplog.text
+            )
+
+            # Test that the ClaudeJudge was unable to be used as the model for the CorrectnessMetric.
+            assert (
+                "Claude unavailable (ANTHROPIC_API_KEY is not set in environment); downgrading CorrectnessMetric to DummyMetric."
+                in caplog.text
+            )
+
+            # Test that the CorrectnessMetric was successfully downgraded to DummyMetric.
+            assert "Downgraded CorrectnessMetric to DummyMetric." in caplog.text
+
+            # Test that the eval completed by checking for a non-zero score.
+            assert results[0].score > 0, (
+                f"Expected a {results[0].metric_name} score for {results[0].case_name}."
+            )
+
+    except Exception as e:
+        # Test that fallback logic does not result in an Exception.
+        logger.error(f"An error occurred: {e}")
+        logging.error(traceback.format_exc())
+        assert False  # This assertion will fail if an Exception is caught here.
+    finally:
+        pass
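
Reviewer note: below is a minimal, self-contained sketch (not part of the patch) of how the new ClaudeJudge from src/metacoder/evals/judges.py plugs into make_geval() from src/metacoder/evals/runner.py, mirroring the downgrade path in run_single_eval. It assumes ANTHROPIC_API_KEY is set and that the listed model id is still served; the test-case values are placeholders, not real eval data.

    from deepeval import evaluate
    from deepeval.test_case import LLMTestCase

    from metacoder.evals.judges import ClaudeJudge
    from metacoder.evals.runner import make_geval

    # Claude-backed judge for GEval; make_geval(model=None) would default to OpenAI.
    judge = ClaudeJudge(model_name="claude-3-5-sonnet-20240620")

    # Same preflight run_single_eval performs before swapping the metric model.
    if not judge.has_available_quota():
        raise SystemExit("No Anthropic credits available for ClaudeJudge.")

    metric = make_geval(model=judge)

    # Placeholder test case; EvalRunner.create_test_case() builds these from EvalCase entries.
    test_case = LLMTestCase(
        input="What is 2 + 2?",
        actual_output="2 + 2 = 4",
        expected_output="4",
    )
    evaluate([test_case], [metric])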