diff --git a/src/metacoder/coders/base_coder.py b/src/metacoder/coders/base_coder.py index b44c6ec..f2fa7d9 100644 --- a/src/metacoder/coders/base_coder.py +++ b/src/metacoder/coders/base_coder.py @@ -173,11 +173,15 @@ def run_process( """ if env is None: env = self.expand_env(self.env) + + # Decode the child process output as UTF-8 (instead of default encoding) process = subprocess.Popen( command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, + encoding="utf-8", + errors="replace", # avoid crashes on the occasional bad byte env=env, bufsize=1, universal_newlines=True, @@ -189,6 +193,15 @@ def run_process( # check verbosity level quiet_mode = logger.getEffectiveLevel() <= logging.INFO + # Ensure our own stdout/stderr won't choke on non-ASCII (Windows consoles often do). + for s in (sys.stdout, sys.stderr): + try: + s.reconfigure(encoding="utf-8", errors="replace") # Python 3.7+ + except Exception as e: + logger.info(f"{e}") + pass # OK if not available (e.g., redirected or older Python) + + # lines are already str decoded as UTF-8 def stream_output(pipe, output_lines, stream): for line in iter(pipe.readline, ""): if not quiet_mode: diff --git a/src/metacoder/coders/claude.py b/src/metacoder/coders/claude.py index ee31b74..1a43295 100644 --- a/src/metacoder/coders/claude.py +++ b/src/metacoder/coders/claude.py @@ -246,7 +246,7 @@ def parse_jsonl_line(text: str) -> dict[str, Any]: ao.tool_uses = tool_uses end_time = time.time() - logger.info(f"πŸ€– Command took {end_time - start_time} seconds") + logger.info(f"πŸ€– Command took {end_time - start_time:.2f} seconds") ao.total_cost_usd = total_cost_usd ao.success = not is_error if not ao.success: diff --git a/src/metacoder/coders/codex.py b/src/metacoder/coders/codex.py index 8e9169e..3451ebe 100644 --- a/src/metacoder/coders/codex.py +++ b/src/metacoder/coders/codex.py @@ -115,7 +115,7 @@ def run(self, input_text: str) -> CoderOutput: if "result" in message: ao.result_text = message["result"] end_time = time.time() - print(f"πŸ€– Command took {end_time - start_time} seconds") + print(f"πŸ€– Command took {end_time - start_time:.2f} seconds") ao.total_cost_usd = total_cost_usd ao.success = not is_error if not ao.success: diff --git a/src/metacoder/coders/gemini.py b/src/metacoder/coders/gemini.py index 20564a9..6af35c4 100644 --- a/src/metacoder/coders/gemini.py +++ b/src/metacoder/coders/gemini.py @@ -156,7 +156,7 @@ def run(self, input_text: str) -> CoderOutput: ) end_time = time.time() - logger.info(f"πŸ’Ž Command took {end_time - start_time} seconds") + logger.info(f"πŸ’Ž Command took {end_time - start_time:.2f} seconds") # Parse the output ao = CoderOutput(stdout=result.stdout, stderr=result.stderr) diff --git a/src/metacoder/coders/goose.py b/src/metacoder/coders/goose.py index 514dc2b..6b0b5c0 100644 --- a/src/metacoder/coders/goose.py +++ b/src/metacoder/coders/goose.py @@ -156,7 +156,7 @@ def run(self, input_text: str) -> CoderOutput: result = self.run_process(command, env) end_time = time.time() ao = CoderOutput(stdout=result.stdout, stderr=result.stderr) - logger.info(f"πŸ¦† Command took {end_time - start_time} seconds") + logger.info(f"πŸ¦† Command took {end_time - start_time:.2f} seconds") # look in output text for a file like: logging to ./.local/share/goose/sessions/20250613_120403.jsonl session_file: Optional[Path] = None for line in result.stdout.split("\n"): @@ -165,7 +165,7 @@ def run(self, input_text: str) -> CoderOutput: session_file = Path(session_file_str) break if session_file and session_file.exists(): - 
with open(session_file, "r") as f: + with open(session_file, "r", encoding="utf-8") as f: ao.structured_messages = [ json.loads(line) for line in f if line.strip() ] diff --git a/src/metacoder/coders/qwen.py b/src/metacoder/coders/qwen.py index 43aefb6..b6f4080 100644 --- a/src/metacoder/coders/qwen.py +++ b/src/metacoder/coders/qwen.py @@ -90,7 +90,7 @@ def run(self, input_text: str) -> CoderOutput: ) end_time = time.time() - print(f"πŸ€– Command took {end_time - start_time} seconds") + print(f"πŸ€– Command took {end_time - start_time:.2f} seconds") # Create output - Qwen CLI doesn't provide structured output ao = CoderOutput( diff --git a/src/metacoder/evals/judges.py b/src/metacoder/evals/judges.py new file mode 100644 index 0000000..24b4277 --- /dev/null +++ b/src/metacoder/evals/judges.py @@ -0,0 +1,93 @@ +# metacoder/evals/judges.py +import logging +import os + +from anthropic import Anthropic +from anthropic.types import MessageParam, TextBlockParam, TextBlock + +from deepeval.models.base_model import DeepEvalBaseLLM + +logger = logging.getLogger(__name__) + + +class ClaudeJudge(DeepEvalBaseLLM): + """ + Wraps Anthropic's Claude models so they can be used as + the `model` parameter to DeepEval metrics like GEval. + """ + + # Note: Anthropic models can be listed via: + # curl https://api.anthropic.com/v1/models --header "x-api-key: %ANTHROPIC_API_KEY%" --header "anthropic-version: 2023-06-01" + # {"data": [{"type": "model", "id": "claude-opus-4-1-20250805", "display_name": "Claude Opus 4.1", "created_at": "2025-08-05T00:00:00Z"}, ... ]} + # Current list (September 3, 2025): + # claude-opus-4-1-20250805, claude-opus-4-20250514, claude-sonnet-4-20250514, claude-3-7-sonnet-20250219, + # claude-3-5-sonnet-20241022, claude-3-5-haiku-20241022, claude-3-5-sonnet-20240620, claude-3-haiku-20240307, + # claude-3-opus-20240229 + + def __init__( + self, + model_name: str = "claude-sonnet-4-20250514", + max_tokens: int = 1024, + temperature: float = 0.0, + ): + super().__init__() + api_key = os.getenv("ANTHROPIC_API_KEY") + if not api_key: + raise Exception("ANTHROPIC_API_KEY is not set in environment") + self.client = Anthropic(api_key=api_key) + self.model_name = model_name + self.max_tokens = max_tokens + self.temperature = temperature + + def load_model(self): + return self + + def generate(self, prompt: str) -> str: + # Build typed content blocks and messages to satisfy the SDK's type hints + content: list[TextBlockParam] = [{"type": "text", "text": prompt}] + messages: list[MessageParam] = [{"role": "user", "content": content}] + resp = self.client.messages.create( + model=self.model_name, + max_tokens=self.max_tokens, + temperature=self.temperature, + messages=messages, + ) + # anthropic returns a list of content blocks; collect only the text blocks. + parts: list[str] = [] + for block in resp.content: + if isinstance(block, TextBlock): + parts.append(block.text) + return "".join(parts) + + async def a_generate(self, prompt: str) -> str: + # for now just call the sync path + return self.generate(prompt) + + def get_model_name(self) -> str: + return self.model_name + + def has_available_quota(self) -> bool: + """ + Try a very lightweight request to check if quota is available. + Returns True if quota exists, False if Anthropic responds with + quota-related errors. 
+ """ + try: + # Use a minimal "ping" request + content: list[TextBlockParam] = [{"type": "text", "text": "ping"}] + messages: list[MessageParam] = [{"role": "user", "content": content}] + self.client.messages.create( + model=self.model_name, + max_tokens=1, # cheapest possible + temperature=0.0, + messages=messages, + ) + return True + except Exception as e: + msg = str(e).lower() + # Check for insufficient quota: + # 400 Bad Request. Message: Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits. + if "credit balance is too low" in msg or "400" in msg: + logger.warning(f"ClaudeJudge quota check failed: {e}") + return False + raise diff --git a/src/metacoder/evals/runner.py b/src/metacoder/evals/runner.py index 67a9619..f1c7126 100644 --- a/src/metacoder/evals/runner.py +++ b/src/metacoder/evals/runner.py @@ -5,27 +5,32 @@ """ import copy +import functools import importlib import logging +import os import time +import traceback from pathlib import Path -from typing import Any, Dict, List, Optional, Type +from typing import Any, Dict, List, Optional, Type, cast from pydantic import BaseModel import yaml + from deepeval import evaluate -from deepeval.metrics import BaseMetric -from deepeval.test_case import LLMTestCase -from deepeval.metrics import GEval -from deepeval.test_case import LLMTestCaseParams +from deepeval.evaluate import AsyncConfig, DisplayConfig, CacheConfig, ErrorConfig +from deepeval.models import DeepEvalBaseLLM +from deepeval.metrics import BaseMetric, GEval +from deepeval.test_case import LLMTestCase, LLMTestCaseParams +from openai import APIStatusError +from openai.types.chat import ChatCompletionMessageParam from metacoder.coders.base_coder import BaseCoder, CoderOutput from metacoder.registry import AVAILABLE_CODERS from metacoder.evals.eval_model import EvalCase, EvalDataset from metacoder.configuration import AIModelConfig, CoderConfig - logger = logging.getLogger(__name__) @@ -59,24 +64,34 @@ def is_successful(self) -> bool: return self.success -def get_default_metrics() -> Dict[str, BaseMetric]: - """Get default metrics. Creates instances lazily to avoid network calls during import.""" +def make_geval(model: Optional[DeepEvalBaseLLM] = None) -> GEval: + """Creates a GEval instance with the specified model.""" + return GEval( + name="Correctness", + criteria="Determine whether the actual output is factually correct based on the expected output.", + # NOTE: you can only provide either criteria or evaluation_steps, and not both + evaluation_steps=[ + "Check whether the facts in 'actual output' contradicts any facts in 'expected output'", + "You should also heavily penalize omission of detail", + "Vague language, or contradicting OPINIONS, are OK", + ], + threshold=0.8, + evaluation_params=[ + LLMTestCaseParams.INPUT, + LLMTestCaseParams.ACTUAL_OUTPUT, + LLMTestCaseParams.EXPECTED_OUTPUT, + ], + model=model, # may be None (defaults to OpenAI) or a Claude judge + ) + + +def get_default_metrics( + model: Optional[DeepEvalBaseLLM] = None, +) -> Dict[str, BaseMetric]: + """Get default metrics with the specified model. 
Creates instances lazily to avoid network calls during import.""" return { - "CorrectnessMetric": GEval( - name="Correctness", - criteria="Determine whether the actual output is factually correct based on the expected output.", - # NOTE: you can only provide either criteria or evaluation_steps, and not both - evaluation_steps=[ - "Check whether the facts in 'actual output' contradicts any facts in 'expected output'", - "You should also heavily penalize omission of detail", - "Vague language, or contradicting OPINIONS, are OK", - ], - threshold=0.8, - evaluation_params=[ - LLMTestCaseParams.INPUT, - LLMTestCaseParams.ACTUAL_OUTPUT, - LLMTestCaseParams.EXPECTED_OUTPUT, - ], + "CorrectnessMetric": make_geval( + model=model # Note: GEval defaults to OpenAI if no model is specified. ), "DummyMetric": DummyMetric(threshold=0.5), } @@ -123,6 +138,8 @@ class EvalRunner: def __init__(self, verbose: bool = False): self.verbose = verbose + self.use_openai = True # GEval will default to OpenAI, avoid it and downgrade to another provider or metric if quota runs out. + if verbose: logging.basicConfig(level=logging.DEBUG) else: @@ -183,6 +200,48 @@ def create_test_case(self, case: EvalCase, actual_output: str) -> LLMTestCase: additional_metadata=case.additional_metadata, ) + @functools.lru_cache(maxsize=1) + def _openai_quota_ok(self, model: str = "gpt-4o-mini") -> bool: + if not os.getenv("OPENAI_API_KEY"): + logger.info("OPENAI_API_KEY is not set.") + return False + """ + Preflight: detect β€œno OpenAI quota” and skip/redirect before calling evaluate. + Fast probe of the /chat/completions endpoint (the one GEval uses). + Returns False on 429 (insufficient_quota) or any exception. + """ + try: + from openai import OpenAI + + # turn off SDK retries for the check so it returns fast + client = OpenAI(max_retries=0, timeout=8) # NO retries, quick fail + # messages = cast(List[ChatCompletionMessageParam], [{"role": "user", "content": "ping"}]) + raw = [{"role": "user", "content": "ping"}] + messages = cast(List[ChatCompletionMessageParam], raw) + client.chat.completions.create( + model=model, + messages=messages, + max_tokens=1, + temperature=0, + ) + return True + except APIStatusError as e: + # 429 insufficient quota or too many requests + if e.status_code == 429: + logger.warning(f"OpenAI API Key has insufficient quota: {e}") + return False + # 401 authentication problem, including invalid API key + if e.status_code == 401: + logger.warning(f"OpenAI API Authentication Error: {e}") + return False + # all other errors + logger.warning(f"OpenAI API Status Error; treating as no-quota: {e}") + return False + except Exception as e: + # includes network issues, etc. + logger.warning(f"OpenAI preflight failed; treating as no-quota: {e}") + return False + def run_single_eval( self, model_name: str, @@ -235,8 +294,65 @@ def run_single_eval( test_case = self.create_test_case(case, actual_output) # Evaluate - logger.info(f"Evaluating with {metric_name}") - eval_results = evaluate([test_case], [metric]) + logger.info( + f"Evaluating {metric_name} using model {metric.model.model_name}" + ) + + if isinstance(metric, GEval): + # Assume GEval will use OpenAI until is disabled. + if self.use_openai and not self._openai_quota_ok(): + logger.warning( + "OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval." + ) + self.use_openai = False + + # Note: This will downgrade a metric if needed each time it is about to be used without modifying the default metrics. 
+ if not self.use_openai: + claude_model = "claude-3-5-sonnet-20240620" + logger.warning( + f"Downgrading {metric_name} model from {metric.model.model_name} to {claude_model}." + ) + + try: + # Downgrade metric model to Claude judge. + from metacoder.evals.judges import ClaudeJudge + + judge = ClaudeJudge(claude_model) + + if not judge.has_available_quota(): + raise Exception( + "No Anthropic credits available for ClaudeJudge." + ) + + metric = make_geval(model=judge) + logger.info( + f"Successfully downgraded {metric_name} model to {metric.model.model_name}." + ) + except Exception as e: + # Fallback: if you can't use Claude, downgrade gracefully. + logging.debug(traceback.format_exc()) + logger.debug(e) + logger.warning( + f"Claude unavailable ({e}); downgrading {metric_name} to DummyMetric." + ) + metric = DummyMetric(threshold=0.5) + logger.warning(f"Downgraded {metric_name} to {metric.name}.") + + eval_results = evaluate( + [test_case], + [metric], + async_config=AsyncConfig(run_async=False), # disable async + display_config=DisplayConfig( + show_indicator=False, # hide the progress meter + print_results=False, + verbose_mode=self.verbose, + ), + cache_config=CacheConfig(use_cache=False, write_cache=False), + error_config=ErrorConfig( + ignore_errors=False, # actually fail on failure + skip_on_missing_params=True, + ), + ) # Extract results - the structure varies by deepeval version test_result = eval_results.test_results[0] @@ -408,7 +524,7 @@ def save_results(self, results: List[EvalResult], output_path: Path): results_data.append(result.model_dump()) # Save as YAML - with open(output_path, "w") as f: + with open(output_path, "w", encoding="utf-8") as f: yaml.dump( {"results": results_data, "summary": self.generate_summary(results)}, f, diff --git a/src/metacoder/metacoder.py b/src/metacoder/metacoder.py index f62d3df..5e1d616 100644 --- a/src/metacoder/metacoder.py +++ b/src/metacoder/metacoder.py @@ -1,4 +1,5 @@ import logging +import sys from pathlib import Path from typing import Optional, Union @@ -543,6 +544,17 @@ def eval_command(config: str, output: str, workdir: str, coders: tuple, verbose: output_path = Path(output) workdir_path = Path(workdir) + try: + # Create the output file only if it doesn't exist; fail if it does + with output_path.open("x", encoding="utf-8") as _: + pass + except FileExistsError: + print( + f"Error: '{output_path}' already exists. 
Please delete it or specify a different filename.", + file=sys.stderr, + ) + sys.exit(1) + # Convert coders tuple to list (empty tuple if not specified) coders_list = list(coders) if coders else None @@ -592,37 +604,43 @@ def eval_command(config: str, output: str, workdir: str, coders: tuple, verbose: # Print summary summary = runner.generate_summary(results) - click.echo("\nπŸ“ˆ Summary:") - click.echo(f" Total: {summary['total_evaluations']}") - click.echo( - f" Passed: {summary['passed']} ({summary['passed'] / summary['total_evaluations'] * 100:.1f}%)" + frac_passed = ( + summary["passed"] / summary["total_evaluations"] + if summary["total_evaluations"] + else 0 ) - click.echo( - f" Failed: {summary['failed']} ({summary['failed'] / summary['total_evaluations'] * 100:.1f}%)" + frac_failed = ( + summary["failed"] / summary["total_evaluations"] + if summary["total_evaluations"] + else 0 ) - if summary["errors"] > 0: - click.echo(f" Errors: {summary['errors']} ⚠️") + + click.echo("\nπŸ“ˆ Summary:") + click.echo(f" Total: {summary['total_evaluations']}") + click.echo(f" Passed: {summary['passed']} ({frac_passed:.1%})") + click.echo(f" Failed: {summary['failed']} ({frac_failed:.1%})") + click.echo(f" Errors: {summary['errors']} ⚠️") if summary["errors"] else None # Print by-coder summary if len(summary["by_coder"]) > 1: click.echo("\n By Coder:") for coder, stats in summary["by_coder"].items(): - pass_rate = ( - stats["passed"] / stats["total"] * 100 if stats["total"] > 0 else 0 + coder_frac_passed = ( + stats["passed"] / stats["total"] if stats["total"] else 0 ) click.echo( - f" {coder}: {stats['passed']}/{stats['total']} ({pass_rate:.1f}%)" + f" {coder}: {stats['passed']} / {stats['total']} ({coder_frac_passed:.1%})" ) # Print by-model summary if len(summary["by_model"]) > 1: click.echo("\n By Model:") for model, stats in summary["by_model"].items(): - pass_rate = ( - stats["passed"] / stats["total"] * 100 if stats["total"] > 0 else 0 + model_frac_passed = ( + stats["passed"] / stats["total"] if stats["total"] else 0 ) click.echo( - f" {model}: {stats['passed']}/{stats['total']} ({pass_rate:.1f}%)" + f" {model}: {stats['passed']} / {stats['total']} ({model_frac_passed:.1%})" ) click.echo("\nβœ… Evaluation complete!") diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..95f4c37 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,10 @@ +import logging +import sys + + +def pytest_configure(config): + logging.basicConfig( + level=logging.WARNING, + format="\n%(asctime)s [%(levelname)s] %(name)s: %(message)s", + stream=sys.stdout, + ) diff --git a/tests/input/goose_eval_claude_downgrade_test.yaml b/tests/input/goose_eval_claude_downgrade_test.yaml new file mode 100644 index 0000000..6f0eb31 --- /dev/null +++ b/tests/input/goose_eval_claude_downgrade_test.yaml @@ -0,0 +1,30 @@ +name: pubmed tools evals +description: | + Evaluations for multiple pubmed MCPs + +coders: + goose: {} + +models: + claude-sonnet: + provider: anthropic + name: claude-sonnet-4-20250514 + +servers: + mcp-simple-pubmed: + name: pubmed + command: uvx + args: [mcp-simple-pubmed] + env: + PUBMED_EMAIL: ctparker@lbl.gov + +server_combinations: + - [mcp-simple-pubmed] + +cases: +- name: PMID_28027860_Full_Text + metrics: [CorrectnessMetric] + input: "What is the first sentence of section 2 in PMID: 28027860?" + expected_output: | + Even though many of NFLE's core features have been clarified in the last two decades, some critical issues remain controversial." 
+ threshold: 0.9 diff --git a/tests/input/goose_eval_test.yaml b/tests/input/goose_eval_test.yaml index 1037215..f41e249 100644 --- a/tests/input/goose_eval_test.yaml +++ b/tests/input/goose_eval_test.yaml @@ -7,7 +7,7 @@ coders: goose: {} models: - gpt-4o: + claude-sonnet: provider: anthropic name: claude-sonnet-4-20250514 @@ -34,6 +34,9 @@ cases: MONDO:0011694 (spinocerebellar ataxia type 15/16, aka SCA15) MONDO:0007298 (spinocerebellar ataxia type 29, aka SCA29) MONDO:0008795 (aniridia-cerebellar ataxia-intellectual disability syndrome; aka Gillespie syndrome) - threshold: 0.7 - + - name: character_encoding_test + metrics: [CorrectnessMetric] + input: Based on PMID 33926573 do microbes from alkaline sulphidic tailings show oxidative stresses? + expected_output: 'The paper says No but it is retracted so the results should not be trusted.' + threshold: 0.9 diff --git a/tests/input/goose_no_server_test.yaml b/tests/input/goose_no_server_test.yaml new file mode 100644 index 0000000..2dc5551 --- /dev/null +++ b/tests/input/goose_no_server_test.yaml @@ -0,0 +1,30 @@ +name: pubmed tools evals +description: | + Evaluations for multiple pubmed MCPs + +coders: + goose: {} + +models: + claude-sonnet: + provider: anthropic + name: claude-sonnet-4-20250514 + +servers: + mcp-simple-pubmed: + name: pubmed + command: uvx + args: [mcp-simple-pubmed] + env: + PUBMED_EMAIL: ctparker@lbl.gov + +#server_combinations: +# - [mcp-simple-pubmed] + +cases: +- name: PMID_28027860_Full_Text + metrics: [CorrectnessMetric] + input: "What is the first sentence of section 2 in PMID: 28027860?" + expected_output: | + Even though many of NFLE's core features have been clarified in the last two decades, some critical issues remain controversial." + threshold: 0.9 diff --git a/tests/input/literature_mcp_encoding_test.yaml b/tests/input/literature_mcp_encoding_test.yaml new file mode 100644 index 0000000..d0fea1b --- /dev/null +++ b/tests/input/literature_mcp_encoding_test.yaml @@ -0,0 +1,29 @@ +name: pubmed tools evals +description: | + Evaluations for multiple pubmed MCPs + + +coders: + goose: {} + +models: + claude-sonnet: + provider: anthropic + name: claude-sonnet-4-20250514 + +servers: + ols: + name: ols + command: uvx + args: [mcp-ols] + +server_combinations: + - [simple-pubmed] + +cases: +- name: character_encoding_test + metrics: + - CorrectnessMetric + input: Based on PMID 33926573 do microbes from alkaline sulphidic tailings show oxidative stresses? + expected_output: 'The paper says No but it is retracted so the results should not be trusted.' + threshold: 0.9 diff --git a/tests/test_coders/test_coders_basic.py b/tests/test_coders/test_coders_basic.py index a9498b6..5d9daf1 100644 --- a/tests/test_coders/test_coders_basic.py +++ b/tests/test_coders/test_coders_basic.py @@ -3,6 +3,7 @@ These tests check that each coder can handle a simple arithmetic question. """ +import json import tempfile import pytest @@ -164,3 +165,16 @@ def test_dummy_coder_always_works(): assert result is not None assert result.result_text == "you said: Hello, world!" assert result.stdout == "you said: Hello, world!" 
+ + +@pytest.mark.integration +def test_goose_utf8_session_file(tmp_path): + """Test session files with UTF-8 content are read correctly.""" + session_content = '{"role": "assistant", "content": "ζ΅‹θ―• rΓ©sumΓ© πŸš€"}\n' + session_file = tmp_path / "test_session.jsonl" + session_file.write_text(session_content, encoding="utf-8") + + with open(session_file, "r", encoding="utf-8") as f: + messages = [json.loads(line) for line in f if line.strip()] + assert len(messages) == 1 + assert "ζ΅‹θ―•" in messages[0]["content"] diff --git a/tests/test_evals/test_claude_judge.py b/tests/test_evals/test_claude_judge.py new file mode 100644 index 0000000..9ed23a6 --- /dev/null +++ b/tests/test_evals/test_claude_judge.py @@ -0,0 +1,139 @@ +import logging +import traceback +from pathlib import Path + +from metacoder.evals.runner import EvalRunner + +logger = logging.getLogger(__name__) + + +def test_claude_judge_downgrade_success(tmp_path, caplog, monkeypatch): + """Test that ClaudeJudge is used when OpenAI is disabled.""" + # TODO: This test should avoid running the coder and only perform the eval step. + # Otherwise, it is impossible to get to the eval step if no valid API key is present or no quota is available (testing the wrong part of the process). + + runner = EvalRunner() + + try: + dataset = runner.load_dataset( + Path("tests/input/goose_eval_claude_downgrade_test.yaml") + ) + + # Unfortunately, there is nothing available in the eval results that indicate which model DeepEval used. + # One enhancement might be to introduce metric_model=claude-3-5-sonnet-20240620 to each result at eval time. + # Instead, resort to capturing the WARNING logs for assertions related to the downgrade. + with caplog.at_level(logging.WARNING): + # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail. + # (no need to reset, `monkeypatch` automatically reverts after the test) + monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key-for-testing") + + results = runner.run_all_evals( + dataset, workdir=tmp_path, coders=["goose", "dummy"] + ) + + # Test that the quota exhaustion fallback logic worked as expected. + assert ( + "OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval." + in caplog.text + ) + + # Test that the new evaluation judge was correctly selected for the metric model downgrade. + assert ( + "Downgrading CorrectnessMetric model from gpt-4.1 to claude-3-5-sonnet-20240620." + in caplog.text + ) + + # Test that the eval completed by checking for a non-zero score. + assert results[0].score > 0, ( + f"Expected a {results[0].metric_name} score for {results[0].case_name}." + ) + + except Exception as e: + # Test that fallback logic does not result in an Exception. + logger.error(f"An error occurred: {e}") + logging.error(traceback.format_exc()) + assert False # This assertion will fail if an Exception is caught here. + finally: + pass + + +def test_correctnessmetric_downgrade_success(tmp_path, caplog, monkeypatch): + """Test that the CorrectnessMatric is successfully downgraded to DummyMetric if no model is available.""" + + runner = EvalRunner() + + try: + dataset = runner.load_dataset( + Path("tests/input/goose_eval_claude_downgrade_test.yaml") + ) + + # Unfortunately, there is nothing available in the eval results that indicate which model DeepEval used. + # One enhancement might be to introduce metric_model=claude-3-5-sonnet-20240620 to each result at eval time. + # Instead, resort to capturing the WARNING logs for assertions related to the downgrade. 
+        with caplog.at_level(logging.WARNING):
+            # Temporarily set an invalid OPENAI_API_KEY in order to force OpenAI calls to fail.
+            # (no need to reset, `monkeypatch` automatically reverts after the test)
+            monkeypatch.setenv("OPENAI_API_KEY", "fake-api-key-for-testing")
+
+            # Delete the Anthropic API key from the environment to force ClaudeJudge instantiation to fail.
+            # (no need to reset, `monkeypatch` automatically reverts after the test)
+            monkeypatch.delenv("ANTHROPIC_API_KEY", raising=False)
+
+            # One more OpenAI API test case still needs to be handled (401 errors):
+            # temporarily remove OPENAI_API_KEY so the OpenAI preflight fails.
+            # monkeypatch.delenv("OPENAI_API_KEY", raising=False)
+
+            # One more Anthropic API test case still needs to be handled (401 errors):
+            # temporarily set an invalid ANTHROPIC_API_KEY in order to force ClaudeJudge to fail.
+            # monkeypatch.setenv("ANTHROPIC_API_KEY", "fake-api-key-for-testing")
+
+            # TODO: Also cover the Anthropic "insufficient credits" case, which surfaces as:
+            #   Request failed with status: 400 Bad Request. Message: Your credit balance is too low
+            #   to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits.
+            #   (type: invalid_request_error, request_id: req_011CSeQZTjJvmcxzrhXuPES4)
+
+            results = runner.run_all_evals(dataset, workdir=tmp_path, coders=["dummy"])
+
+            # Test that the quota exhaustion fallback logic worked as expected.
+            assert (
+                "OpenAI API quota exhausted or server unavailable; disabling OpenAI for DeepEval."
+                in caplog.text
+            )
+
+            # Test that the new evaluation judge was correctly selected for the metric model downgrade.
+            assert (
+                "Downgrading CorrectnessMetric model from gpt-4.1 to claude-3-5-sonnet-20240620."
+                in caplog.text
+            )
+
+            # Test that the ClaudeJudge was unable to be used as the model for the CorrectnessMetric.
+            assert (
+                "Claude unavailable (ANTHROPIC_API_KEY is not set in environment); downgrading CorrectnessMetric to DummyMetric."
+                in caplog.text
+            )
+
+            # Test that the CorrectnessMetric was successfully downgraded to DummyMetric.
+            assert "Downgraded CorrectnessMetric to DummyMetric." in caplog.text
+
+            # Test that the eval completed by checking for a non-zero score.
+            assert results[0].score > 0, (
+                f"Expected a {results[0].metric_name} score for {results[0].case_name}."
+            )
+
+    except Exception as e:
+        # Test that fallback logic does not result in an Exception.
+        logger.error(f"An error occurred: {e}")
+        logging.error(traceback.format_exc())
+        assert False  # This assertion will fail if an Exception is caught here.
+    finally:
+        pass
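
Reviewer note: below is a minimal, self-contained sketch (not part of the patch) of how the new ClaudeJudge from src/metacoder/evals/judges.py plugs into make_geval() from src/metacoder/evals/runner.py, mirroring the downgrade path in run_single_eval. It assumes ANTHROPIC_API_KEY is set and that the listed model id is still served; the test-case values are placeholders, not real eval data.

    from deepeval import evaluate
    from deepeval.test_case import LLMTestCase

    from metacoder.evals.judges import ClaudeJudge
    from metacoder.evals.runner import make_geval

    # Claude-backed judge for GEval; make_geval(model=None) would default to OpenAI.
    judge = ClaudeJudge(model_name="claude-3-5-sonnet-20240620")

    # Same preflight run_single_eval performs before swapping the metric model.
    if not judge.has_available_quota():
        raise SystemExit("No Anthropic credits available for ClaudeJudge.")

    metric = make_geval(model=judge)

    # Placeholder test case; EvalRunner.create_test_case() builds these from EvalCase entries.
    test_case = LLMTestCase(
        input="What is 2 + 2?",
        actual_output="2 + 2 = 4",
        expected_output="4",
    )
    evaluate([test_case], [metric])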