Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
cd95918
Enforce UTF-8 for Goose session files.
Aug 29, 2025
a791ce5
Fixes issue #15. Prevents divide by zero errors and cleans up summari…
Aug 29, 2025
49891a3
Cleaned up output by using consistent printing methods.
Aug 29, 2025
46ad344
Fixes Issue #18 by implementing metric downgrades to Claude if OpenAP…
Aug 30, 2025
fc7ba41
Satisfied ruff's bizarre rules.
Aug 30, 2025
54dd3d3
Added extra logging and test for goose UTF-8 handling.
Aug 30, 2025
72f586c
Added metacoder configuration test cases for claude downgrade and no …
Aug 30, 2025
d7beb19
Added unit test for claude downgrade to support Issue #18. Cleaned up…
Aug 30, 2025
d88ca90
Added unit test for claude downgrade to support Issue #18. Cleaned up…
Aug 30, 2025
e7bba40
Added assertion to confirm that ClaudeJudge completed scoring the met…
Aug 30, 2025
d27277b
Added assertion to force test to fail on Exception. Increased logging…
Aug 30, 2025
3f22fc6
Fixed runtime issues related to metric downgrade from CorrectnessMetr…
Aug 30, 2025
d6e1e44
Added test coverage of new evaluation judge functionality. Added test…
Aug 30, 2025
882a3d9
Reduced logging verbosity. Added Anthropic quota check. Added automat…
Sep 2, 2025
c98c9d7
Fixed issue #23. Forced processes to be launched with UTF-8 encoding …
Sep 2, 2025
4761d19
Addressed ruff formatting issue.
Sep 2, 2025
6b64a79
Added output file check to fail if the output file already exists. Ot…
Sep 2, 2025
c436e7f
Modified save_results to append to existing output file rather than o…
Sep 2, 2025
b0b1c8b
Updated ClaudeJudge model to claude-sonnet-4-20250514.
Sep 3, 2025
a7e71e3
Revert "Modified save_results to append to existing output file rathe…
Sep 3, 2025
7e143da
Added UTF-8 encoding to prevent character mangling during YAML export…
Sep 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/metacoder/coders/base_coder.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,11 +173,15 @@ def run_process(
"""
if env is None:
env = self.expand_env(self.env)

# Decode the child process output as UTF-8 (instead of default encoding)
process = subprocess.Popen(
command,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
encoding="utf-8",
errors="replace", # avoid crashes on the occasional bad byte
env=env,
bufsize=1,
universal_newlines=True,
Expand All @@ -189,6 +193,15 @@ def run_process(
# check verbosity level
quiet_mode = logger.getEffectiveLevel() <= logging.INFO

# Ensure our own stdout/stderr won't choke on non-ASCII (Windows consoles often do).
for s in (sys.stdout, sys.stderr):
try:
s.reconfigure(encoding="utf-8", errors="replace") # Python 3.7+
except Exception as e:
logger.info(f"{e}")
pass # OK if not available (e.g., redirected or older Python)

# lines are already str decoded as UTF-8
def stream_output(pipe, output_lines, stream):
for line in iter(pipe.readline, ""):
if not quiet_mode:
Expand Down
2 changes: 1 addition & 1 deletion src/metacoder/coders/claude.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ def parse_jsonl_line(text: str) -> dict[str, Any]:
ao.tool_uses = tool_uses

end_time = time.time()
logger.info(f"🤖 Command took {end_time - start_time} seconds")
logger.info(f"🤖 Command took {end_time - start_time:.2f} seconds")
ao.total_cost_usd = total_cost_usd
ao.success = not is_error
if not ao.success:
Expand Down
2 changes: 1 addition & 1 deletion src/metacoder/coders/codex.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def run(self, input_text: str) -> CoderOutput:
if "result" in message:
ao.result_text = message["result"]
end_time = time.time()
print(f"🤖 Command took {end_time - start_time} seconds")
print(f"🤖 Command took {end_time - start_time:.2f} seconds")
ao.total_cost_usd = total_cost_usd
ao.success = not is_error
if not ao.success:
Expand Down
2 changes: 1 addition & 1 deletion src/metacoder/coders/gemini.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def run(self, input_text: str) -> CoderOutput:
)

end_time = time.time()
logger.info(f"💎 Command took {end_time - start_time} seconds")
logger.info(f"💎 Command took {end_time - start_time:.2f} seconds")

# Parse the output
ao = CoderOutput(stdout=result.stdout, stderr=result.stderr)
Expand Down
4 changes: 2 additions & 2 deletions src/metacoder/coders/goose.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def run(self, input_text: str) -> CoderOutput:
result = self.run_process(command, env)
end_time = time.time()
ao = CoderOutput(stdout=result.stdout, stderr=result.stderr)
logger.info(f"🦆 Command took {end_time - start_time} seconds")
logger.info(f"🦆 Command took {end_time - start_time:.2f} seconds")
# look in output text for a file like: logging to ./.local/share/goose/sessions/20250613_120403.jsonl
session_file: Optional[Path] = None
for line in result.stdout.split("\n"):
Expand All @@ -165,7 +165,7 @@ def run(self, input_text: str) -> CoderOutput:
session_file = Path(session_file_str)
break
if session_file and session_file.exists():
with open(session_file, "r") as f:
with open(session_file, "r", encoding="utf-8") as f:
ao.structured_messages = [
json.loads(line) for line in f if line.strip()
]
Expand Down
2 changes: 1 addition & 1 deletion src/metacoder/coders/qwen.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def run(self, input_text: str) -> CoderOutput:
)

end_time = time.time()
print(f"🤖 Command took {end_time - start_time} seconds")
print(f"🤖 Command took {end_time - start_time:.2f} seconds")

# Create output - Qwen CLI doesn't provide structured output
ao = CoderOutput(
Expand Down
93 changes: 93 additions & 0 deletions src/metacoder/evals/judges.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# metacoder/evals/judges.py
import asyncio
import logging
import os
import re

from anthropic import Anthropic
from anthropic.types import MessageParam, TextBlockParam, TextBlock

from deepeval.models.base_model import DeepEvalBaseLLM

logger = logging.getLogger(__name__)


class ClaudeJudge(DeepEvalBaseLLM):
    """
    Wraps Anthropic's Claude models so they can be used as
    the `model` parameter to DeepEval metrics like GEval.

    The Anthropic API key is read from the ``ANTHROPIC_API_KEY``
    environment variable when the judge is constructed.
    """

    # Note: Anthropic models can be listed via:
    #    curl https://api.anthropic.com/v1/models --header "x-api-key: %ANTHROPIC_API_KEY%" --header "anthropic-version: 2023-06-01"
    #    {"data": [{"type": "model", "id": "claude-opus-4-1-20250805", "display_name": "Claude Opus 4.1", "created_at": "2025-08-05T00:00:00Z"}, ... ]}
    # Current list (September 3, 2025):
    #    claude-opus-4-1-20250805, claude-opus-4-20250514, claude-sonnet-4-20250514, claude-3-7-sonnet-20250219,
    #    claude-3-5-sonnet-20241022, claude-3-5-haiku-20241022, claude-3-5-sonnet-20240620, claude-3-haiku-20240307,
    #    claude-3-opus-20240229

    # Matches "400" only as a standalone number, so unrelated digits such as
    # "4000" or a request id containing "400" are not mistaken for an HTTP 400.
    _BAD_REQUEST_RE = re.compile(r"\b400\b")

    def __init__(
        self,
        model_name: str = "claude-sonnet-4-20250514",
        max_tokens: int = 1024,
        temperature: float = 0.0,
    ):
        """
        Create a judge bound to one Claude model.

        Args:
            model_name: Anthropic model id used for every request.
            max_tokens: Upper bound on tokens generated per `generate` call.
            temperature: Sampling temperature passed to the API.

        Raises:
            RuntimeError: If ``ANTHROPIC_API_KEY`` is unset or empty.
        """
        super().__init__()
        api_key = os.getenv("ANTHROPIC_API_KEY")
        if not api_key:
            # RuntimeError is still an Exception, so existing
            # `except Exception` callers keep working.
            raise RuntimeError("ANTHROPIC_API_KEY is not set in environment")
        self.client = Anthropic(api_key=api_key)
        self.model_name = model_name
        self.max_tokens = max_tokens
        self.temperature = temperature

    def load_model(self):
        """Return this instance; the judge acts as its own model object."""
        return self

    def _create_message(self, prompt: str, max_tokens: int, temperature: float):
        """Send `prompt` as a single user message and return the raw SDK response."""
        # Build typed content blocks and messages to satisfy the SDK's type hints
        content: list[TextBlockParam] = [{"type": "text", "text": prompt}]
        messages: list[MessageParam] = [{"role": "user", "content": content}]
        return self.client.messages.create(
            model=self.model_name,
            max_tokens=max_tokens,
            temperature=temperature,
            messages=messages,
        )

    def generate(self, prompt: str) -> str:
        """
        Send `prompt` to the configured model and return its text reply.

        Returns:
            The concatenated text of all text blocks in the response
            (an empty string if the response has none).
        """
        resp = self._create_message(prompt, self.max_tokens, self.temperature)
        # anthropic returns a list of content blocks; collect only the text blocks.
        return "".join(
            block.text for block in resp.content if isinstance(block, TextBlock)
        )

    async def a_generate(self, prompt: str) -> str:
        """
        Async variant of `generate`.

        The synchronous request runs in a worker thread so the event loop
        is not blocked while waiting on the network.
        """
        return await asyncio.to_thread(self.generate, prompt)

    def get_model_name(self) -> str:
        """Return the Anthropic model id this judge was configured with."""
        return self.model_name

    def has_available_quota(self) -> bool:
        """
        Try a very lightweight request to check if quota is available.

        Returns:
            True if the request succeeds. False if Anthropic reports a low
            credit balance or any other 400 Bad Request.

        Raises:
            Exception: Any other error from the request is re-raised unchanged.
        """
        try:
            # Use a minimal "ping" request; max_tokens=1 is the cheapest possible.
            self._create_message("ping", max_tokens=1, temperature=0.0)
        except Exception as e:
            msg = str(e).lower()
            # Check for insufficient quota:
            # 400 Bad Request. Message: Your credit balance is too low to access the Anthropic API. Please go to Plans & Billing to upgrade or purchase credits.
            # Prefer the structured HTTP status when the exception carries one;
            # fall back to a whole-number "400" in the message otherwise.
            status = getattr(e, "status_code", None)
            if status is None:
                is_bad_request = self._BAD_REQUEST_RE.search(msg) is not None
            else:
                is_bad_request = status == 400
            # NOTE(review): every 400 is treated as "no quota", including
            # malformed-request errors — confirm this is intended.
            if "credit balance is too low" in msg or is_bad_request:
                logger.warning("ClaudeJudge quota check failed: %s", e)
                return False
            raise
        return True
Loading