Commit a4311d8

Allow env/reward/tool functions to be maybe async; fix for image_url sanitization (#245)
* allow env methods, reward funcs, tools to be optionally async
* fix for image_url sanitization
1 parent 8e38e7f commit a4311d8

17 files changed: +158 / -121 lines
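
Most of this diff threads calls through a maybe_await helper imported from verifiers.utils.async_utils. The helper's body is not part of this commit's diff; as a minimal sketch of the usual pattern (an assumption, not the repository's actual code), such a helper calls the function and awaits the result only if it is awaitable:

import inspect
from typing import Any, Callable


async def maybe_await(func: Callable[..., Any], *args: Any, **kwargs: Any) -> Any:
    """Call func, awaiting the result if it is awaitable.

    Plain functions return their value directly; coroutine functions
    (or anything else returning an awaitable) are awaited first, so
    callers can treat sync and async callables uniformly.
    """
    result = func(*args, **kwargs)
    if inspect.isawaitable(result):
        result = await result
    return result

This single indirection is what lets the environment methods, reward functions, and tools below be declared either way.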

environments/math_python/math_python.py

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ def num_errors(parser, completion) -> float:
             if "error" in msg["content"].lower()
         ]
     )
-    return num_errors
+    return float(num_errors)
 
 rubric = vf.Rubric(
     funcs=[correct_answer_reward_func, num_turns, num_tool_calls, num_errors],
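
The local count is an int (it comes from len(...)), so returning float(num_errors) matches the function's -> float annotation and the float(...) coercion the rubric scoring path now applies (see verifiers/rubrics/rubric.py below).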

tests/test_singleturn_env.py

Lines changed: 10 additions & 8 deletions
@@ -48,23 +48,25 @@ def test_singleturn_env_initialization_completion(self, mock_openai_client):
         )
         assert env.message_type == "completion"
 
-    def test_is_completed_method(self, mock_singleturn_env):
+    @pytest.mark.asyncio
+    async def test_is_completed_method(self, mock_singleturn_env):
         """Test the is_completed method logic."""
         # No responses yet
         messages = [{"role": "user", "content": "Hello"}]
         state = {"responses": []}
-        assert not mock_singleturn_env.is_completed(messages, state)
+        assert not await mock_singleturn_env.is_completed(messages, state)
 
         # With responses
         state = {"responses": [MagicMock()]}
-        assert mock_singleturn_env.is_completed(messages, state)
+        assert await mock_singleturn_env.is_completed(messages, state)
 
-    def test_env_response_method(self, mock_singleturn_env):
+    @pytest.mark.asyncio
+    async def test_env_response_method(self, mock_singleturn_env):
         """Test the env_response method (which should never be called in practice)."""
         messages = [{"role": "user", "content": "Hello"}]
         state = {}
 
-        response, new_state = mock_singleturn_env.env_response(messages, state)
+        response, new_state = await mock_singleturn_env.env_response(messages, state)
 
         # Should return minimal response (env_response returns a list of messages)
         assert len(response) == 1
@@ -345,12 +347,12 @@ async def test_singleturn_stops_after_one_response(
 
         # Before any responses
         state = {"responses": []}
-        assert not env.is_completed([], state)
+        assert not await env.is_completed([], state)
 
         # After one response
         state = {"responses": [MagicMock()]}
-        assert env.is_completed([], state)
+        assert await env.is_completed([], state)
 
         # Even with multiple responses (shouldn't happen), it's still completed
         state = {"responses": [MagicMock(), MagicMock()]}
-        assert env.is_completed([], state)
+        assert await env.is_completed([], state)
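
These tests rely on the pytest-asyncio plugin: without the @pytest.mark.asyncio marker (or the plugin's auto mode), an async def test would be collected but its body would never actually run.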

verifiers/envs/environment.py

Lines changed: 3 additions & 1 deletion
@@ -27,6 +27,7 @@
     SamplingArgs,
     State,
 )
+from verifiers.utils.message_utils import cleanup_messages
 from verifiers.utils.tool_utils import sanitize_tool_calls
 
 if TYPE_CHECKING:
@@ -216,7 +217,6 @@ async def get_model_response(
         ):
             sampling_args.pop("max_completion_tokens")
         clean_sampling_args = {k: v for k, v in sampling_args.items() if v is not None}
-
         try:
             if message_type == "chat":
                 assert isinstance(prompt, list)
@@ -385,6 +385,8 @@ async def a_generate(
         if self.oai_tools and "oai_tools" not in info:
             info["oai_tools"] = self.oai_tools
 
+        results_dict["prompt"] = [cleanup_messages(p) for p in results_dict["prompt"]]
+
         # prepare GenerateOutputs and run rollouts
         results = GenerateOutputs(
             prompt=results_dict["prompt"],
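
The a_generate hunk is the image_url sanitization fix from the commit title: prompts are passed through cleanup_messages before GenerateOutputs is built. The helper itself lives in verifiers/utils/message_utils.py and is not shown in this diff; purely as a hypothetical illustration of the kind of normalization such a helper might perform (not the library's actual implementation), it could strip multimodal content parts down to the fields the chat-completions schema accepts:

# Hypothetical sketch only: the real cleanup_messages in
# verifiers/utils/message_utils.py is not shown in this diff.
def cleanup_messages(messages: list[dict]) -> list[dict]:
    """Normalize messages, keeping only schema-valid fields in
    multimodal content parts (e.g. image_url entries)."""
    cleaned = []
    for msg in messages:
        msg = dict(msg)  # shallow copy; don't mutate the caller's dicts
        content = msg.get("content")
        if isinstance(content, list):
            parts = []
            for part in content:
                if isinstance(part, dict) and part.get("type") == "image_url":
                    image_url = part.get("image_url") or {}
                    # keep only the url field of an image_url part
                    parts.append(
                        {"type": "image_url", "image_url": {"url": image_url.get("url")}}
                    )
                else:
                    parts.append(part)
            msg["content"] = parts
        cleaned.append(msg)
    return cleaned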

verifiers/envs/multiturn_env.py

Lines changed: 10 additions & 7 deletions
@@ -12,22 +12,23 @@
     SamplingArgs,
     State,
 )
+from verifiers.utils.async_utils import maybe_await
 
 
 class MultiTurnEnv(Environment):
     def __init__(self, max_turns: int = 10, **kwargs):
         super().__init__(**kwargs)
         self.max_turns = max_turns
 
-    def setup_state(self, state: State, **kwargs) -> State:
+    async def setup_state(self, state: State, **kwargs) -> State:
         return state
 
     @abstractmethod
-    def is_completed(self, messages: Messages, state: State, **kwargs) -> bool:
+    async def is_completed(self, messages: Messages, state: State, **kwargs) -> bool:
         pass
 
     @abstractmethod
-    def env_response(
+    async def env_response(
         self, messages: Messages, state: State, **kwargs
     ) -> tuple[Messages, State]:
         """
@@ -60,7 +61,7 @@ async def rollout(
             "responses": [],
             "turn": 0,
         }
-        state = self.setup_state(state)
+        state = await maybe_await(self.setup_state, state, **kwargs)
         if self.message_type == "chat":
             assert isinstance(prompt, list)
             completion = []
@@ -70,7 +71,7 @@ async def rollout(
             state["responses_start_idx"] = []
         rollout = list(prompt) if not isinstance(prompt, str) else prompt
         while not is_completed:
-            if self.is_completed(rollout, state, **kwargs):
+            if await maybe_await(self.is_completed, rollout, state, **kwargs):
                 is_completed = True
                 break
             response = await self.get_model_response(
@@ -107,12 +108,14 @@ async def rollout(
             completion += response_text
             state["turn"] += 1
             if (
-                self.is_completed(rollout, state, **kwargs)
+                await maybe_await(self.is_completed, rollout, state, **kwargs)
                 or state["turn"] >= self.max_turns
            ):
                 is_completed = True
             else:
-                env_msgs, state = self.env_response(rollout, state, **kwargs)
+                env_msgs, state = await maybe_await(
+                    self.env_response, rollout, state, **kwargs
+                )
                 if self.message_type == "chat":
                     assert isinstance(env_msgs, list)
                     assert isinstance(rollout, list)
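
Because rollout now resolves setup_state, is_completed, and env_response through maybe_await, subclasses may override these hooks as either plain methods or coroutines. A minimal sketch (EchoEnv and its method bodies are invented for illustration; only the base class and types come from this repository):

from verifiers.envs.multiturn_env import MultiTurnEnv
from verifiers.types import Messages, State


class EchoEnv(MultiTurnEnv):
    # Sync override: maybe_await returns the value directly.
    def is_completed(self, messages: Messages, state: State, **kwargs) -> bool:
        return state["turn"] >= 2

    # Async override: maybe_await awaits the coroutine.
    async def env_response(
        self, messages: Messages, state: State, **kwargs
    ) -> tuple[Messages, State]:
        last = messages[-1]["content"]
        return [{"role": "user", "content": f"you said: {last}"}], state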

verifiers/envs/singleturn_env.py

Lines changed: 2 additions & 2 deletions
@@ -7,10 +7,10 @@ class SingleTurnEnv(MultiTurnEnv):
     Environment for single-turn tasks (chat or completion).
     """
 
-    def is_completed(self, messages: Messages, state: State, **kwargs) -> bool:
+    async def is_completed(self, messages: Messages, state: State, **kwargs) -> bool:
         return len(state["responses"]) > 0
 
-    def env_response(
+    async def env_response(
         self, messages: Messages, state: State, **kwargs
     ) -> tuple[Messages, State]:
         # never called in MultiTurnEnv.rollout

verifiers/envs/stateful_tool_env.py

Lines changed: 5 additions & 3 deletions
@@ -34,7 +34,7 @@ def update_tool_args(
         """Update tool arguments and/or state (in-place) based on messages and state."""
         pass
 
-    def call_tool(
+    async def call_tool(
         self, tool_name: str, tool_args: dict, tool_call_id: str, **kwargs
     ) -> Message:
         """Call a tool based on JSON command."""
@@ -53,7 +53,7 @@ def call_tool(
             "tool_call_id": tool_call_id,
         }
 
-    def env_response(
+    async def env_response(
         self, messages: Messages, state: State, **kwargs
     ) -> tuple[Messages, State]:
         assert isinstance(messages, list)
@@ -65,6 +65,8 @@ def env_response(
             tool_args: dict = json.loads(tool_call.function.arguments)
             tool_call_id: str = tool_call.id or ""
             tool_args = self.update_tool_args(tool_args, messages, state, **kwargs)
-            tool_message: Message = self.call_tool(tool_name, tool_args, tool_call_id)
+            tool_message: Message = await self.call_tool(
+                tool_name, tool_args, tool_call_id
+            )
             tool_messages.append(tool_message)
         return tool_messages, state

verifiers/envs/textarena_env.py

Lines changed: 4 additions & 2 deletions
@@ -74,14 +74,16 @@ def __init__(
             **kwargs,
         )
 
-    def is_completed(self, messages: Messages, state: State, **kwargs: Any) -> bool:
+    async def is_completed(
+        self, messages: Messages, state: State, **kwargs: Any
+    ) -> bool:
         if "is_finished" in state and state["is_finished"]:
             state.pop("ta_env")
             return state["is_finished"]
         self.parser
         return False
 
-    def env_response(
+    async def env_response(
         self, messages: Messages, state: State, **kwargs: Any
     ) -> tuple[Messages, State]:
         # load env

verifiers/envs/tool_env.py

Lines changed: 10 additions & 5 deletions
@@ -3,6 +3,7 @@
 
 from verifiers.envs.multiturn_env import MultiTurnEnv
 from verifiers.types import ChatCompletionMessageToolCall, Message, Messages, State
+from verifiers.utils.async_utils import maybe_await
 from verifiers.utils.tool_utils import convert_func_to_oai_tool
 
 
@@ -21,21 +22,23 @@ def __init__(
         self.tool_map = {tool.__name__: tool for tool in self.tools}
         super().__init__(oai_tools=self.oai_tools, max_turns=max_turns, **kwargs)
 
-    def is_completed(self, messages: Messages, state: State, **kwargs: Any) -> bool:
+    async def is_completed(
+        self, messages: Messages, state: State, **kwargs: Any
+    ) -> bool:
         assert isinstance(messages, list)
         is_assistant_message = messages[-1]["role"] == "assistant"
         no_tool_calls = (
             "tool_calls" not in messages[-1] or messages[-1]["tool_calls"] is None
         )
         return is_assistant_message and no_tool_calls
 
-    def call_tool(
+    async def call_tool(
         self, tool_name: str, tool_args: dict, tool_call_id: str, **kwargs
     ) -> Message:
         """Call a tool based on JSON command."""
         try:
             tool_func = self.tool_map[tool_name]
-            result = str(tool_func(**tool_args))
+            result = str(await maybe_await(tool_func, **tool_args))
             return {
                 "role": "tool",
                 "content": str(result),
@@ -48,7 +51,7 @@ def call_tool(
                 "tool_call_id": tool_call_id,
             }
 
-    def env_response(
+    async def env_response(
         self, messages: Messages, state: State, **kwargs
     ) -> tuple[Messages, State]:
         assert isinstance(messages, list)
@@ -59,6 +62,8 @@ def env_response(
             tool_name: str = tool_call.function.name
             tool_args: dict = json.loads(tool_call.function.arguments)
             tool_call_id: str = tool_call.id or ""
-            tool_message: Message = self.call_tool(tool_name, tool_args, tool_call_id)
+            tool_message: Message = await self.call_tool(
+                tool_name, tool_args, tool_call_id
+            )
             tool_messages.append(tool_message)
         return tool_messages, state
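
Since call_tool now dispatches through maybe_await, the plain functions registered as tools can themselves be sync or async. A sketch under stated assumptions: add and slow_lookup are invented example tools, and the tools=... keyword follows the constructor visible in the hunk above (other Environment constructor arguments are omitted):

import asyncio

from verifiers.envs.tool_env import ToolEnv


def add(a: int, b: int) -> int:
    """Add two integers."""
    return a + b


async def slow_lookup(query: str) -> str:
    """Pretend to call an external service asynchronously."""
    await asyncio.sleep(0.1)  # stand-in for real async I/O
    return f"results for {query!r}"


# Sync and async tools can be mixed; each invocation is wrapped
# in maybe_await inside ToolEnv.call_tool.
env = ToolEnv(tools=[add, slow_lookup])  # plus whatever Environment kwargs your setup needs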

verifiers/rubrics/judge_rubric.py

Lines changed: 4 additions & 2 deletions
@@ -1,10 +1,11 @@
 from typing import Any
 
-from openai import OpenAI, AsyncOpenAI
+from openai import AsyncOpenAI, OpenAI
 
 from verifiers.parsers.parser import Parser
 from verifiers.rubrics.rubric import Rubric
 from verifiers.types import Messages, State
+from verifiers.utils.async_utils import maybe_await
 
 DEFAULT_JUDGE_PROMPT = """Given a ground truth answer \
 and a response, determine if the response is correct.
@@ -82,7 +83,8 @@ async def judge(
         ):
             judge_args.pop("max_completion_tokens")
         judge_args = {k: v for k, v in judge_args.items() if v is not None}
-        judge_response = await self.judge_client.chat.completions.create(
+        judge_response = await maybe_await(
+            self.judge_client.chat.completions.create,
             model=self.judge_model,
             messages=[{"role": "user", "content": judge_prompt}],
             **judge_args,
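
Routing the judge call through maybe_await lets judge_client be either an OpenAI or AsyncOpenAI instance (both are imported above): the sync client's create returns a response object that passes straight through, while the async client's coroutine gets awaited.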

verifiers/rubrics/rubric.py

Lines changed: 3 additions & 7 deletions
@@ -11,6 +11,7 @@
     RolloutScores,
     State,
 )
+from verifiers.utils.async_utils import maybe_await
 
 
 class Rubric:
@@ -94,22 +95,17 @@ def func(completion, answer, **kwargs):
             task=task,
             info=info,
         )
-        ans = 0.0
         merged = {**common, **kwargs}
         if any(p.kind == p.VAR_KEYWORD for p in sig.parameters.values()):
             try:
-                ans = func(**merged)
-                if inspect.iscoroutinefunction(func):
-                    ans = await ans
+                ans = float(await maybe_await(func, **merged))
             except Exception as e:
                 self.logger.error(f"Error calling reward function {func.__name__}: {e}")
                 ans = 0.0
         else:
             allowed = {k: v for k, v in merged.items() if k in sig.parameters}
             try:
-                ans = func(**allowed)
-                if inspect.iscoroutinefunction(func):
-                    ans = await ans
+                ans = float(await maybe_await(func, **allowed))
             except Exception as e:
                 self.logger.error(f"Error calling reward function {func.__name__}: {e}")
                 ans = 0.0
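
With this change, a reward function registered on a Rubric may itself be a coroutine function; scoring resolves it via maybe_await and coerces the result with float(...). A sketch (the import alias vf and the funcs=... keyword follow the usage in math_python.py above; the reward functions themselves are invented):

import asyncio

import verifiers as vf  # alias as used in math_python.py


def length_penalty(completion, **kwargs) -> float:
    """Sync reward: penalize long completions."""
    return max(0.0, 1.0 - len(str(completion)) / 1000)


async def judged_correct(completion, answer, **kwargs) -> float:
    """Async reward: stand-in for an external judge call."""
    await asyncio.sleep(0.1)  # placeholder for real async I/O
    return 1.0 if answer in str(completion) else 0.0


# Sync and async reward functions can be mixed in one rubric.
rubric = vf.Rubric(funcs=[length_penalty, judged_correct])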
