Skip to content

Commit c054ff9

Browse files
Allow unsetting max_tokens in eval script (#241)
* Improve max_tokens handling and parsing across environments and scripts

Co-authored-by: lakshyajannu <[email protected]>

* Refactor max_tokens argument handling in eval.py to accept integer input directly and simplify parsing logic

---------

Co-authored-by: Cursor Agent <[email protected]>
1 parent fcc0267 commit c054ff9

File tree

3 files changed

+36
-11
lines changed

3 files changed

+36
-11
lines changed

verifiers/envs/environment.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -199,27 +199,39 @@ async def get_model_response(
199199
Returns special error messages for context length issues.
200200
"""
201201
sampling_args = sampling_args or {}
202+
# Resolve message type first
203+
if message_type is None:
204+
message_type = self.message_type
205+
# Normalize sampling args:
206+
# - If max_tokens is provided for chat, rename to max_completion_tokens
207+
# - Drop any None-valued entries to avoid sending them to the client
202208
if "max_tokens" in sampling_args:
203-
sampling_args["max_completion_tokens"] = sampling_args.pop("max_tokens")
209+
if sampling_args["max_tokens"] is None:
210+
sampling_args.pop("max_tokens")
211+
elif message_type == "chat":
212+
sampling_args["max_completion_tokens"] = sampling_args.pop("max_tokens")
213+
if (
214+
"max_completion_tokens" in sampling_args
215+
and sampling_args["max_completion_tokens"] is None
216+
):
217+
sampling_args.pop("max_completion_tokens")
218+
clean_sampling_args = {k: v for k, v in sampling_args.items() if v is not None}
204219

205220
try:
206-
if message_type is None:
207-
message_type = self.message_type
208-
209221
if message_type == "chat":
210222
assert isinstance(prompt, list)
211223
if oai_tools:
212224
response = await client.chat.completions.create(
213225
model=model,
214226
messages=prompt, # type: ignore
215227
tools=oai_tools,
216-
**sampling_args,
228+
**clean_sampling_args,
217229
)
218230
else:
219231
response = await client.chat.completions.create(
220232
model=model,
221233
messages=prompt, # type: ignore
222-
**sampling_args,
234+
**clean_sampling_args,
223235
)
224236
return response
225237
elif message_type == "completion":
@@ -229,7 +241,7 @@ async def get_model_response(
229241
)
230242
assert isinstance(prompt, str)
231243
response = await client.completions.create(
232-
model=model, prompt=prompt, **sampling_args
244+
model=model, prompt=prompt, **clean_sampling_args
233245
)
234246
return response
235247
except Exception as e:

verifiers/rubrics/judge_rubric.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,10 +69,23 @@ async def judge(
6969
cached = state.get("judge_response")
7070
if isinstance(cached, dict) and judge_prompt in cached:
7171
return cached[judge_prompt]
72+
# Normalize judge sampling args for chat API
73+
judge_args = dict(self.judge_sampling_args or {})
74+
if "max_tokens" in judge_args:
75+
if judge_args["max_tokens"] is None:
76+
judge_args.pop("max_tokens")
77+
else:
78+
judge_args["max_completion_tokens"] = judge_args.pop("max_tokens")
79+
if (
80+
"max_completion_tokens" in judge_args
81+
and judge_args["max_completion_tokens"] is None
82+
):
83+
judge_args.pop("max_completion_tokens")
84+
judge_args = {k: v for k, v in judge_args.items() if v is not None}
7285
judge_response = await self.judge_client.chat.completions.create(
7386
model=self.judge_model,
7487
messages=[{"role": "user", "content": judge_prompt}],
75-
**self.judge_sampling_args,
88+
**judge_args,
7689
)
7790
judge_response = str(judge_response.choices[0].message.content)
7891
if not isinstance(cached, dict):

verifiers/scripts/eval.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def eval_environment(
2626
num_examples: int,
2727
rollouts_per_example: int,
2828
max_concurrent_requests: int,
29-
max_tokens: int,
29+
max_tokens: int | None,
3030
temperature: float | None,
3131
verbose: bool,
3232
save_dataset: bool,
@@ -252,8 +252,8 @@ def main():
252252
"--max-tokens",
253253
"-t",
254254
type=int,
255-
default=1024,
256-
help="Maximum number of tokens to generate",
255+
default=None,
256+
help="Maximum number of tokens to generate (unset to use model default)",
257257
)
258258
parser.add_argument(
259259
"--temperature", "-T", type=float, default=None, help="Temperature for sampling"

0 commit comments

Comments (0)