Commit 8e38e7f

lakshyaag and willccbb authored
Add sampling_args flag to vf-eval (#240)
* Add `sampling_args` flag to `vf-eval`
* Update README to include usage of `sampling_args` in `vf-eval`
* ruff fix

---------

Co-authored-by: William Brown <[email protected]>
1 parent c054ff9 commit 8e38e7f

File tree

3 files changed: +152, -8 lines changed

* README.md
* tests/test_eval_cli.py
* verifiers/scripts/eval.py

README.md

Lines changed: 6 additions & 0 deletions

@@ -144,6 +144,12 @@ For tasks involving LLM judges, you may wish to use `vf.JudgeRubric()` for manag
 
 Note on concurrency: environment APIs accept `max_concurrent` to control parallel rollouts. The `vf-eval` CLI currently exposes `--max-concurrent-requests`; ensure this maps to your environment’s concurrency as expected.
 
+`vf-eval` also supports specifying `sampling_args` as a JSON object, which is sent to the vLLM inference engine:
+
+```bash
+vf-eval vf-environment-name --sampling-args '{"reasoning_effort": "low"}'
+```
+
 ### ToolEnv
 
 For many applications involving tool use, you can use `ToolEnv` to leverage models' native tool/function-calling capabilities in an agentic loop. Tools can be specified as generic Python functions (with type hints and docstrings), which will then be passed in JSON schema form to each inference request.
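
As a hedged illustration of the new flag's precedence (based on the merge logic and `--sampling-args` help text added in this commit; the environment name and values below are placeholders), keys supplied in the JSON override the individual `--max-tokens`/`--temperature` flags, which only fill in missing keys:

```bash
# max_tokens resolves to 256 (from the JSON); temperature stays 0.2 (from the flag)
vf-eval vf-environment-name \
  --max-tokens 1024 --temperature 0.2 \
  --sampling-args '{"enable_thinking": false, "max_tokens": 256}'
```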

tests/test_eval_cli.py

Lines changed: 124 additions & 0 deletions

@@ -0,0 +1,124 @@
+import verifiers.scripts.eval as vf_eval
+
+
+def _make_fake_env(captured):
+    class FakeEnv:
+        def evaluate(
+            self,
+            client,
+            model,
+            sampling_args=None,
+            num_examples=-1,
+            rollouts_per_example=1,
+            **kwargs,
+        ):
+            captured["sampling_args"] = dict(sampling_args or {})
+
+            class Result:
+                prompt = ["p"]
+                completion = ["c"]
+                reward = [1.0]
+                info = [{}]
+                task = ["default"]
+                answer = [""]
+                metrics = {}
+
+            return Result()
+
+    return FakeEnv()
+
+
+def test_cli_sampling_args_precedence_over_flags(monkeypatch):
+    captured = {}
+
+    # Patch environment loader to return our fake env
+    monkeypatch.setattr(
+        vf_eval.vf,
+        "load_environment",
+        lambda env_id, **env_args: _make_fake_env(captured),
+    )
+
+    # Patch OpenAI client used by the CLI to a simple dummy
+    class DummyOpenAI:
+        def __init__(self, api_key=None, base_url=None):
+            self.api_key = api_key
+            self.base_url = base_url
+
+    monkeypatch.setattr(vf_eval, "OpenAI", DummyOpenAI)
+
+    # Run evaluation with JSON sampling args overriding flags
+    vf_eval.eval_environment(
+        env="dummy-env",
+        env_args={},
+        env_dir_path="./environments",
+        endpoints_path="./configs/endpoints.py",
+        model="gpt-4.1-mini",
+        api_key_var="OPENAI_API_KEY",
+        api_base_url="https://api.openai.com/v1",
+        num_examples=1,
+        rollouts_per_example=1,
+        max_concurrent_requests=1,
+        max_tokens=42,
+        temperature=0.9,
+        sampling_args={
+            "enable_thinking": False,
+            "max_tokens": 77,
+            "temperature": 0.1,
+        },
+        verbose=False,
+        save_dataset=False,
+        save_to_hf_hub=False,
+        hf_hub_dataset_name="",
+    )
+
+    sa = captured["sampling_args"]
+    assert sa["max_tokens"] == 77
+    assert sa["temperature"] == 0.1
+    assert sa["enable_thinking"] is False
+
+
+def test_cli_sampling_args_fill_from_flags_when_missing(monkeypatch):
+    captured = {}
+
+    # Patch environment loader to return our fake env
+    monkeypatch.setattr(
+        vf_eval.vf,
+        "load_environment",
+        lambda env_id, **env_args: _make_fake_env(captured),
+    )
+
+    # Patch OpenAI client used by the CLI to a simple dummy
+    class DummyOpenAI:
+        def __init__(self, api_key=None, base_url=None):
+            self.api_key = api_key
+            self.base_url = base_url
+
+    monkeypatch.setattr(vf_eval, "OpenAI", DummyOpenAI)
+
+    # Run evaluation with JSON lacking max_tokens/temperature
+    vf_eval.eval_environment(
+        env="dummy-env",
+        env_args={},
+        env_dir_path="./environments",
+        endpoints_path="./configs/endpoints.py",
+        model="gpt-4.1-mini",
+        api_key_var="OPENAI_API_KEY",
+        api_base_url="https://api.openai.com/v1",
+        num_examples=1,
+        rollouts_per_example=1,
+        max_concurrent_requests=1,
+        max_tokens=55,
+        temperature=0.8,
+        sampling_args={
+            "enable_thinking": True,
+        },
+        verbose=False,
+        save_dataset=False,
+        save_to_hf_hub=False,
+        hf_hub_dataset_name="",
+    )
+
+    sa = captured["sampling_args"]
+    assert sa["max_tokens"] == 55
+    assert sa["temperature"] == 0.8
+    assert sa["enable_thinking"] is True

verifiers/scripts/eval.py

Lines changed: 22 additions & 8 deletions

@@ -28,6 +28,7 @@ def eval_environment(
     max_concurrent_requests: int,
     max_tokens: int | None,
     temperature: float | None,
+    sampling_args: dict | None,
     verbose: bool,
     save_dataset: bool,
     save_to_hf_hub: bool,
@@ -62,15 +63,18 @@ def eval_environment(
 
     client = OpenAI(api_key=os.getenv(api_key_var, "EMPTY"), base_url=api_base_url)
     vf_env = vf.load_environment(env_id=env, **env_args)
-    sampling_args: dict[str, int | float | None] = {
-        "max_tokens": max_tokens,
-    }
-    if temperature is not None:
-        sampling_args["temperature"] = temperature
+    # Merge sampling args with precedence to JSON payload over explicit flags
+    merged_sampling_args: dict = {}
+    if sampling_args is not None:
+        merged_sampling_args.update(sampling_args)
+    if "max_tokens" not in merged_sampling_args:
+        merged_sampling_args["max_tokens"] = max_tokens
+    if temperature is not None and "temperature" not in merged_sampling_args:
+        merged_sampling_args["temperature"] = temperature
     results = vf_env.evaluate(
         client=client,
         model=model,
-        sampling_args=sampling_args,
+        sampling_args=merged_sampling_args,
         num_examples=num_examples,
         rollouts_per_example=rollouts_per_example,
         max_concurrent_requests=max_concurrent_requests,
@@ -143,8 +147,7 @@ def eval_environment(
         "model": model,
         "num_examples": num_examples,
         "rollouts_per_example": rollouts_per_example,
-        "max_tokens": max_tokens,
-        "temperature": temperature,
+        "sampling_args": merged_sampling_args,
         "date": datetime.now().strftime("%Y-%m-%d"),
         "time": datetime.now().strftime("%H:%M:%S"),
         "avg_reward": sum(results.reward) / len(results.reward),
@@ -258,6 +261,16 @@ def main():
     parser.add_argument(
         "--temperature", "-T", type=float, default=None, help="Temperature for sampling"
     )
+    parser.add_argument(
+        "--sampling-args",
+        "-S",
+        type=json.loads,
+        default=None,
+        help=(
+            "Sampling arguments as JSON object. Keys here override --max-tokens/--temperature. "
+            'Example: \'{"enable_thinking": false, "max_tokens": 256}\''
+        ),
+    )
     parser.add_argument(
         "--verbose", "-v", default=False, action="store_true", help="Verbose output"
     )
@@ -297,6 +310,7 @@ def main():
         max_concurrent_requests=args.max_concurrent_requests,
         max_tokens=args.max_tokens,
         temperature=args.temperature,
+        sampling_args=args.sampling_args,
         verbose=args.verbose,
         save_dataset=args.save_dataset,
         save_to_hf_hub=args.save_to_hf_hub,
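
For quick reference, a minimal standalone sketch of the precedence rule this hunk implements; the `merge_sampling_args` helper below is illustrative only (the commit inlines this logic in `eval_environment`):

```python
# Illustrative helper, not part of the commit: JSON-supplied keys win;
# --max-tokens always fills in when missing, --temperature only when set.
def merge_sampling_args(sampling_args, max_tokens, temperature):
    merged = dict(sampling_args or {})
    if "max_tokens" not in merged:
        merged["max_tokens"] = max_tokens
    if temperature is not None and "temperature" not in merged:
        merged["temperature"] = temperature
    return merged


# Mirrors the two CLI tests above
assert merge_sampling_args({"max_tokens": 77, "temperature": 0.1}, 42, 0.9) == {
    "max_tokens": 77,
    "temperature": 0.1,
}
assert merge_sampling_args({"enable_thinking": True}, 55, 0.8) == {
    "enable_thinking": True,
    "max_tokens": 55,
    "temperature": 0.8,
}
```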
