Commit 0c1ff29

Add support for base model RL / message_type="completions" (#201)

* add continuation quality environment using qwen 2.5 base model and gpt 4.1 mini judge
* delete duplicated message_type parameters
* implement completion methods for vllm
* ruff check

1 parent 79a2191 commit 0c1ff29

7 files changed: +314 -16 lines changed
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
# vf-continuation-quality

### Overview
- **Environment ID**: `vf-continuation-quality`
- **Short description**: Single-turn quality grades on base model continuations using a judge model.
- **Tags**: single-turn, completions, base-model

### Datasets
- **Primary dataset(s)**: `agentlans/wikipedia-paragraphs` mapped to prefix/ground-truth continuation
- **Source links**: Hugging Face Datasets
- **Split sizes**: Train split filtered to adequately-long paragraphs

### Task
- **Type**: single-turn
- **Parser**: custom
- **Rubric overview**: Judge model letter grade (gpt-4.1-mini-based by default)

### Quickstart
Run an evaluation with default settings:

```bash
uv run vf-eval vf-continuation-quality
```

Configure model and sampling:

```bash
uv run vf-eval vf-continuation-quality -m gpt-4.1-mini -n 20 -r 3 -t 1024 -T 0.7 -a '{"key": "value"}' # env-specific args as JSON
```

Notes:
- Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object.
- Reports are written under `./environments/vf_continuation_quality/reports/` and auto-embedded below.

### Environment Arguments
Document any supported environment arguments and their meaning. Example:

| Arg | Type | Default | Description |
| --- | ---- | ------- | ----------- |
| `dataset_name` | str | `"agentlans/wikipedia-paragraphs"` | Training dataset |
| `dataset_split` | str | `"train"` | Training dataset split |
| `dataset_key` | str | `"text"` | Column in dataset with training text |
| `judge_model` | str | `"gpt-4.1-mini"` | Model to judge continuations with |
| `judge_base_url` | str | `"https://api.openai.com/v1"` | API base URL for judge model |
| `judge_api_key_var` | str | `"OPENAI_API_KEY"` | Environment variable containing the judge model API key |

### Metrics
Summarize key metrics your rubric emits and how they're interpreted.

| Metric | Meaning |
| ------ | ------- |
| `reward` | Main scalar reward (weighted sum of criteria) |

## Evaluation Reports

<!-- Do not edit below this line. Content is auto-generated. -->
<!-- vf:begin:reports -->
<p>No reports found. Run <code>uv run vf-eval vf-continuation-quality -a '{"key": "value"}'</code> to generate one.</p>
<!-- vf:end:reports -->
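The arguments in the table above map directly onto the `load_environment` keyword arguments defined in `vf_continuation_quality.py` (shown further below), so the same configuration can also be set programmatically. A minimal sketch, assuming `vf.load_environment` forwards extra keyword arguments to the environment; the values shown are just the documented defaults:

```python
# Sketch only: configuring the environment in Python with the arguments
# documented above (values are the defaults, purely illustrative).
import verifiers as vf

vf_env = vf.load_environment(
    env_id="vf-continuation-quality",
    judge_model="gpt-4.1-mini",
    judge_base_url="https://api.openai.com/v1",
    judge_api_key_var="OPENAI_API_KEY",
)
```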
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
[project]
name = "vf-continuation-quality"
version = "0.1.0"
dependencies = [
    "verifiers>=0.1.2",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
include = ["vf_continuation_quality.py"]
Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
import os
import random

from datasets import load_dataset
from openai import OpenAI

import verifiers as vf

_rand = random.Random(777)
def make_cut(text: str) -> dict[str, str]:
    """Makes a random cut somewhere in the paragraph"""
    n_spaces = text.count(" ")
    # mostly split near the middle
    split_space = int(_rand.normalvariate(0.5, 0.15) * n_spaces)
    # make sure there's at least ~25 words before and after the split point
    split_space = min(n_spaces - 25, max(25, split_space))
    idx = -1
    for _ in range(split_space):
        idx = text.find(" ", idx + 1)
    return { "prompt": text[:idx], "answer": text[idx:] }


def load_environment(
    dataset_name: str = "agentlans/wikipedia-paragraphs",
    dataset_split: str | None = "train",
    dataset_key: str = "text",
    judge_model: str = "gpt-4.1-mini",
    judge_base_url: str = "https://api.openai.com/v1",
    judge_api_key_var: str = "OPENAI_API_KEY",
) -> vf.Environment:
    dataset = load_dataset(dataset_name, split=dataset_split)
    # only accept examples with >~100 words or so
    dataset = dataset.filter(lambda x: x[dataset_key].count(" ") > 100)
    dataset = dataset.map(lambda x: make_cut(x[dataset_key]))
    dataset = dataset.shuffle(seed=777)

    judge_client = OpenAI(api_key=os.getenv(judge_api_key_var), base_url=judge_base_url)
    judge_prompt = """Evaluate this base model continuation from a prefix, compared to the true continuation from Wikipedia.

<prefix>
{question}
</prefix>

<true_continuation>
{answer}
</true_continuation>

<model_continuation>
{response}
</model_continuation>

Provide a letter grade from A-F where:
- A: Smooth prose, facts are mostly accurate w.r.t the true continuation
- B: Smooth prose, regardless of factual accuracy
- C: Some awkward wording, spacing, or punctuation
- D: Inclusions of awkward or glitchy text along with promising prose, some coherent sentences
- F: Incoherent text

Think aloud in a <scratchpad> for a few lines, then respond with the letter grade in <grade> ... </grade> tags."""
    rubric = vf.JudgeRubric(
        judge_client=judge_client,
        judge_model=judge_model,
        judge_prompt=judge_prompt,
    )

    grade_parser = vf.XMLParser(fields=["grade"], answer_field="grade")
    def grade_reward(prompt, completion, answer, state, **kwargs) -> float:
        judge_response = rubric.judge(prompt, completion, answer, state, **kwargs)
        judge_grade = (
            (grade_parser.parse_answer(judge_response) or "F")
            .strip()
            .replace("+", "")
            .replace("-", "")
            .upper()
        )
        return {
            "A": 1.0,
            "B": 0.75,
            "C": 0.5,
            "D": 0.25,
        }.get(judge_grade, 0.0)

    rubric.add_reward_func(grade_reward, weight=1.0)

    return vf.SingleTurnEnv(
        message_type="completion",
        dataset=dataset,
        parser=vf.Parser(),
        rubric=rubric,
        sampling_args={
            "stop": ["\n"],
        },
    )
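The splitting contract of `make_cut` above can be checked directly; a minimal sketch, assuming `make_cut` is imported from this module and using a synthetic 120-word paragraph:

```python
# Sketch only: make_cut splits without rewriting the text and keeps at least
# ~25 words on each side of the cut (the dataset is filtered to >100 spaces).
sample = " ".join(f"word{i}" for i in range(120))  # 119 spaces, like a filtered paragraph
cut = make_cut(sample)
assert cut["prompt"] + cut["answer"] == sample  # prefix and answer reassemble the original
assert len(cut["prompt"].split()) >= 25         # enough context before the split
assert len(cut["answer"].split()) >= 25         # enough ground-truth continuation after it
```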
Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
import verifiers as vf

"""
# install
vf-install vf-continuation-quality (-p /path/to/environments)

# quick eval
vf-eval vf-continuation-quality (-m model_name in endpoints.py)

inference:
CUDA_VISIBLE_DEVICES=0 vf-vllm --model Qwen/Qwen2.5-0.5B \
    --enforce-eager --disable-log-requests

training:
CUDA_VISIBLE_DEVICES=1 accelerate launch --num-processes 1 \
    --config-file configs/zero3.yaml examples/grpo/train_continuation_quality.py
"""

model_name = "Qwen/Qwen2.5-0.5B"
vf_env = vf.load_environment(env_id="vf-continuation-quality")
model, tokenizer = vf.get_model_and_tokenizer(model_name)
trainer = vf.GRPOTrainer(
    env=vf_env,
    model=model,
    processing_class=tokenizer,
    args=vf.grpo_defaults(run_name="continuation-quality"),
)
trainer.train()

verifiers/envs/environment.py

Lines changed: 117 additions & 6 deletions
@@ -12,6 +12,7 @@
 from verifiers.parsers.parser import Parser
 from verifiers.rubrics.rubric import Rubric
 from verifiers.types import (
+    Completion,
     ChatCompletion,
     ChatCompletionToolParam,
     ChatMessage,
@@ -656,6 +657,21 @@ def parse_chat_completion_logprobs(
         ]
         return logprobs
 
+    def parse_completion_logprobs(
+        self, completion: Completion
+    ) -> List[float]:
+        """Parses the completion logprobs from a vLLM completion"""
+        assert len(completion.choices) == 1, (
+            "Response should always have one choice"
+        )
+        assert completion.choices[0].logprobs is not None, (
+            "Logprobs should not be None. Make sure to set logprobs=True in the extra body when making the request to /v1/completions"
+        )
+        assert completion.choices[0].logprobs.token_logprobs is not None, (
+            "Logprob token_logprobs should not be None. Make sure to set logprobs=True in the extra body when making the request to /v1/completions"
+        )
+        return completion.choices[0].logprobs.token_logprobs
+
     def parse_chat_completion_tokens(
         self, chat_completion: ChatCompletion
     ) -> list[int]:
@@ -670,11 +686,32 @@ def parse_chat_completion_tokens(
             "Logprob content should not be None. Make sure to set logprobs=True in the extra body when making the request to /v1/chat/completions"
         )
         tokens = [
+            # tokens are token_id:<int> because we request `return_tokens_as_token_ids` from vllm in GRPOTrainer
             int(token.token.split(":")[-1])
             for token in chat_completion.choices[0].logprobs.content
         ]
         return tokens
 
+    def parse_completion_tokens(
+        self, completion: Completion
+    ) -> List[int]:
+        """Parses the output token ids from a completion returned by the vLLM OAI server."""
+        assert len(completion.choices) == 1, (
+            "Response should always have one choice"
+        )
+        assert completion.choices[0].logprobs is not None, (
+            "Logprobs should not be None. Make sure to set logprobs=True in the extra body when making the request to /v1/completions"
+        )
+        assert completion.choices[0].logprobs.tokens is not None, (
+            "Logprob tokens should not be None. Make sure to set logprobs=True in the extra body when making the request to /v1/completions"
+        )
+        tokens = [
+            # tokens are token_id:<int> because we request `return_tokens_as_token_ids` from vllm in GRPOTrainer
+            int(token.split(":")[-1])
+            for token in completion.choices[0].logprobs.tokens
+        ]
+        return tokens
+
     def process_chat_format_vllm(
         self,
         prompt: list[ChatMessage],
@@ -759,6 +796,77 @@ def process_chat_format_vllm(
             completion_logprobs,
         )
 
+    def process_completion_format_vllm(
+        self,
+        prompt: str,
+        completion: str,
+        state: State,
+        processing_class: "PreTrainedTokenizerBase",
+        mask_env_responses: bool = False,
+    ) -> Tuple[List[int], List[int], List[int], List[int], List[float]]:
+        """
+        Process completion format conversations using incremental prefixes.
+        """
+        responses: list[Completion] = state["responses"]
+        responses_start_idx: list[int] = state["responses_start_idx"]
+        assert len(responses) == len(responses_start_idx), "Should have an index for each completion response"
+
+        idx = 0
+        zipped: list[tuple[str, Completion | None]] = []
+        for response, response_start_idx in zip(responses, responses_start_idx):
+            if response_start_idx > idx:
+                # non-model-generated section
+                zipped.append((completion[idx:response_start_idx], None))
+            response_text = response.choices[0].text or ""
+            zipped.append((response_text, response))
+            idx = response_start_idx + len(response_text)
+        assert idx == len(completion), "Completion not fully consumed"
+
+        prompt_ids: list[int] = processing_class.encode(prompt)
+        rollout_consumed = prompt
+        prompt_mask: list[int] = [0] * len(prompt_ids)
+        completion_ids: list[int] = []
+        completion_mask: list[int] = []
+        completion_logprobs: list[float] = []
+        i = 0
+        while i < len(zipped):
+            text, response = zipped[i]
+            # model-generated case -- use response
+            if response is not None:
+                completion_turn_ids = self.parse_completion_tokens(response)
+                completion_turn_mask = [1] * len(completion_turn_ids)
+                completion_turn_logprobs = self.parse_completion_logprobs(response)
+                completion_ids.extend(completion_turn_ids)
+                completion_mask.extend(completion_turn_mask)
+                completion_logprobs.extend(completion_turn_logprobs)
+                rollout_consumed += text
+                i += 1
+            # non-model-generated (user/tool case) -- use text
+            else:
+                token_prefix: list[int] = processing_class.encode(rollout_consumed)
+                token_prefix_with_turn: list[int] = processing_class.encode(rollout_consumed + text)
+                assert token_prefix_with_turn[: len(token_prefix)] == token_prefix, (
+                    f"Token prefix mismatch. Token prefix: {token_prefix}, token prefix with turn: {token_prefix_with_turn}"
+                )
+                completion_turn_ids = token_prefix_with_turn[len(token_prefix) :]
+                if mask_env_responses:
+                    completion_turn_mask = [0] * len(completion_turn_ids)
+                else:
+                    completion_turn_mask = [1] * len(completion_turn_ids)
+                completion_turn_logprobs = [0.0] * len(completion_turn_ids)
+                completion_ids.extend(completion_turn_ids)
+                completion_mask.extend(completion_turn_mask)
+                completion_logprobs.extend(completion_turn_logprobs)
+                rollout_consumed += text
+                i += 1
+        return (
+            prompt_ids,
+            prompt_mask,
+            completion_ids,
+            completion_mask,
+            completion_logprobs,
+        )
+
     def process_env_results_vllm(
         self,
         prompts: list[Messages],
@@ -775,10 +883,8 @@ def process_env_results_vllm(
         Process results with vLLM tokens/logprobs.
         """
         # Determine format from first prompt
+        # TODO: why not from self.message_type?
         is_chat_format = isinstance(prompts[0], list)
-        assert is_chat_format, (
-            "vLLM output parsing is not yet supported for completion format"
-        )
 
         all_prompt_ids = []
         all_prompt_masks = []
@@ -803,10 +909,15 @@ def process_env_results_vllm(
                 )
             else:
                 assert isinstance(prompt, str) and isinstance(completion, str)
-                prompt_ids, prompt_mask, completion_ids, completion_mask = (
-                    self.process_completion_format(prompt, completion, processing_class)
+                (
+                    prompt_ids,
+                    prompt_mask,
+                    completion_ids,
+                    completion_mask,
+                    completion_logprobs,
+                ) = self.process_completion_format_vllm(
+                    prompt, completion, state, processing_class, mask_env_responses
                 )
-                completion_logprobs = [0] * len(completion_ids)
             is_truncated = False
             if max_seq_len > 0 and len(prompt_ids) + len(completion_ids) > max_seq_len:
                 if len(prompt_ids) > max_seq_len:
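Both `parse_chat_completion_tokens` and the new `parse_completion_tokens` rely on vLLM returning each token as a `token_id:<int>` string (because `return_tokens_as_token_ids` is requested from vLLM in GRPOTrainer). A minimal sketch of that recovery step, with made-up token values:

```python
# Sketch only: recovering integer token ids from vLLM's "token_id:<int>" strings.
# The literal values below are hypothetical.
logprob_tokens = ["token_id:15339", "token_id:1917"]
token_ids = [int(tok.split(":")[-1]) for tok in logprob_tokens]
assert token_ids == [15339, 1917]
```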
