 #
 # For more details, see the code and comments in this file.
 
-
 import argparse
 import asyncio
 import functools
 import heapq
+import json
 import os
 import sys
-import uuid
 import threading
+import uuid
 from contextlib import asynccontextmanager
-from typing import List
+from dataclasses import dataclass
+from typing import Any, List
 
 import httpx
 from fastapi import FastAPI, Request
 # Add uvloop for faster event loop if available
 try:
     import uvloop
+
     asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 except ImportError:
     pass
@@ -324,7 +326,7 @@ async def listen_for_disconnect(request: Request) -> None:
 
 
 def with_cancellation(handler_func):
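     # Race the wrapped handler against a client-disconnect watcher; the
     # handler's result is returned only if it finishes first.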
-
+
     @functools.wraps(handler_func)
     async def wrapper(*args, **kwargs):
         request = kwargs["request"]
@@ -337,9 +339,9 @@ async def wrapper(*args, **kwargs):
         if handler_task in done:
             return handler_task.result()
         return None
-
+
     return wrapper
-
+
 
 app = FastAPI(lifespan=lifespan)
 
@@ -362,7 +364,8 @@ async def send_request_to_service(client: httpx.AsyncClient,
         "remote_host": None,
         "remote_port": None,
         "aborted_request": list(aborted_requests),
-        "metaserver": f"http://{global_args.host}:{global_args.port}/v1/metaserver"
+        "metaserver":
+        f"http://{global_args.host}:{global_args.port}/v1/metaserver"
     }
     req_data["stream"] = False
     req_data["max_tokens"] = 1
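     # The prefill request asks for a single token with streaming disabled;
     # the decoder generates the real completion from the transferred KV.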
@@ -455,72 +458,174 @@ def get_api_request_id(api, req_id):
         return "chatcmpl-" + req_id
 
 
+async def _handle_select_instance(api: str, req_data: Any,
+                                  request_length: int):
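+    # Score-based selection: pick a prefiller, fire the prefill request, wait
+    # for its kv_transfer_params (delivered via /v1/metaserver), then pick a
+    # decoder for the same request.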
+    prefiller_score = proxy_state.calculate_prefill_scores(request_length)
+    logger.debug(
+        f"Request length: {request_length}, Prefiller score: {prefiller_score}"
+    )
+    request_id = await proxy_state.next_req_id()
+    # Select prefiller
+    prefiller_idx = proxy_state.select_prefiller(prefiller_score)
+    prefiller = proxy_state.prefillers[prefiller_idx]
+    result_future = asyncio.Future()  # type: ignore
+    request_id_api = get_api_request_id(api, request_id)
+    proxy_state.req_id_future[request_id_api] = result_future
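+    # The /v1/metaserver endpoint resolves this future when the prefiller
+    # posts back the request's kv_transfer_params.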
+    # Send request to prefiller
+    asyncio.get_running_loop().create_task(
+        send_request_to_service(prefiller.client,
+                                prefiller_idx,
+                                api,
+                                req_data,
+                                request_id,
+                                max_retries=global_args.max_retries,
+                                base_delay=global_args.retry_delay))
+    proxy_state.release_prefiller(prefiller_idx, prefiller_score)
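+    # The prefiller slot is released once the request is dispatched; its KV
+    # accounting is released separately when the decoder starts streaming.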
+
+    response = await result_future
+    del proxy_state.req_id_future[request_id_api]
+    req_data["kv_transfer_params"] = response
+
+    # Select decoder
+    decoder_score = proxy_state.calculate_decode_scores(request_length)
+    logger.debug("Decoder score: %f", decoder_score)
+    # Use the prefiller's kv_transfer_params to select decoder
+    decoder_idx = proxy_state.select_decoder(decoder_score)
+    decoder = proxy_state.decoders[decoder_idx]
+    logger.debug("Using %s %s", prefiller.url, decoder.url)
+    return InstanceInfo(request_id=request_id,
+                        prefiller_idx=prefiller_idx,
+                        prefiller_score=prefiller_score,
+                        prefiller=prefiller,
+                        decoder=decoder,
+                        decoder_idx=decoder_idx,
+                        decoder_score=decoder_score)
+
+
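+# Routing decision for one request: the selected prefiller/decoder pair and
+# the scores needed to release them, so generate_stream can swap in a fresh
+# selection when a request has to be recomputed.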
+@dataclass
+class InstanceInfo:
+    request_id: str
+    prefiller_idx: int
+    prefiller_score: float
+    prefiller: ServerState
+    decoder_idx: int
+    decoder_score: float
+    decoder: ServerState
+
+
 async def _handle_completions(api: str, request: Request):
     try:
         req_data = await request.json()
         req_body = await request.body()
         request_length = len(req_body)
-        prefiller_score = proxy_state.calculate_prefill_scores(request_length)
-        logger.debug(
-            f"Request length: {request_length}, Prefiller score: {prefiller_score}"
-        )
-        request_id = await proxy_state.next_req_id()
-        # Select prefiller
-        prefiller_idx = proxy_state.select_prefiller(prefiller_score)
-        prefiller = proxy_state.prefillers[prefiller_idx]
-        result_future = asyncio.Future()  # type: ignore
-        request_id_api = get_api_request_id(api, request_id)
-        proxy_state.req_id_future[request_id_api] = result_future
-        # Send request to prefiller
-        asyncio.get_running_loop().create_task(send_request_to_service(
-            prefiller.client,
-            prefiller_idx,
-            api,
-            req_data,
-            request_id,
-            max_retries=global_args.max_retries,
-            base_delay=global_args.retry_delay))
-        proxy_state.release_prefiller(prefiller_idx, prefiller_score)
-
-        response = await result_future
-        del proxy_state.req_id_future[request_id_api]
-        req_data["kv_transfer_params"] = response
-
-        # Select decoder
-        decoder_score = proxy_state.calculate_decode_scores(request_length)
-        logger.debug("Decoder score: %f", decoder_score)
-        # Use the prefiller's kv_transfer_params to select decoder
-        decoder_idx = proxy_state.select_decoder(decoder_score)
-        decoder = proxy_state.decoders[decoder_idx]
-        logger.debug("Using %s %s", prefiller.url, decoder.url)
-        # Stream response from decoder
-        released_kv = False
+        instance_info = await _handle_select_instance(api, req_data,
+                                                      request_length)
+        stream_flag = bool(req_data.get("stream", False))
+        chat_flag = "messages" in req_data
+
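+        # Capture the original prompt and max_tokens so the request can be
+        # rebuilt if the decoder asks for a recompute (see generate_stream).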
+        if "prompt" in req_data:
+            origin_prompt = req_data["prompt"]
+        elif chat_flag:
+            messages = req_data["messages"]
+            origin_prompt = messages[0].get("content", "")
+        else:
+            origin_prompt = ""
+        # 16 matches the default max_tokens in vLLM's SamplingParams
+        origin_max_tokens = req_data.get("max_tokens", 16)
+
         async def generate_stream():
-            nonlocal released_kv
+            nonlocal instance_info
+            generated_token = ""
+            released_kv = False
+            retry_count = 0
+            retry = True
+            completion_tokens = 0
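+            # Loop until the decoder finishes without requesting a recompute;
+            # each retry pass routes the rebuilt request to a fresh pair.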
             # Only one await per chunk, minimal logic in loop
             try:
-                async for chunk in stream_service_response_with_retry(
-                        decoder.client,
-                        api,
-                        req_data,
-                        request_id=request_id,
-                        max_retries=global_args.max_retries,
-                        base_delay=global_args.retry_delay):
-                    if not released_kv and chunk:
-                        proxy_state.release_prefiller_kv(
-                            prefiller_idx, prefiller_score)
-                        released_kv = True
-                    yield chunk
+                while retry:
+                    retry = False
+                    async for chunk in stream_service_response_with_retry(
+                            instance_info.decoder.client,
+                            api,
+                            req_data,
+                            request_id=instance_info.request_id,
+                            max_retries=global_args.max_retries,
+                            base_delay=global_args.retry_delay):
+                        if not released_kv and chunk:
+                            proxy_state.release_prefiller_kv(
+                                instance_info.prefiller_idx,
+                                instance_info.prefiller_score)
+                            released_kv = True
+                        chunk_str = chunk.decode("utf-8").strip()
+                        if not chunk_str:
+                            continue
+                        if chunk_str.startswith("data: "):
+                            chunk_str = chunk_str[len("data: "):]
+                        try:
+                            chunk_json = json.loads(chunk_str)
+                        except json.JSONDecodeError:
+                            # Not JSON (e.g. the "[DONE]" sentinel); pass it through.
+                            logger.warning(f"Skipping chunk: {chunk_str}")
+                            yield chunk
+                            continue
+                        choices = chunk_json.get("choices", [])
+                        if not choices:
+                            yield chunk
+                            continue
+
+                        choice = choices[0]
+                        delta = choice.get("delta") or {}
+                        message = choice.get("message") or {}
+                        content = (delta.get("content")
+                                   or message.get("content")
+                                   or choice.get("text") or "")
+                        generated_token += content
+
+                        stop_reason = choice.get("stop_reason")
+                        usage = chunk_json.get("usage", {})
+                        if stream_flag:
+                            completion_tokens += 1
+                        else:
+                            completion_tokens += usage.get("completion_tokens", 0)
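+                        # "recomputed" signals the decoder could not resume from
+                        # the transferred KV: fold the tokens generated so far
+                        # into the prompt and re-route to a new pair.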
+                        if stop_reason == "recomputed":
+                            retry = True
+                            retry_count += 1
+                            if chat_flag:
+                                messages[0]["content"] = (origin_prompt +
+                                                          generated_token)
+                            else:
+                                req_data["prompt"] = (origin_prompt +
+                                                      generated_token)
+                            req_data["max_tokens"] = (origin_max_tokens -
+                                                      completion_tokens +
+                                                      retry_count)
+                            tmp_request_length = len(
+                                json.dumps(req_data).encode("utf-8"))
+                            instance_info = await _handle_select_instance(
+                                api, req_data, tmp_request_length)
+                            break
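+                        # After a retry, a non-streaming response carries only
+                        # the final segment, so substitute the full accumulated
+                        # text before yielding.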
+                        if retry_count > 0 and not stream_flag:
+                            if chat_flag:
+                                choices[0]["message"]["content"] = generated_token
+                            else:
+                                choices[0]["text"] = generated_token
+                            chunk = json.dumps(chunk_json).encode("utf-8")
+                        yield chunk
             except Exception as e:
                 logger.error(
-                    f"Error during streaming from decoder {decoder.url}: {str(e)}; the aborted request {request_id} will be routed to the target prefiller when a new request is ready to dispatch to it"
+                    f"Error during streaming from decoder {instance_info.decoder.url}: {str(e)}; the aborted request {instance_info.request_id} will be routed to the target prefiller when a new request is ready to dispatch to it"
                 )
-                proxy_state.abort_prefiller_request(prefiller_idx, request_id)
-                proxy_state.release_prefiller_kv(prefiller_idx,
-                                                 prefiller_score)
+                proxy_state.abort_prefiller_request(
+                    instance_info.prefiller_idx, instance_info.request_id)
+                proxy_state.release_prefiller_kv(instance_info.prefiller_idx,
+                                                 instance_info.prefiller_score)
 
             # After streaming done, release tokens
-            proxy_state.release_decoder(decoder_idx, decoder_score)
+            proxy_state.release_decoder(instance_info.decoder_idx,
+                                        instance_info.decoder_score)
 
         return StreamingResponse(generate_stream(),
                                  media_type="application/json")
@@ -564,13 +669,12 @@ async def metaserver(request: Request):
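             # The prefiller calls back here with kv_transfer_params; resolve
             # the future that _handle_select_instance is awaiting.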
             result_future = proxy_state.req_id_future[request_id]
             result_future.set_result(req_data)
     except Exception as e:
-        logger.error(
-            f"Post metaserver failed with: {str(e)}"
-        )
+        logger.error(f"Post metaserver failed with: {str(e)}")
 
 
 if __name__ == '__main__':
     global global_args
     global_args = parse_args()
     import uvicorn
+
     uvicorn.run(app, host=global_args.host, port=global_args.port)