25 changes: 15 additions & 10 deletions .github/scripts/generate_vllm_benchmark_matrix.py
@@ -26,6 +26,7 @@
     4: [
         "linux.aws.h100.4",
         "linux.rocm.gpu.mi300.4",
+        "intel-cpu-emr",
     ],
     8: [
         "linux.aws.h100.8",
@@ -76,10 +77,11 @@ def parse_args() -> Any:
         help="the comma-separated list of models to benchmark",
     )
     parser.add_argument(
-        "--gpus",
+        "--platforms",
         type=str,
         default="",
-        help="the comma-separated list of GPUs to benchmark",
+        help="the comma-separated list of platforms to benchmark",
+        required=True,
     )
 
     return parser.parse_args()
@@ -107,18 +109,21 @@ def set_output(name: str, val: Any) -> None:
 
 
 def generate_benchmark_matrix(
-    benchmark_configs_dir: str, models: List[str], gpus: List[str]
+    benchmark_configs_dir: str, models: List[str], platforms: List[str]
 ) -> Dict[str, Any]:
     """
     Parse all the JSON files in vLLM benchmark configs directory to get the
-    model name and tensor parallel size (aka number of GPUs)
+    model name and tensor parallel size (aka number of GPUs or CPU NUMA nodes)
     """
-    use_all_gpus = True if not gpus else False
 
+    use_all_platforms = True if not platforms else False
+
     benchmark_matrix: Dict[str, Any] = {
         "include": [],
     }
 
+    selected_models = []
+
     for file in glob.glob(f"{benchmark_configs_dir}/*.json"):
         with open(file) as f:
             try:
@@ -155,12 +160,12 @@ def generate_benchmark_matrix(
 
         for runner in RUNNERS_MAPPING[tp]:
             found_runner = False
-            for gpu in gpus:
-                if gpu.lower() in runner:
+            for platform in platforms:
+                if platform.lower() in runner:
                     found_runner = True
                     break
 
-            if found_runner or use_all_gpus:
+            if found_runner or use_all_platforms:
                 benchmark_matrix["include"].append(
                     {
                         "runner": runner,
@@ -176,11 +181,11 @@
 def main() -> None:
     args = parse_args()
     models = [m.strip().lower() for m in args.models.split(",") if m.strip()]
-    gpus = [m.strip().lower() for m in args.gpus.split(",") if m.strip()]
+    platforms = [m.strip().lower() for m in args.platforms.split(",") if m.strip()]
     benchmark_matrix = generate_benchmark_matrix(
         args.benchmark_configs_dir,
         models,
-        gpus,
+        platforms,
     )
     set_output("benchmark_matrix", benchmark_matrix)

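The platform filter above is a plain substring match against runner labels. Here is a minimal, self-contained sketch of that selection logic; the trimmed RUNNERS_MAPPING is illustrative, while the real mapping lives at the top of generate_vllm_benchmark_matrix.py.

# Illustrative sketch (trimmed mapping; the real one lives in the script).
RUNNERS_MAPPING = {
    1: ["linux.aws.h100", "linux.rocm.gpu.mi300.2"],
    4: ["linux.aws.h100.4", "linux.rocm.gpu.mi300.4", "intel-cpu-emr"],
}

def select_runners(tp, platforms):
    # An empty platform list selects every runner for this tensor parallel
    # size, mirroring use_all_platforms above.
    use_all_platforms = not platforms
    return [
        runner
        for runner in RUNNERS_MAPPING[tp]
        if use_all_platforms
        or any(platform.lower() in runner for platform in platforms)
    ]

# "h100" substring-matches "linux.aws.h100.4"; "emr" matches "intel-cpu-emr".
assert select_runners(4, ["h100", "emr"]) == ["linux.aws.h100.4", "intel-cpu-emr"]
assert select_runners(4, []) == RUNNERS_MAPPING[4]

Because the match is a substring test, a platform such as mi300 selects every ROCm runner whose label contains it, and an empty list keeps the old run-everything behavior.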
12 changes: 10 additions & 2 deletions .github/scripts/setup_vllm_benchmark.py
@@ -61,17 +61,24 @@ def parse_args() -> Any:
         help="the list of models to benchmark",
         required=True,
     )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="",
+        help="device for the runner",
+        required=True,
+    )
 
     return parser.parse_args()
 
 
 def setup_benchmark_configs(
-    from_benchmark_configs_dir: str, to_benchmark_configs_dir: str, models: List[str]
+    from_benchmark_configs_dir: str, to_benchmark_configs_dir: str, models: List[str], device: str
 ) -> None:
     """
     Setup the benchmark configs to run on this runner
     """
-    for file in glob.glob(f"{from_benchmark_configs_dir}/*.json"):
+    for file in glob.glob(f"{from_benchmark_configs_dir}/*{device}.json"):
         filename = os.path.basename(file)
         benchmark_configs = []

@@ -108,6 +115,7 @@ def main() -> None:
         args.from_benchmark_configs_dir,
         args.to_benchmark_configs_dir,
         args.models.split(","),
+        args.device,
     )


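The new --device flag narrows the glob from *.json to *{device}.json. A quick sketch of that matching behavior, using hypothetical file names that mirror vllm-benchmarks/benchmarks:

from fnmatch import fnmatch

# Hypothetical names mirroring vllm-benchmarks/benchmarks.
files = ["latency-tests.json", "throughput-tests.json", "latency-tests-cpu.json"]

def matched(device):
    # Same pattern the script builds: f"*{device}.json"
    return [f for f in files if fnmatch(f, f"*{device}.json")]

print(matched("cpu"))   # ['latency-tests-cpu.json']
print(matched(""))      # all three files: an empty device degrades to "*.json"
print(matched("cuda"))  # [] -- no file carries a "cuda" suffix

The last case is exactly the bug flagged in the review thread further down: cuda and rocm produce a suffix that no config file carries.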
46 changes: 30 additions & 16 deletions .github/workflows/vllm-benchmark.yml
@@ -20,9 +20,9 @@ on:
           A comma-separated list of models to benchmark, leave empty to run everything
         required: false
         type: string
-      gpus:
+      platforms:
         description: |
-          A comma-separated list of GPUs to benchmark, i.e. h100, mi300
+          A comma-separated list of platforms to benchmark, i.e. h100, mi300, emr
         required: true
         type: string
         default: h100,mi300
@@ -53,15 +53,15 @@ jobs:
         shell: bash
         env:
           MODELS: ${{ inputs.models || '' }}
-          GPUS: ${{ inputs.gpus || '' }}
+          PLATFORMS: ${{ inputs.platforms || '' }}
         run: |
           set -eux
 
           # The generated matrix is grouped by model and runner
           python .github/scripts/generate_vllm_benchmark_matrix.py \
             --benchmark-configs-dir vllm-benchmarks/benchmarks \
             --models "${MODELS}" \
-            --gpus "${GPUS}"
+            --platforms "${PLATFORMS}"
 
   benchmarks:
     name: Run vLLM benchmarks
@@ -103,8 +103,9 @@
             DEVICE_NAME=rocm
             rocm-smi
           else
-            echo "Only CUDA and ROCm benchmarks are supported at the moment"
-            exit 1
+            echo "No accelerators. Use CPU instead"
+            DEVICE_NAME=cpu
+            lscpu
           fi
           echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV

@@ -118,6 +119,8 @@
           DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}')
         elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
           DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
+        elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
+          DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
         fi
         echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV

@@ -126,22 +129,25 @@
         run: |
           set -eux
 
-          if [[ "${DEVICE_NAME}" == "cuda" ]]; then
-            pip install -r .github/scripts/requirements.txt
-          elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
+          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
             pip install -r .github/scripts/requirements.txt \
               --extra-index-url https://download.pytorch.org/whl/rocm6.3
+          else
+            pip install -r .github/scripts/requirements.txt
           fi
 
       - name: Set Docker registry
         shell: bash
         run: |
-          if [[ "${DEVICE_NAME}" == "cuda" ]]; then
-            DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo
-          elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
+          DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo
+          DOCKER_IMAGE_POSTFIX=""
+          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
             DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci
+          elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
+            DOCKER_IMAGE_POSTFIX=-cpu
           fi
           echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV
+          echo "DOCKER_IMAGE_POSTFIX=$DOCKER_IMAGE_POSTFIX" >> $GITHUB_ENV

       - name: Check for last benchmark commit
         working-directory: vllm-benchmarks
@@ -160,7 +166,7 @@
           # Check if the image is there, if it doesn't then check an older one
           # because the commit is too recent
           HEAD_SHA=$(git rev-parse --verify HEAD~${i})
-          DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}"
+          DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_POSTFIX}"
 
           # No Docker image available yet because the commit is too recent
           if ! docker manifest inspect "${DOCKER_IMAGE}"; then
@@ -197,6 +203,7 @@
       - name: Setup benchmark tests
         env:
           MODELS: ${{ matrix.models }}
+          ARCH: ${{ inputs.arch || '' }}
         run: |
           set -eux

@@ -209,7 +216,8 @@
           python .github/scripts/setup_vllm_benchmark.py \
             --from-benchmark-configs-dir vllm-benchmarks/benchmarks \
             --to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \
-            --models "${MODELS}"
+            --models "${MODELS}" \
+            --device "${DEVICE_NAME// /_}"
Review comment on the --device flag:

huydhn (Contributor) commented on Jul 9, 2025:

There is a bug here where DEVICE_NAME is set to cuda or rocm for the non-CPU cases. In those cases, the logic in .github/scripts/setup_vllm_benchmark.py will fail to find the JSON benchmark suites, because those files don't have a _cuda or _rocm suffix; only the CPU suite has a _cpu suffix. DEVICE_NAME should just be empty in these cases.

You can see that https://github.com/pytorch/pytorch-integration-testing/actions/runs/16163751659/job/45620654542#step:13:71 found no JSON file.

The author (Collaborator) replied:

@huydhn you are right. Made a quick change; hopefully it fixes the issue.
           pushd vllm-benchmarks/vllm
           ls -lah .buildkite/nightly-benchmarks/tests
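Per the review thread above, the fix amounts to mapping accelerator devices to an empty suffix so the glob in setup_vllm_benchmark.py degrades back to *.json. A hedged sketch of that mapping; the helper name is illustrative, not necessarily the committed change:

def device_suffix(device_name):
    # Only the CPU suite carries a filename suffix (latency-tests-cpu.json);
    # CUDA and ROCm suites are plain *.json, so those devices map to "".
    return device_name if device_name == "cpu" else ""

assert device_suffix("cuda") == ""     # glob stays *.json
assert device_suffix("rocm") == ""     # glob stays *.json
assert device_suffix("cpu") == "cpu"   # glob becomes *cpu.json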
@@ -221,13 +229,18 @@
           SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
           SCCACHE_REGION: us-east-1
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
-          DOCKER_IMAGE: ${{ env.DOCKER_IMAGE_PREFIX }}:${{ env.HEAD_SHA }}
+          DOCKER_IMAGE: ${{ env.DOCKER_IMAGE_PREFIX }}:${{ env.HEAD_SHA }}${{ env.DOCKER_IMAGE_POSTFIX }}
           # vLLM-related environment variables
           ENGINE_VERSION: v1
           SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
+          ARCH: ${{ env.DEVICE_NAME }}
         run: |
           set -x
 
+          if [[ "$ARCH" == "cpu" ]]; then
+            on_cpu=1
+          else
+            on_cpu=0
+          fi
           docker run \
             ${GPU_FLAG:-} \
             ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
@@ -238,6 +251,7 @@
             -e HF_TOKEN \
             -e ENGINE_VERSION \
             -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
+            -e ON_CPU="${on_cpu}" \
             --ipc=host \
             --tty \
             --security-opt seccomp=unconfined \
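Across this workflow, the final image reference is assembled from a registry prefix, the commit SHA, and an optional -cpu postfix. A small sketch of the composition; the tag values are illustrative:

def image_ref(prefix, head_sha, postfix=""):
    # Mirrors DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_POSTFIX}"
    return f"{prefix}:{head_sha}{postfix}"

# CUDA keeps the default ECR prefix and an empty postfix...
print(image_ref("public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo", "abc123"))
# ...ROCm swaps the prefix, and CPU appends -cpu to the default one.
print(image_ref("docker.io/rocm/vllm-ci", "abc123"))
print(image_ref("public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo", "abc123", "-cpu"))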
30 changes: 30 additions & 0 deletions vllm-benchmarks/benchmarks/latency-tests-cpu.json
@@ -0,0 +1,30 @@
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "environment_variables": {
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    },
+    {
+        "test_name": "latency_llama8B_tp4",
+        "environment_variables": {
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    }
+]
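Each entry above follows the shape that generate_vllm_benchmark_matrix.py parses: it reads the model name and tensor_parallel_size to pick a runner bucket. A minimal sketch of that parse, assuming the JSON layout shown here:

import json

# Load a benchmark suite the way the matrix generator consumes it: the
# tensor parallel size keys into RUNNERS_MAPPING (1 maps to single-device
# runners, 4 to the bucket that now includes intel-cpu-emr).
with open("vllm-benchmarks/benchmarks/latency-tests-cpu.json") as f:
    for config in json.load(f):
        params = config["parameters"]
        print(f'{config["test_name"]}: {params["model"]} '
              f'needs a {params["tensor_parallel_size"]}-way runner')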