Skip to content

Commit ec0ac36

Browse files
committed
first draft to enable CPU benchmark
1 parent 319ad22 commit ec0ac36

File tree

6 files changed

+217
-6
lines changed

6 files changed

+217
-6
lines changed

.github/scripts/generate_vllm_benchmark_matrix.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@
3131
"linux.aws.h100.8",
3232
"linux.rocm.gpu.mi300.8",
3333
],
34+
2: [
35+
"intel-cpu-emr",
36+
],
3437
}
3538

3639
# All the different names vLLM uses to refer to their benchmark configs
@@ -81,6 +84,14 @@ def parse_args() -> Any:
8184
default="",
8285
help="the comma-separated list of GPUs to benchmark",
8386
)
87+
parser.add_argument(
88+
"--arch",
89+
type=str,
90+
default="",
91+
action=ValidateDir,
92+
    help="architecture of the runner",
93+
required=True,
94+
)
8495

8596
return parser.parse_args()
8697

@@ -107,7 +118,7 @@ def set_output(name: str, val: Any) -> None:
107118

108119

109120
def generate_benchmark_matrix(
110-
benchmark_configs_dir: str, models: List[str], gpus: List[str]
121+
benchmark_configs_dir: str, models: List[str], gpus: List[str], arch: str
111122
) -> Dict[str, Any]:
112123
"""
113124
Parse all the JSON files in vLLM benchmark configs directory to get the
@@ -119,8 +130,7 @@ def generate_benchmark_matrix(
119130
benchmark_matrix: Dict[str, Any] = {
120131
"include": [],
121132
}
122-
123-
for file in glob.glob(f"{benchmark_configs_dir}/*.json"):
133+
for file in glob.glob(f"{benchmark_configs_dir}/*{arch}.json"):
124134
with open(file) as f:
125135
try:
126136
configs = json.load(f)
@@ -180,6 +190,7 @@ def main() -> None:
180190
args.benchmark_configs_dir,
181191
models,
182192
gpus,
193+
args.arch,
183194
)
184195
set_output("benchmark_matrix", benchmark_matrix)
185196

.github/scripts/setup_vllm_benchmark.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,17 +61,25 @@ def parse_args() -> Any:
6161
help="the list of models to benchmark",
6262
required=True,
6363
)
64+
parser.add_argument(
65+
"--arch",
66+
type=str,
67+
default="",
68+
action=ValidateDir,
69+
    help="architecture of the runner",
70+
required=True,
71+
)
6472

6573
return parser.parse_args()
6674

6775

6876
def setup_benchmark_configs(
69-
from_benchmark_configs_dir: str, to_benchmark_configs_dir: str, models: List[str]
77+
from_benchmark_configs_dir: str, to_benchmark_configs_dir: str, models: List[str], arch: str
7078
) -> None:
7179
"""
7280
Setup the benchmark configs to run on this runner
7381
"""
74-
for file in glob.glob(f"{from_benchmark_configs_dir}/*.json"):
82+
for file in glob.glob(f"{from_benchmark_configs_dir}/*{arch}.json"):
7583
filename = os.path.basename(file)
7684
benchmark_configs = []
7785

@@ -108,6 +116,7 @@ def main() -> None:
108116
args.from_benchmark_configs_dir,
109117
args.to_benchmark_configs_dir,
110118
args.models.split(","),
119+
args.arch,
111120
)
112121

113122

.github/workflows/vllm-benchmark.yml

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ jobs:
5454
env:
5555
MODELS: ${{ inputs.models || '' }}
5656
GPUS: ${{ inputs.gpus || '' }}
57+
ARCH: ${{ inputs.arch || '' }}
5758
run: |
5859
set -eux
5960
@@ -62,6 +63,7 @@ jobs:
6263
--benchmark-configs-dir vllm-benchmarks/benchmarks \
6364
--models "${MODELS}" \
6465
    --gpus "${GPUS}" \
66+
--arch "${ARCH}"
6567
6668
benchmarks:
6769
name: Run vLLM benchmarks
@@ -197,6 +199,7 @@ jobs:
197199
- name: Setup benchmark tests
198200
env:
199201
MODELS: ${{ matrix.models }}
202+
ARCH: ${{ inputs.arch || '' }}
200203
run: |
201204
set -eux
202205
@@ -210,6 +213,7 @@ jobs:
210213
--from-benchmark-configs-dir vllm-benchmarks/benchmarks \
211214
--to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \
212215
    --models "${MODELS}" \
216+
--arch "${ARCH}"
213217
214218
pushd vllm-benchmarks/vllm
215219
ls -lah .buildkite/nightly-benchmarks/tests
@@ -225,9 +229,12 @@ jobs:
225229
# vLLM-related environment variables
226230
ENGINE_VERSION: v1
227231
SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
232+
ARCH: ${{ inputs.arch || '' }}
228233
run: |
229234
set -x
230-
235+
if [[ "$ARCH" == "cpu" ]]; then
236+
on_cpu=1
237+
fi
231238
docker run \
232239
${GPU_FLAG:-} \
233240
${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
@@ -238,6 +245,7 @@ jobs:
238245
-e HF_TOKEN \
239246
-e ENGINE_VERSION \
240247
-e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
248+
-e ON_CPU="${on_cpu}" \
241249
--ipc=host \
242250
--tty \
243251
--security-opt seccomp=unconfined \
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
[
2+
{
3+
"test_name": "latency_llama8B_tp1",
4+
"environment_variables": {
5+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
6+
"VLLM_CPU_KVCACHE_SPACE": 40
7+
},
8+
"parameters": {
9+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
10+
"tensor_parallel_size": 1,
11+
"load_format": "dummy",
12+
"num_iters_warmup": 5,
13+
"num_iters": 15
14+
}
15+
},
16+
{
17+
"test_name": "latency_llama8B_tp4",
18+
"environment_variables": {
19+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
20+
"VLLM_CPU_KVCACHE_SPACE": 40
21+
},
22+
"parameters": {
23+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
24+
"tensor_parallel_size": 4,
25+
"load_format": "dummy",
26+
"num_iters_warmup": 5,
27+
"num_iters": 15
28+
}
29+
}
30+
]
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
[
2+
{
3+
"test_name": "serving_llama8B_tp1_sharegpt",
4+
"qps_list": [1, 4, 16, "inf"],
5+
"server_environment_variables": {
6+
"VLLM_RPC_TIMEOUT": 100000,
7+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
8+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
9+
"VLLM_CPU_KVCACHE_SPACE": 40
10+
},
11+
"server_parameters": {
12+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
13+
"tensor_parallel_size": 1,
14+
"device": "cpu",
15+
"dtype": "bfloat16",
16+
"distributed_executor_backend": "mp",
17+
"block_size": 128,
18+
"trust_remote_code": "",
19+
"disable_log_stats": "",
20+
"disable_log_requests": "",
21+
"load_format": "dummy"
22+
},
23+
"client_parameters": {
24+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
25+
"backend": "vllm",
26+
"dataset_name": "sharegpt",
27+
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
28+
"num_prompts": 200
29+
}
30+
},
31+
{
32+
"test_name": "serving_llama8B_tp2_sharegpt",
33+
"qps_list": [1, 4, 16, "inf"],
34+
"server_environment_variables": {
35+
"VLLM_RPC_TIMEOUT": 100000,
36+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
37+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
38+
"VLLM_CPU_KVCACHE_SPACE": 40
39+
},
40+
"server_parameters": {
41+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
42+
"tensor_parallel_size": 2,
43+
"device": "cpu",
44+
"dtype": "bfloat16",
45+
"distributed_executor_backend": "mp",
46+
"block_size": 128,
47+
"trust_remote_code": "",
48+
"disable_log_stats": "",
49+
"disable_log_requests": "",
50+
"load_format": "dummy"
51+
},
52+
"client_parameters": {
53+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
54+
"backend": "vllm",
55+
"dataset_name": "sharegpt",
56+
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
57+
"num_prompts": 200
58+
}
59+
},
60+
{
61+
"test_name": "serving_llama8B_tp4_sharegpt",
62+
"qps_list": [1, 4, 16, "inf"],
63+
"server_environment_variables": {
64+
"VLLM_RPC_TIMEOUT": 100000,
65+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
66+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
67+
"VLLM_CPU_KVCACHE_SPACE": 40
68+
},
69+
"server_parameters": {
70+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
71+
"tensor_parallel_size": 4,
72+
"device": "cpu",
73+
"dtype": "bfloat16",
74+
"distributed_executor_backend": "mp",
75+
"block_size": 128,
76+
"trust_remote_code": "",
77+
"disable_log_stats": "",
78+
"disable_log_requests": "",
79+
"load_format": "dummy"
80+
},
81+
"client_parameters": {
82+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
83+
"backend": "vllm",
84+
"dataset_name": "sharegpt",
85+
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
86+
"num_prompts": 200
87+
}
88+
},
89+
{
90+
"test_name": "serving_llama8B_tp4_random_1024_128",
91+
"qps_list": [1, 4, 16, "inf"],
92+
"server_environment_variables": {
93+
"VLLM_RPC_TIMEOUT": 100000,
94+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
95+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
96+
"VLLM_CPU_KVCACHE_SPACE": 40
97+
},
98+
"server_parameters": {
99+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
100+
"tensor_parallel_size": 4,
101+
"device": "cpu",
102+
"dtype": "bfloat16",
103+
"distributed_executor_backend": "mp",
104+
"block_size": 128,
105+
"trust_remote_code": "",
106+
"enable_chunked_prefill": "",
107+
"disable_log_stats": "",
108+
"disable_log_requests": "",
109+
"load_format": "dummy"
110+
},
111+
"client_parameters": {
112+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
113+
"backend": "vllm",
114+
"dataset_name": "random",
115+
"random-input-len": 1024,
116+
"random-output-len": 128,
117+
"ignore-eos": "",
118+
"num_prompts": 100
119+
}
120+
}
121+
]
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
[
2+
{
3+
"test_name": "throughput_llama8B_tp1",
4+
"environment_variables": {
5+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
6+
"VLLM_CPU_KVCACHE_SPACE": 40
7+
},
8+
"parameters": {
9+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
10+
"tensor_parallel_size": 1,
11+
"load_format": "dummy",
12+
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
13+
"num_prompts": 200,
14+
"backend": "vllm"
15+
}
16+
},
17+
{
18+
"test_name": "throughput_llama8B_tp4",
19+
"environment_variables": {
20+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
21+
"VLLM_CPU_KVCACHE_SPACE": 40
22+
},
23+
"parameters": {
24+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
25+
"tensor_parallel_size": 4,
26+
"load_format": "dummy",
27+
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
28+
"num_prompts": 200,
29+
"backend": "vllm"
30+
}
31+
}
32+
]

0 commit comments

Comments
 (0)