From ec0ac36c6c64e686b8332d3a7ab702ee9fca95d1 Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Fri, 13 Jun 2025 16:10:12 -0700 Subject: [PATCH 01/26] first draft to enable CPU benchmark --- .../scripts/generate_vllm_benchmark_matrix.py | 17 ++- .github/scripts/setup_vllm_benchmark.py | 13 +- .github/workflows/vllm-benchmark.yml | 10 +- .../benchmarks/latency-tests-cpu.json | 30 +++++ .../benchmarks/serving-tests-cpu.json | 121 ++++++++++++++++++ .../benchmarks/throughput-tests-cpu.json | 32 +++++ 6 files changed, 217 insertions(+), 6 deletions(-) create mode 100644 vllm-benchmarks/benchmarks/latency-tests-cpu.json create mode 100644 vllm-benchmarks/benchmarks/serving-tests-cpu.json create mode 100644 vllm-benchmarks/benchmarks/throughput-tests-cpu.json diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py index 103fd533..37bc106f 100755 --- a/.github/scripts/generate_vllm_benchmark_matrix.py +++ b/.github/scripts/generate_vllm_benchmark_matrix.py @@ -31,6 +31,9 @@ "linux.aws.h100.8", "linux.rocm.gpu.mi300.8", ], + 2: [ + "intel-cpu-emr", + ], } # All the different names vLLM uses to refer to their benchmark configs @@ -81,6 +84,14 @@ def parse_args() -> Any: default="", help="the comma-separated list of GPUs to benchmark", ) + parser.add_argument( + "--arch", + type=str, + default="", + action=ValidateDir, + help="architect for the runner", + required=True, + ) return parser.parse_args() @@ -107,7 +118,7 @@ def set_output(name: str, val: Any) -> None: def generate_benchmark_matrix( - benchmark_configs_dir: str, models: List[str], gpus: List[str] + benchmark_configs_dir: str, models: List[str], gpus: List[str], arch: str ) -> Dict[str, Any]: """ Parse all the JSON files in vLLM benchmark configs directory to get the @@ -119,8 +130,7 @@ def generate_benchmark_matrix( benchmark_matrix: Dict[str, Any] = { "include": [], } - - for file in glob.glob(f"{benchmark_configs_dir}/*.json"): + for file in glob.glob(f"{benchmark_configs_dir}/*{arch}.json"): with open(file) as f: try: configs = json.load(f) @@ -180,6 +190,7 @@ def main() -> None: args.benchmark_configs_dir, models, gpus, + args.arch, ) set_output("benchmark_matrix", benchmark_matrix) diff --git a/.github/scripts/setup_vllm_benchmark.py b/.github/scripts/setup_vllm_benchmark.py index 98bfa17d..afc6f043 100755 --- a/.github/scripts/setup_vllm_benchmark.py +++ b/.github/scripts/setup_vllm_benchmark.py @@ -61,17 +61,25 @@ def parse_args() -> Any: help="the list of models to benchmark", required=True, ) + parser.add_argument( + "--arch", + type=str, + default="", + action=ValidateDir, + help="architect for the runner", + required=True, + ) return parser.parse_args() def setup_benchmark_configs( - from_benchmark_configs_dir: str, to_benchmark_configs_dir: str, models: List[str] + from_benchmark_configs_dir: str, to_benchmark_configs_dir: str, models: List[str], arch: str ) -> None: """ Setup the benchmark configs to run on this runner """ - for file in glob.glob(f"{from_benchmark_configs_dir}/*.json"): + for file in glob.glob(f"{from_benchmark_configs_dir}/*{arch}.json"): filename = os.path.basename(file) benchmark_configs = [] @@ -108,6 +116,7 @@ def main() -> None: args.from_benchmark_configs_dir, args.to_benchmark_configs_dir, args.models.split(","), + args.arch, ) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index 1483be8b..4a940470 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -54,6 
+54,7 @@ jobs: env: MODELS: ${{ inputs.models || '' }} GPUS: ${{ inputs.gpus || '' }} + ARCH: ${{ inputs.arch || '' }} run: | set -eux @@ -62,6 +63,7 @@ jobs: --benchmark-configs-dir vllm-benchmarks/benchmarks \ --models "${MODELS}" \ --gpus "${GPUS}" + --arch "${ARCH}" benchmarks: name: Run vLLM benchmarks @@ -197,6 +199,7 @@ jobs: - name: Setup benchmark tests env: MODELS: ${{ matrix.models }} + ARCH: ${{ inputs.arch || '' }} run: | set -eux @@ -210,6 +213,7 @@ jobs: --from-benchmark-configs-dir vllm-benchmarks/benchmarks \ --to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \ --models "${MODELS}" + --arch "${ARCH}" pushd vllm-benchmarks/vllm ls -lah .buildkite/nightly-benchmarks/tests @@ -225,9 +229,12 @@ jobs: # vLLM-related environment variables ENGINE_VERSION: v1 SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 + ARCH: ${{ inputs.arch || '' }} run: | set -x - + if [[ "$ARCH" == "cpu" ]]; then + on_cpu=1 + fi docker run \ ${GPU_FLAG:-} \ ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \ @@ -238,6 +245,7 @@ jobs: -e HF_TOKEN \ -e ENGINE_VERSION \ -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ + -e ON_CPU="${on_cpu}" \ --ipc=host \ --tty \ --security-opt seccomp=unconfined \ diff --git a/vllm-benchmarks/benchmarks/latency-tests-cpu.json b/vllm-benchmarks/benchmarks/latency-tests-cpu.json new file mode 100644 index 00000000..da93fdd1 --- /dev/null +++ b/vllm-benchmarks/benchmarks/latency-tests-cpu.json @@ -0,0 +1,30 @@ +[ + { + "test_name": "latency_llama8B_tp1", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + }, + { + "test_name": "latency_llama8B_tp4", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + } +] diff --git a/vllm-benchmarks/benchmarks/serving-tests-cpu.json b/vllm-benchmarks/benchmarks/serving-tests-cpu.json new file mode 100644 index 00000000..cb6df159 --- /dev/null +++ b/vllm-benchmarks/benchmarks/serving-tests-cpu.json @@ -0,0 +1,121 @@ +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "device": "cpu", + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp2_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "device": "cpu", + 
"dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "device": "cpu", + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp4_random_1024_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "device": "cpu", + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 1024, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 100 + } + } +] diff --git a/vllm-benchmarks/benchmarks/throughput-tests-cpu.json b/vllm-benchmarks/benchmarks/throughput-tests-cpu.json new file mode 100644 index 00000000..f159c306 --- /dev/null +++ b/vllm-benchmarks/benchmarks/throughput-tests-cpu.json @@ -0,0 +1,32 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_llama8B_tp4", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + } +] From 1201ea6d3dacb52806a4dbcab73a845898d9861b Mon Sep 17 00:00:00 2001 From: Louie Tsai Date: Mon, 23 Jun 2025 16:22:48 -0700 Subject: [PATCH 02/26] Update .github/workflows/vllm-benchmark.yml Co-authored-by: Huy Do --- .github/workflows/vllm-benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index 4a940470..6ebe898a 100644 
--- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -212,7 +212,7 @@ jobs: python .github/scripts/setup_vllm_benchmark.py \ --from-benchmark-configs-dir vllm-benchmarks/benchmarks \ --to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \ - --models "${MODELS}" + --models "${MODELS}" \ --arch "${ARCH}" pushd vllm-benchmarks/vllm From c253948f1acd59682e0866992169ec1dc647b579 Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Wed, 25 Jun 2025 22:59:32 -0700 Subject: [PATCH 03/26] fix for ROCm changes --- .../scripts/generate_vllm_benchmark_matrix.py | 34 +++++++------------ .github/scripts/setup_vllm_benchmark.py | 11 +++--- .github/workflows/vllm-benchmark.yml | 31 ++++++++++------- 3 files changed, 35 insertions(+), 41 deletions(-) diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py index 37bc106f..a3548e69 100755 --- a/.github/scripts/generate_vllm_benchmark_matrix.py +++ b/.github/scripts/generate_vllm_benchmark_matrix.py @@ -22,6 +22,7 @@ 2: [ "linux.aws.h100.4", "linux.rocm.gpu.mi300.2", + "intel-cpu-emr", ], 4: [ "linux.aws.h100.4", @@ -31,9 +32,6 @@ "linux.aws.h100.8", "linux.rocm.gpu.mi300.8", ], - 2: [ - "intel-cpu-emr", - ], } # All the different names vLLM uses to refer to their benchmark configs @@ -79,17 +77,10 @@ def parse_args() -> Any: help="the comma-separated list of models to benchmark", ) parser.add_argument( - "--gpus", + "--platforms", type=str, default="", - help="the comma-separated list of GPUs to benchmark", - ) - parser.add_argument( - "--arch", - type=str, - default="", - action=ValidateDir, - help="architect for the runner", + help="the comma-separated list of platforms to benchmark", required=True, ) @@ -118,19 +109,19 @@ def set_output(name: str, val: Any) -> None: def generate_benchmark_matrix( - benchmark_configs_dir: str, models: List[str], gpus: List[str], arch: str + benchmark_configs_dir: str, models: List[str], platforms: List[str] ) -> Dict[str, Any]: """ Parse all the JSON files in vLLM benchmark configs directory to get the - model name and tensor parallel size (aka number of GPUs) + model name and tensor parallel size (aka number of GPUs or CPU NUMA nodes) """ get_all_models = True if not models else False - use_all_gpus = True if not gpus else False + use_all_platforms = True if not platforms else False benchmark_matrix: Dict[str, Any] = { "include": [], } - for file in glob.glob(f"{benchmark_configs_dir}/*{arch}.json"): + for file in glob.glob(f"{benchmark_configs_dir}/*.json"): with open(file) as f: try: configs = json.load(f) @@ -164,12 +155,12 @@ def generate_benchmark_matrix( for runner in RUNNERS_MAPPING[tp]: found_runner = False - for gpu in gpus: - if gpu.lower() in runner: + for platform in platforms: + if platform.lower() in runner: found_runner = True break - if found_runner or use_all_gpus: + if found_runner or use_all_platforms: benchmark_matrix["include"].append( { "runner": runner, @@ -185,12 +176,11 @@ def generate_benchmark_matrix( def main() -> None: args = parse_args() models = [m.strip().lower() for m in args.models.split(",") if m.strip()] - gpus = [m.strip().lower() for m in args.gpus.split(",") if m.strip()] + platforms = [m.strip().lower() for m in args.platforms.split(",") if m.strip()] benchmark_matrix = generate_benchmark_matrix( args.benchmark_configs_dir, models, - gpus, - args.arch, + platforms, ) set_output("benchmark_matrix", benchmark_matrix) diff --git 
a/.github/scripts/setup_vllm_benchmark.py b/.github/scripts/setup_vllm_benchmark.py index afc6f043..c8792bba 100755 --- a/.github/scripts/setup_vllm_benchmark.py +++ b/.github/scripts/setup_vllm_benchmark.py @@ -62,11 +62,10 @@ def parse_args() -> Any: required=True, ) parser.add_argument( - "--arch", + "--device", type=str, default="", - action=ValidateDir, - help="architect for the runner", + help="device for the runner", required=True, ) @@ -74,12 +73,12 @@ def parse_args() -> Any: def setup_benchmark_configs( - from_benchmark_configs_dir: str, to_benchmark_configs_dir: str, models: List[str], arch: str + from_benchmark_configs_dir: str, to_benchmark_configs_dir: str, models: List[str], device: str ) -> None: """ Setup the benchmark configs to run on this runner """ - for file in glob.glob(f"{from_benchmark_configs_dir}/*{arch}.json"): + for file in glob.glob(f"{from_benchmark_configs_dir}/*{device}.json"): filename = os.path.basename(file) benchmark_configs = [] @@ -116,7 +115,7 @@ def main() -> None: args.from_benchmark_configs_dir, args.to_benchmark_configs_dir, args.models.split(","), - args.arch, + args.device, ) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index 6ebe898a..efbac1e1 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -20,9 +20,9 @@ on: A comma-separated list of models to benchmark, leave empty to run everything required: false type: string - gpus: + platforms: description: | - A comma-separated list of GPUs to benchmark, i.e. h100, mi300 + A comma-separated list of platforms to benchmark, i.e. h100, mi300, emr required: true type: string default: h100,mi300 @@ -53,8 +53,7 @@ jobs: shell: bash env: MODELS: ${{ inputs.models || '' }} - GPUS: ${{ inputs.gpus || '' }} - ARCH: ${{ inputs.arch || '' }} + PLATFORMS: ${{ inputs.platforms || '' }} run: | set -eux @@ -62,8 +61,7 @@ jobs: python .github/scripts/generate_vllm_benchmark_matrix.py \ --benchmark-configs-dir vllm-benchmarks/benchmarks \ --models "${MODELS}" \ - --gpus "${GPUS}" - --arch "${ARCH}" + --platforms "${PLATFORMS}" benchmarks: name: Run vLLM benchmarks @@ -105,8 +103,9 @@ jobs: DEVICE_NAME=rocm rocm-smi else - echo "Only CUDA and ROCm benchmarks are supported at the moment" - exit 1 + echo "No accelerators. 
Use CPU instead" + DEVICE_NAME=cpu + lscpu fi echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV @@ -120,6 +119,8 @@ jobs: DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}') elif [[ "${DEVICE_NAME}" == "rocm" ]]; then DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs) + elif [[ "${DEVICE_NAME}" == "cpu" ]]; then + DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ") fi echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV @@ -128,11 +129,11 @@ jobs: run: | set -eux - if [[ "${DEVICE_NAME}" == "cuda" ]]; then - pip install -r .github/scripts/requirements.txt - elif [[ "${DEVICE_NAME}" == "rocm" ]]; then + if [[ "${DEVICE_NAME}" == "rocm" ]]; then pip install -r .github/scripts/requirements.txt \ --extra-index-url https://download.pytorch.org/whl/rocm6.3 + else + pip install -r .github/scripts/requirements.txt fi - name: Set Docker registry @@ -142,6 +143,8 @@ jobs: DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo elif [[ "${DEVICE_NAME}" == "rocm" ]]; then DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci + elif [[ "${DEVICE_NAME}" == "cpu" ]]; then + DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo fi echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV @@ -213,7 +216,7 @@ jobs: --from-benchmark-configs-dir vllm-benchmarks/benchmarks \ --to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \ --models "${MODELS}" \ - --arch "${ARCH}" + --device "${DEVICE_NAME// /_}" pushd vllm-benchmarks/vllm ls -lah .buildkite/nightly-benchmarks/tests @@ -229,11 +232,13 @@ jobs: # vLLM-related environment variables ENGINE_VERSION: v1 SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 - ARCH: ${{ inputs.arch || '' }} + ARCH: ${{ env.DEVICE_NAME }} run: | set -x if [[ "$ARCH" == "cpu" ]]; then on_cpu=1 + else + on_cpu=0 fi docker run \ ${GPU_FLAG:-} \ From 1d0271aad8017b662c7730e4c56b7d9a013dc963 Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Wed, 2 Jul 2025 18:33:25 -0700 Subject: [PATCH 04/26] change to use public cpu vllm postmerge registry --- .github/workflows/vllm-benchmark.yml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index efbac1e1..a396d451 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -139,14 +139,15 @@ jobs: - name: Set Docker registry shell: bash run: | - if [[ "${DEVICE_NAME}" == "cuda" ]]; then - DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo - elif [[ "${DEVICE_NAME}" == "rocm" ]]; then + DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo + DOCKER_IMAGE_POSTFIX="" + if [[ "${DEVICE_NAME}" == "rocm" ]]; then DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci elif [[ "${DEVICE_NAME}" == "cpu" ]]; then - DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo + DOCKER_IMAGE_POSTFIX=-cpu fi echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV + echo "DOCKER_IMAGE_POSTFIX=$DOCKER_IMAGE_POSTFIX" >> $GITHUB_ENV - name: Check for last benchmark commit working-directory: vllm-benchmarks @@ -165,7 +166,7 @@ jobs: # Check if the image is there, if it doesn't then check an older one # because the commit is too recent HEAD_SHA=$(git rev-parse --verify HEAD~${i}) - DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}" + DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_POSTFIX}" # No Docker image available yet because the commit 
is too recent if ! docker manifest inspect "${DOCKER_IMAGE}"; then @@ -228,7 +229,7 @@ jobs: SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 SCCACHE_REGION: us-east-1 HF_TOKEN: ${{ secrets.HF_TOKEN }} - DOCKER_IMAGE: ${{ env.DOCKER_IMAGE_PREFIX }}:${{ env.HEAD_SHA }} + DOCKER_IMAGE: ${{ env.DOCKER_IMAGE_PREFIX }}:${{ env.HEAD_SHA }}${{ env.DOCKER_IMAGE_POSTFIX }} # vLLM-related environment variables ENGINE_VERSION: v1 SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 From 9cffd0ecc6e57e8243fcebd5e4861f8d17bca780 Mon Sep 17 00:00:00 2001 From: "Tsai, Louie" Date: Wed, 2 Jul 2025 18:36:06 -0700 Subject: [PATCH 05/26] target on 4 NUMA node EMR machine --- .github/scripts/generate_vllm_benchmark_matrix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py index a3548e69..bdf1ee73 100755 --- a/.github/scripts/generate_vllm_benchmark_matrix.py +++ b/.github/scripts/generate_vllm_benchmark_matrix.py @@ -22,11 +22,11 @@ 2: [ "linux.aws.h100.4", "linux.rocm.gpu.mi300.2", - "intel-cpu-emr", ], 4: [ "linux.aws.h100.4", "linux.rocm.gpu.mi300.4", + "intel-cpu-emr", ], 8: [ "linux.aws.h100.8", From caa0cf6165f2a82901a72d6312db020067dfd438 Mon Sep 17 00:00:00 2001 From: Louie Tsai Date: Thu, 10 Jul 2025 13:34:02 -0700 Subject: [PATCH 06/26] Update vllm-benchmark.yml --- .github/workflows/vllm-benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index a396d451..1e83c298 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -217,7 +217,7 @@ jobs: --from-benchmark-configs-dir vllm-benchmarks/benchmarks \ --to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \ --models "${MODELS}" \ - --device "${DEVICE_NAME// /_}" + --device "${{DOCKER_IMAGE_POSTFIX}" pushd vllm-benchmarks/vllm ls -lah .buildkite/nightly-benchmarks/tests From e12e2c18020a2773c2452719c62d2059afa5e5ef Mon Sep 17 00:00:00 2001 From: Louie Tsai Date: Thu, 10 Jul 2025 13:34:31 -0700 Subject: [PATCH 07/26] Update vllm-benchmark.yml --- .github/workflows/vllm-benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index 1e83c298..43190a16 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -217,7 +217,7 @@ jobs: --from-benchmark-configs-dir vllm-benchmarks/benchmarks \ --to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \ --models "${MODELS}" \ - --device "${{DOCKER_IMAGE_POSTFIX}" + --device "${DOCKER_IMAGE_POSTFIX}" pushd vllm-benchmarks/vllm ls -lah .buildkite/nightly-benchmarks/tests From 42b6b76982e966e6d18936869703745ee6bc6840 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 10 Jul 2025 13:54:29 -0700 Subject: [PATCH 08/26] Fix CPU suffix --- .github/scripts/setup_vllm_benchmark.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/scripts/setup_vllm_benchmark.py b/.github/scripts/setup_vllm_benchmark.py index c8792bba..e2c8a27d 100755 --- a/.github/scripts/setup_vllm_benchmark.py +++ b/.github/scripts/setup_vllm_benchmark.py @@ -115,7 +115,8 @@ def main() -> None: args.from_benchmark_configs_dir, args.to_benchmark_configs_dir, args.models.split(","), - args.device, + # Only need to CPU benchmark for now + args.device if args.device == "cpu" else "", ) From 
e317284317b6bda5afbbdafbef3216d7b04d1cc1 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 10 Jul 2025 14:11:00 -0700 Subject: [PATCH 09/26] Rename benchmark files to include the device name Signed-off-by: Huy Do --- .github/scripts/setup_vllm_benchmark.py | 5 ++--- .../{latency-tests.json => latency-tests_cuda_rocm.json} | 0 .../{serving-tests.json => serving-tests_cuda_rocm.json} | 0 ...throughput-tests.json => throughput-tests_cuda_rocm.json} | 0 4 files changed, 2 insertions(+), 3 deletions(-) rename vllm-benchmarks/benchmarks/{latency-tests.json => latency-tests_cuda_rocm.json} (100%) rename vllm-benchmarks/benchmarks/{serving-tests.json => serving-tests_cuda_rocm.json} (100%) rename vllm-benchmarks/benchmarks/{throughput-tests.json => throughput-tests_cuda_rocm.json} (100%) diff --git a/.github/scripts/setup_vllm_benchmark.py b/.github/scripts/setup_vllm_benchmark.py index e2c8a27d..32959fe2 100755 --- a/.github/scripts/setup_vllm_benchmark.py +++ b/.github/scripts/setup_vllm_benchmark.py @@ -78,7 +78,7 @@ def setup_benchmark_configs( """ Setup the benchmark configs to run on this runner """ - for file in glob.glob(f"{from_benchmark_configs_dir}/*{device}.json"): + for file in glob.glob(f"{from_benchmark_configs_dir}/*_{device}.json"): filename = os.path.basename(file) benchmark_configs = [] @@ -115,8 +115,7 @@ def main() -> None: args.from_benchmark_configs_dir, args.to_benchmark_configs_dir, args.models.split(","), - # Only need to CPU benchmark for now - args.device if args.device == "cpu" else "", + args.device, ) diff --git a/vllm-benchmarks/benchmarks/latency-tests.json b/vllm-benchmarks/benchmarks/latency-tests_cuda_rocm.json similarity index 100% rename from vllm-benchmarks/benchmarks/latency-tests.json rename to vllm-benchmarks/benchmarks/latency-tests_cuda_rocm.json diff --git a/vllm-benchmarks/benchmarks/serving-tests.json b/vllm-benchmarks/benchmarks/serving-tests_cuda_rocm.json similarity index 100% rename from vllm-benchmarks/benchmarks/serving-tests.json rename to vllm-benchmarks/benchmarks/serving-tests_cuda_rocm.json diff --git a/vllm-benchmarks/benchmarks/throughput-tests.json b/vllm-benchmarks/benchmarks/throughput-tests_cuda_rocm.json similarity index 100% rename from vllm-benchmarks/benchmarks/throughput-tests.json rename to vllm-benchmarks/benchmarks/throughput-tests_cuda_rocm.json From 5b2bc4923ad6f7e6256eab027a606955781bd7df Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 10 Jul 2025 19:41:11 -0700 Subject: [PATCH 10/26] Fix model selection for CPU devices Signed-off-by: Huy Do --- .../scripts/generate_vllm_benchmark_matrix.py | 126 ++--- .github/scripts/setup_vllm_benchmark.py | 2 +- .../test_generate_vllm_benchmark_matrix.py | 432 ++++++++++++++++++ .github/workflows/vllm-benchmark.yml | 10 +- ...rocm.json => latency-tests-cuda-rocm.json} | 0 ...rocm.json => serving-tests-cuda-rocm.json} | 0 ...m.json => throughput-tests-cuda-rocm.json} | 0 7 files changed, 512 insertions(+), 58 deletions(-) create mode 100644 .github/scripts/test_generate_vllm_benchmark_matrix.py rename vllm-benchmarks/benchmarks/{latency-tests_cuda_rocm.json => latency-tests-cuda-rocm.json} (100%) rename vllm-benchmarks/benchmarks/{serving-tests_cuda_rocm.json => serving-tests-cuda-rocm.json} (100%) rename vllm-benchmarks/benchmarks/{throughput-tests_cuda_rocm.json => throughput-tests-cuda-rocm.json} (100%) diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py index 469d8de6..419922d6 100755 --- 
a/.github/scripts/generate_vllm_benchmark_matrix.py +++ b/.github/scripts/generate_vllm_benchmark_matrix.py @@ -12,21 +12,24 @@ logging.basicConfig(level=logging.INFO) # Those are H100 runners from https://github.com/pytorch-labs/pytorch-gha-infra/blob/main/multi-tenant/inventory/manual_inventory # while ROCm runner are provided by AMD -RUNNERS_MAPPING = { +TP_TO_RUNNER_MAPPING = { 1: [ "linux.aws.h100", "linux.rocm.gpu.mi300.2", # No single ROCm GPU? + "linux.24xl.spr-metal", ], # NB: There is no 2xH100 runner at the momement, so let's use the next one # in the list here which is 4xH100 2: [ "linux.aws.h100.4", "linux.rocm.gpu.mi300.2", + "linux.24xl.spr-metal", ], 4: [ "linux.aws.h100.4", "linux.rocm.gpu.mi300.4", - "intel-cpu-emr", + # TODO (huydhn): Enable this when Intel's runners are ready + # "intel-cpu-emr", ], 8: [ "linux.aws.h100.8", @@ -34,6 +37,17 @@ ], } +# This mapping is needed to find out the platform of the runner +RUNNER_TO_PLATFORM_MAPPING = { + "linux.aws.h100": "cuda", + "linux.aws.h100.4": "cuda", + "linux.aws.h100.8": "cuda", + "linux.rocm.gpu.mi300.2": "rocm", + "linux.rocm.gpu.mi300.4": "rocm", + "linux.rocm.gpu.mi300.8": "rocm", + "linux.24xl.spr-metal": "cpu", +} + # All the different names vLLM uses to refer to their benchmark configs VLLM_BENCHMARK_CONFIGS_PARAMETER = set( [ @@ -77,10 +91,10 @@ def parse_args() -> Any: help="the comma-separated list of models to benchmark", ) parser.add_argument( - "--platforms", + "--runners", type=str, default="", - help="the comma-separated list of platforms to benchmark", + help="the comma-separated list of runners to run the benchmark", required=True, ) @@ -109,63 +123,71 @@ def set_output(name: str, val: Any) -> None: def generate_benchmark_matrix( - benchmark_configs_dir: str, models: List[str], platforms: List[str] + benchmark_configs_dir: str, models: List[str], runners: List[str] ) -> Dict[str, Any]: """ Parse all the JSON files in vLLM benchmark configs directory to get the model name and tensor parallel size (aka number of GPUs or CPU NUMA nodes) """ - - use_all_platforms = True if not platforms else False - benchmark_matrix: Dict[str, Any] = { "include": [], } + platforms = set() + if not runners: + use_all_runners = True + platforms = set(v for v in RUNNER_TO_PLATFORM_MAPPING.values()) + else: + use_all_runners = False + for k, v in RUNNER_TO_PLATFORM_MAPPING.items(): + for r in runners: + if r.lower() in k: + platforms.add(v) + selected_models = [] - for file in glob.glob(f"{benchmark_configs_dir}/*.json"): - with open(file) as f: - try: - configs = json.load(f) - except json.JSONDecodeError as e: - warning(f"Fail to load {file}: {e}") - continue - - for config in configs: - param = list(VLLM_BENCHMARK_CONFIGS_PARAMETER & set(config.keys())) - assert len(param) == 1 - - benchmark_config = config[param[0]] - if "model" not in benchmark_config: - warning(f"Model name is not set in {benchmark_config}, skipping...") - continue - model = benchmark_config["model"].lower() - - # Dedup - if model in selected_models: - continue - # and only choose the selected model: - if models and model not in models: - continue - selected_models.append(model) - - if "tensor_parallel_size" in benchmark_config: - tp = benchmark_config["tensor_parallel_size"] - elif "tp" in benchmark_config: - tp = benchmark_config["tp"] - else: - tp = 8 - assert tp in RUNNERS_MAPPING - - for runner in RUNNERS_MAPPING[tp]: - found_runner = False - for platform in platforms: - if platform.lower() in runner: - found_runner = True - break 
- - if found_runner or use_all_platforms: + # Gather all possible benchmarks + for platform in platforms: + for file in glob.glob(f"{benchmark_configs_dir}/*-{platform}*.json"): + with open(file) as f: + try: + configs = json.load(f) + except json.JSONDecodeError as e: + warning(f"Fail to load {file}: {e}") + continue + + for config in configs: + param = list(VLLM_BENCHMARK_CONFIGS_PARAMETER & set(config.keys())) + assert len(param) == 1 + + benchmark_config = config[param[0]] + if "model" not in benchmark_config: + warning(f"Model name is not set in {benchmark_config}, skipping...") + continue + model = benchmark_config["model"].lower() + + # Dedup + if model in selected_models: + continue + # and only choose the selected model: + if models and model not in models: + continue + selected_models.append(model) + + if "tensor_parallel_size" in benchmark_config: + tp = benchmark_config["tensor_parallel_size"] + elif "tp" in benchmark_config: + tp = benchmark_config["tp"] + else: + tp = 8 + assert tp in TP_TO_RUNNER_MAPPING + + for runner in TP_TO_RUNNER_MAPPING[tp]: + found_runner = any([r and r.lower() in runner for r in runners]) + + if not found_runner and not use_all_runners: + continue + benchmark_matrix["include"].append( { "runner": runner, @@ -181,11 +203,11 @@ def generate_benchmark_matrix( def main() -> None: args = parse_args() models = [m.strip().lower() for m in args.models.split(",") if m.strip()] - platforms = [m.strip().lower() for m in args.platforms.split(",") if m.strip()] + runners = [m.strip().lower() for m in args.runners.split(",") if m.strip()] benchmark_matrix = generate_benchmark_matrix( args.benchmark_configs_dir, models, - platforms, + runners, ) set_output("benchmark_matrix", benchmark_matrix) diff --git a/.github/scripts/setup_vllm_benchmark.py b/.github/scripts/setup_vllm_benchmark.py index 32959fe2..5060dec8 100755 --- a/.github/scripts/setup_vllm_benchmark.py +++ b/.github/scripts/setup_vllm_benchmark.py @@ -78,7 +78,7 @@ def setup_benchmark_configs( """ Setup the benchmark configs to run on this runner """ - for file in glob.glob(f"{from_benchmark_configs_dir}/*_{device}.json"): + for file in glob.glob(f"{from_benchmark_configs_dir}/*-{device}*.json"): filename = os.path.basename(file) benchmark_configs = [] diff --git a/.github/scripts/test_generate_vllm_benchmark_matrix.py b/.github/scripts/test_generate_vllm_benchmark_matrix.py new file mode 100644 index 00000000..5c9242f2 --- /dev/null +++ b/.github/scripts/test_generate_vllm_benchmark_matrix.py @@ -0,0 +1,432 @@ +import os +import json + +from expecttest import assert_expected_inline +from generate_vllm_benchmark_matrix import generate_benchmark_matrix + +BENCHMARK_CONFIG_DIRS = os.path.join( + os.path.dirname(__file__), "..", "..", "vllm-benchmarks", "benchmarks" +) + + +def test_generate_benchmark_matrix(): + # All combinations, no duplication + models = [] + runners = [] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.24xl.spr-metal", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/meta-llama-3.1-70b-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.4", + "models": "meta-llama/meta-llama-3.1-70b-instruct" + 
}, + { + "runner": "linux.aws.h100.4", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + }, + { + "runner": "linux.24xl.spr-metal", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/llama-4-scout-17b-16e-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.4", + "models": "meta-llama/llama-4-scout-17b-16e-instruct" + }, + { + "runner": "linux.aws.h100.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + }, + { + "runner": "linux.rocm.gpu.mi300.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + } + ] +}""", + ) + + # Select a model + models = ["meta-llama/meta-llama-3.1-8b-instruct"] + runners = [] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.24xl.spr-metal", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + } + ] +}""", + ) + + # Select multiple models + models = [ + "meta-llama/meta-llama-3.1-8b-instruct", + "meta-llama/llama-4-maverick-17b-128e-instruct-fp8", + ] + runners = [] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.24xl.spr-metal", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + }, + { + "runner": "linux.rocm.gpu.mi300.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + } + ] +}""", + ) + + # Select non-existing models + models = ["meta-llama/meta-llama-3.1-8b-instruct", "do-not-exist"] + runners = [] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.24xl.spr-metal", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + } + ] +}""", + ) + + # Select non-existing models + models = ["meta-llama/meta-llama-3.1-8b-instruct", ""] + runners = [] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.24xl.spr-metal", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + } + ] +}""", + ) + + # Select a runner + models = [] + runners = ["h100"] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.aws.h100", + "models": 
"meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/meta-llama-3.1-70b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/llama-4-scout-17b-16e-instruct" + }, + { + "runner": "linux.aws.h100.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + } + ] +}""", + ) + + # Select multiple runners + models = [] + runners = ["h100", "spr"] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.24xl.spr-metal", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/meta-llama-3.1-70b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + }, + { + "runner": "linux.24xl.spr-metal", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/llama-4-scout-17b-16e-instruct" + }, + { + "runner": "linux.aws.h100.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + } + ] +}""", + ) + + # Select non-existing runners + models = [] + runners = ["h100", "do-not-exist"] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/meta-llama-3.1-70b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/llama-4-scout-17b-16e-instruct" + }, + { + "runner": "linux.aws.h100.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + } + ] +}""", + ) + + # Select non-existing runners + models = [] + runners = ["h100", ""] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/meta-llama-3.1-70b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/llama-4-scout-17b-16e-instruct" + }, + { + "runner": "linux.aws.h100.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + } + ] +}""", + ) + + # Select a model and a runner + models = ["meta-llama/meta-llama-3.1-8b-instruct"] + runners = ["h100"] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + } + ] +}""", + ) + + # Select multiple models and runners + models = [ + "meta-llama/meta-llama-3.1-8b-instruct", + "mistralai/mixtral-8x7b-instruct-v0.1", + ] + runners = ["rocm", "spr"] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + 
"include": [ + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.24xl.spr-metal", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + }, + { + "runner": "linux.24xl.spr-metal", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + } + ] +}""", + ) + + # Select non-existing models and runners + models = ["meta-llama/meta-llama-3.1-8b-instruct", "do-not-exist"] + runners = ["rocm", "do-not-exist"] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + } + ] +}""", + ) + + # Select non-existing models and runners + models = ["meta-llama/meta-llama-3.1-8b-instruct", ""] + runners = ["rocm", ""] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + } + ] +}""", + ) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index 43190a16..750db2c4 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -20,12 +20,12 @@ on: A comma-separated list of models to benchmark, leave empty to run everything required: false type: string - platforms: + runners: description: | - A comma-separated list of platforms to benchmark, i.e. h100, mi300, emr + A comma-separated list of runners to run the benchmark, i.e. h100, mi300, spr, emr required: true type: string - default: h100,mi300 + default: h100,mi300,spr pull_request: paths: - .github/workflows/vllm-benchmark.yml @@ -53,7 +53,7 @@ jobs: shell: bash env: MODELS: ${{ inputs.models || '' }} - PLATFORMS: ${{ inputs.platforms || '' }} + RUNNERS: ${{ inputs.runners || '' }} run: | set -eux @@ -61,7 +61,7 @@ jobs: python .github/scripts/generate_vllm_benchmark_matrix.py \ --benchmark-configs-dir vllm-benchmarks/benchmarks \ --models "${MODELS}" \ - --platforms "${PLATFORMS}" + --runners "${RUNNERS}" benchmarks: name: Run vLLM benchmarks diff --git a/vllm-benchmarks/benchmarks/latency-tests_cuda_rocm.json b/vllm-benchmarks/benchmarks/latency-tests-cuda-rocm.json similarity index 100% rename from vllm-benchmarks/benchmarks/latency-tests_cuda_rocm.json rename to vllm-benchmarks/benchmarks/latency-tests-cuda-rocm.json diff --git a/vllm-benchmarks/benchmarks/serving-tests_cuda_rocm.json b/vllm-benchmarks/benchmarks/serving-tests-cuda-rocm.json similarity index 100% rename from vllm-benchmarks/benchmarks/serving-tests_cuda_rocm.json rename to vllm-benchmarks/benchmarks/serving-tests-cuda-rocm.json diff --git a/vllm-benchmarks/benchmarks/throughput-tests_cuda_rocm.json b/vllm-benchmarks/benchmarks/throughput-tests-cuda-rocm.json similarity index 100% rename from vllm-benchmarks/benchmarks/throughput-tests_cuda_rocm.json rename to vllm-benchmarks/benchmarks/throughput-tests-cuda-rocm.json From 019482f7c92403708bf11f518fbad0105a3f6ee9 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 10 Jul 2025 19:49:38 -0700 Subject: [PATCH 11/26] Update the workflow Signed-off-by: Huy Do --- .github/scripts/setup_vllm_benchmark.py | 5 ++++- .github/workflows/vllm-benchmark.yml | 25 ++++++++++++------------- 2 files changed, 16 insertions(+), 
14 deletions(-) diff --git a/.github/scripts/setup_vllm_benchmark.py b/.github/scripts/setup_vllm_benchmark.py index 5060dec8..bae32b81 100755 --- a/.github/scripts/setup_vllm_benchmark.py +++ b/.github/scripts/setup_vllm_benchmark.py @@ -73,7 +73,10 @@ def parse_args() -> Any: def setup_benchmark_configs( - from_benchmark_configs_dir: str, to_benchmark_configs_dir: str, models: List[str], device: str + from_benchmark_configs_dir: str, + to_benchmark_configs_dir: str, + models: List[str], + device: str, ) -> None: """ Setup the benchmark configs to run on this runner diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index 750db2c4..492598af 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -103,7 +103,6 @@ jobs: DEVICE_NAME=rocm rocm-smi else - echo "No accelerators. Use CPU instead" DEVICE_NAME=cpu lscpu fi @@ -140,14 +139,14 @@ jobs: shell: bash run: | DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo - DOCKER_IMAGE_POSTFIX="" + DOCKER_IMAGE_SUFFIX="" if [[ "${DEVICE_NAME}" == "rocm" ]]; then DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci elif [[ "${DEVICE_NAME}" == "cpu" ]]; then - DOCKER_IMAGE_POSTFIX=-cpu + DOCKER_IMAGE_SUFFIX=-cpu fi echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV - echo "DOCKER_IMAGE_POSTFIX=$DOCKER_IMAGE_POSTFIX" >> $GITHUB_ENV + echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV - name: Check for last benchmark commit working-directory: vllm-benchmarks @@ -166,7 +165,7 @@ jobs: # Check if the image is there, if it doesn't then check an older one # because the commit is too recent HEAD_SHA=$(git rev-parse --verify HEAD~${i}) - DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_POSTFIX}" + DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}" # No Docker image available yet because the commit is too recent if ! 
docker manifest inspect "${DOCKER_IMAGE}"; then @@ -203,7 +202,6 @@ jobs: - name: Setup benchmark tests env: MODELS: ${{ matrix.models }} - ARCH: ${{ inputs.arch || '' }} run: | set -eux @@ -217,7 +215,7 @@ jobs: --from-benchmark-configs-dir vllm-benchmarks/benchmarks \ --to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \ --models "${MODELS}" \ - --device "${DOCKER_IMAGE_POSTFIX}" + --device "${DEVICE_NAME}" pushd vllm-benchmarks/vllm ls -lah .buildkite/nightly-benchmarks/tests @@ -229,18 +227,19 @@ jobs: SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 SCCACHE_REGION: us-east-1 HF_TOKEN: ${{ secrets.HF_TOKEN }} - DOCKER_IMAGE: ${{ env.DOCKER_IMAGE_PREFIX }}:${{ env.HEAD_SHA }}${{ env.DOCKER_IMAGE_POSTFIX }} + DOCKER_IMAGE: ${{ env.DOCKER_IMAGE_PREFIX }}:${{ env.HEAD_SHA }}${{ env.DOCKER_IMAGE_SUFFIX }} # vLLM-related environment variables ENGINE_VERSION: v1 SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 - ARCH: ${{ env.DEVICE_NAME }} run: | set -x - if [[ "$ARCH" == "cpu" ]]; then - on_cpu=1 + + if [[ "${DEVICE_NAME}" == "cpu" ]]; then + ON_CPU=1 else - on_cpu=0 + ON_CPU=0 fi + docker run \ ${GPU_FLAG:-} \ ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \ @@ -251,7 +250,7 @@ jobs: -e HF_TOKEN \ -e ENGINE_VERSION \ -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ - -e ON_CPU="${on_cpu}" \ + -e ON_CPU \ --ipc=host \ --tty \ --security-opt seccomp=unconfined \ From 5805c342caeeb970c224f81c0abd9c84ceff3857 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 10 Jul 2025 20:33:23 -0700 Subject: [PATCH 12/26] Another try Signed-off-by: Huy Do --- .../scripts/generate_vllm_benchmark_matrix.py | 15 ++- .github/scripts/setup_vllm_benchmark.py | 2 +- .../test_generate_vllm_benchmark_matrix.py | 78 +++++------ .github/workflows/vllm-benchmark.yml | 2 + vllm-benchmarks/benchmarks/README.md | 4 - .../{ => cpu}/latency-tests-cpu.json | 0 .../{ => cpu}/serving-tests-cpu.json | 0 .../{ => cpu}/throughput-tests-cpu.json | 0 .../latency-tests.json} | 0 .../serving-tests.json} | 0 .../throughput-tests.json} | 0 .../benchmarks/rocm/latency-tests.json | 54 ++++++++ .../benchmarks/rocm/serving-tests.json | 121 ++++++++++++++++++ .../benchmarks/rocm/throughput-tests.json | 59 +++++++++ 14 files changed, 280 insertions(+), 55 deletions(-) rename vllm-benchmarks/benchmarks/{ => cpu}/latency-tests-cpu.json (100%) rename vllm-benchmarks/benchmarks/{ => cpu}/serving-tests-cpu.json (100%) rename vllm-benchmarks/benchmarks/{ => cpu}/throughput-tests-cpu.json (100%) rename vllm-benchmarks/benchmarks/{latency-tests-cuda-rocm.json => cuda/latency-tests.json} (100%) rename vllm-benchmarks/benchmarks/{serving-tests-cuda-rocm.json => cuda/serving-tests.json} (100%) rename vllm-benchmarks/benchmarks/{throughput-tests-cuda-rocm.json => cuda/throughput-tests.json} (100%) create mode 100644 vllm-benchmarks/benchmarks/rocm/latency-tests.json create mode 100644 vllm-benchmarks/benchmarks/rocm/serving-tests.json create mode 100644 vllm-benchmarks/benchmarks/rocm/throughput-tests.json diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py index 419922d6..024f7262 100755 --- a/.github/scripts/generate_vllm_benchmark_matrix.py +++ b/.github/scripts/generate_vllm_benchmark_matrix.py @@ -144,11 +144,10 @@ def generate_benchmark_matrix( if r.lower() in k: platforms.add(v) - selected_models = [] - # Gather all possible benchmarks - for platform in platforms: - for file in glob.glob(f"{benchmark_configs_dir}/*-{platform}*.json"): + for platform in sorted(platforms): + selected_models 
= [] + for file in glob.glob(f"{benchmark_configs_dir}/{platform}/*.json"): with open(file) as f: try: configs = json.load(f) @@ -183,8 +182,14 @@ def generate_benchmark_matrix( assert tp in TP_TO_RUNNER_MAPPING for runner in TP_TO_RUNNER_MAPPING[tp]: - found_runner = any([r and r.lower() in runner for r in runners]) + # Wrong platform + if ( + runner not in RUNNER_TO_PLATFORM_MAPPING + or RUNNER_TO_PLATFORM_MAPPING[runner] != platform + ): + continue + found_runner = any([r and r.lower() in runner for r in runners]) if not found_runner and not use_all_runners: continue diff --git a/.github/scripts/setup_vllm_benchmark.py b/.github/scripts/setup_vllm_benchmark.py index bae32b81..e1edc30a 100755 --- a/.github/scripts/setup_vllm_benchmark.py +++ b/.github/scripts/setup_vllm_benchmark.py @@ -81,7 +81,7 @@ def setup_benchmark_configs( """ Setup the benchmark configs to run on this runner """ - for file in glob.glob(f"{from_benchmark_configs_dir}/*-{device}*.json"): + for file in glob.glob(f"{from_benchmark_configs_dir}/{device}/*.json"): filename = os.path.basename(file) benchmark_configs = [] diff --git a/.github/scripts/test_generate_vllm_benchmark_matrix.py b/.github/scripts/test_generate_vllm_benchmark_matrix.py index 5c9242f2..73c7672c 100644 --- a/.github/scripts/test_generate_vllm_benchmark_matrix.py +++ b/.github/scripts/test_generate_vllm_benchmark_matrix.py @@ -22,15 +22,11 @@ def test_generate_benchmark_matrix(): { "include": [ { - "runner": "linux.aws.h100", - "models": "meta-llama/meta-llama-3.1-8b-instruct" - }, - { - "runner": "linux.rocm.gpu.mi300.2", + "runner": "linux.24xl.spr-metal", "models": "meta-llama/meta-llama-3.1-8b-instruct" }, { - "runner": "linux.24xl.spr-metal", + "runner": "linux.aws.h100", "models": "meta-llama/meta-llama-3.1-8b-instruct" }, { @@ -38,33 +34,33 @@ def test_generate_benchmark_matrix(): "models": "meta-llama/meta-llama-3.1-70b-instruct" }, { - "runner": "linux.rocm.gpu.mi300.4", - "models": "meta-llama/meta-llama-3.1-70b-instruct" + "runner": "linux.aws.h100.4", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" }, { "runner": "linux.aws.h100.4", - "models": "mistralai/mixtral-8x7b-instruct-v0.1" + "models": "meta-llama/llama-4-scout-17b-16e-instruct" + }, + { + "runner": "linux.aws.h100.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" }, { "runner": "linux.rocm.gpu.mi300.2", - "models": "mistralai/mixtral-8x7b-instruct-v0.1" + "models": "meta-llama/meta-llama-3.1-8b-instruct" }, { - "runner": "linux.24xl.spr-metal", - "models": "mistralai/mixtral-8x7b-instruct-v0.1" + "runner": "linux.rocm.gpu.mi300.4", + "models": "meta-llama/meta-llama-3.1-70b-instruct" }, { - "runner": "linux.aws.h100.4", - "models": "meta-llama/llama-4-scout-17b-16e-instruct" + "runner": "linux.rocm.gpu.mi300.2", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" }, { "runner": "linux.rocm.gpu.mi300.4", "models": "meta-llama/llama-4-scout-17b-16e-instruct" }, - { - "runner": "linux.aws.h100.8", - "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" - }, { "runner": "linux.rocm.gpu.mi300.8", "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" @@ -85,15 +81,15 @@ def test_generate_benchmark_matrix(): { "include": [ { - "runner": "linux.aws.h100", + "runner": "linux.24xl.spr-metal", "models": "meta-llama/meta-llama-3.1-8b-instruct" }, { - "runner": "linux.rocm.gpu.mi300.2", + "runner": "linux.aws.h100", "models": "meta-llama/meta-llama-3.1-8b-instruct" }, { - "runner": "linux.24xl.spr-metal", + "runner": "linux.rocm.gpu.mi300.2", "models": 
"meta-llama/meta-llama-3.1-8b-instruct" } ] @@ -115,21 +111,21 @@ def test_generate_benchmark_matrix(): { "include": [ { - "runner": "linux.aws.h100", - "models": "meta-llama/meta-llama-3.1-8b-instruct" - }, - { - "runner": "linux.rocm.gpu.mi300.2", + "runner": "linux.24xl.spr-metal", "models": "meta-llama/meta-llama-3.1-8b-instruct" }, { - "runner": "linux.24xl.spr-metal", + "runner": "linux.aws.h100", "models": "meta-llama/meta-llama-3.1-8b-instruct" }, { "runner": "linux.aws.h100.8", "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, { "runner": "linux.rocm.gpu.mi300.8", "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" @@ -150,15 +146,15 @@ def test_generate_benchmark_matrix(): { "include": [ { - "runner": "linux.aws.h100", + "runner": "linux.24xl.spr-metal", "models": "meta-llama/meta-llama-3.1-8b-instruct" }, { - "runner": "linux.rocm.gpu.mi300.2", + "runner": "linux.aws.h100", "models": "meta-llama/meta-llama-3.1-8b-instruct" }, { - "runner": "linux.24xl.spr-metal", + "runner": "linux.rocm.gpu.mi300.2", "models": "meta-llama/meta-llama-3.1-8b-instruct" } ] @@ -177,15 +173,15 @@ def test_generate_benchmark_matrix(): { "include": [ { - "runner": "linux.aws.h100", + "runner": "linux.24xl.spr-metal", "models": "meta-llama/meta-llama-3.1-8b-instruct" }, { - "runner": "linux.rocm.gpu.mi300.2", + "runner": "linux.aws.h100", "models": "meta-llama/meta-llama-3.1-8b-instruct" }, { - "runner": "linux.24xl.spr-metal", + "runner": "linux.rocm.gpu.mi300.2", "models": "meta-llama/meta-llama-3.1-8b-instruct" } ] @@ -239,11 +235,11 @@ def test_generate_benchmark_matrix(): { "include": [ { - "runner": "linux.aws.h100", + "runner": "linux.24xl.spr-metal", "models": "meta-llama/meta-llama-3.1-8b-instruct" }, { - "runner": "linux.24xl.spr-metal", + "runner": "linux.aws.h100", "models": "meta-llama/meta-llama-3.1-8b-instruct" }, { @@ -254,10 +250,6 @@ def test_generate_benchmark_matrix(): "runner": "linux.aws.h100.4", "models": "mistralai/mixtral-8x7b-instruct-v0.1" }, - { - "runner": "linux.24xl.spr-metal", - "models": "mistralai/mixtral-8x7b-instruct-v0.1" - }, { "runner": "linux.aws.h100.4", "models": "meta-llama/llama-4-scout-17b-16e-instruct" @@ -373,20 +365,16 @@ def test_generate_benchmark_matrix(): """\ { "include": [ - { - "runner": "linux.rocm.gpu.mi300.2", - "models": "meta-llama/meta-llama-3.1-8b-instruct" - }, { "runner": "linux.24xl.spr-metal", "models": "meta-llama/meta-llama-3.1-8b-instruct" }, { "runner": "linux.rocm.gpu.mi300.2", - "models": "mistralai/mixtral-8x7b-instruct-v0.1" + "models": "meta-llama/meta-llama-3.1-8b-instruct" }, { - "runner": "linux.24xl.spr-metal", + "runner": "linux.rocm.gpu.mi300.2", "models": "mistralai/mixtral-8x7b-instruct-v0.1" } ] diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index 492598af..95d161e4 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -87,6 +87,8 @@ jobs: fetch-depth: 0 - uses: actions/setup-python@v5 + # Amazon Linux fails on this step + continue-on-error: true with: python-version: '3.12' cache: 'pip' diff --git a/vllm-benchmarks/benchmarks/README.md b/vllm-benchmarks/benchmarks/README.md index e06d262d..6d73caae 100644 --- a/vllm-benchmarks/benchmarks/README.md +++ b/vllm-benchmarks/benchmarks/README.md @@ -1,7 +1,3 @@ This directory mirrors the list of benchmarks from 
[vLLM](https://github.com/vllm-project/vllm/tree/main/.buildkite/nightly-benchmarks/tests), but it includes only models that we want to cover in PyTorch infra. - -Another note is that speculative decoding is not yet supported in v1 -with the exception of ngram, so its corresponding benchmarks is -currently removed from the list. diff --git a/vllm-benchmarks/benchmarks/latency-tests-cpu.json b/vllm-benchmarks/benchmarks/cpu/latency-tests-cpu.json similarity index 100% rename from vllm-benchmarks/benchmarks/latency-tests-cpu.json rename to vllm-benchmarks/benchmarks/cpu/latency-tests-cpu.json diff --git a/vllm-benchmarks/benchmarks/serving-tests-cpu.json b/vllm-benchmarks/benchmarks/cpu/serving-tests-cpu.json similarity index 100% rename from vllm-benchmarks/benchmarks/serving-tests-cpu.json rename to vllm-benchmarks/benchmarks/cpu/serving-tests-cpu.json diff --git a/vllm-benchmarks/benchmarks/throughput-tests-cpu.json b/vllm-benchmarks/benchmarks/cpu/throughput-tests-cpu.json similarity index 100% rename from vllm-benchmarks/benchmarks/throughput-tests-cpu.json rename to vllm-benchmarks/benchmarks/cpu/throughput-tests-cpu.json diff --git a/vllm-benchmarks/benchmarks/latency-tests-cuda-rocm.json b/vllm-benchmarks/benchmarks/cuda/latency-tests.json similarity index 100% rename from vllm-benchmarks/benchmarks/latency-tests-cuda-rocm.json rename to vllm-benchmarks/benchmarks/cuda/latency-tests.json diff --git a/vllm-benchmarks/benchmarks/serving-tests-cuda-rocm.json b/vllm-benchmarks/benchmarks/cuda/serving-tests.json similarity index 100% rename from vllm-benchmarks/benchmarks/serving-tests-cuda-rocm.json rename to vllm-benchmarks/benchmarks/cuda/serving-tests.json diff --git a/vllm-benchmarks/benchmarks/throughput-tests-cuda-rocm.json b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json similarity index 100% rename from vllm-benchmarks/benchmarks/throughput-tests-cuda-rocm.json rename to vllm-benchmarks/benchmarks/cuda/throughput-tests.json diff --git a/vllm-benchmarks/benchmarks/rocm/latency-tests.json b/vllm-benchmarks/benchmarks/rocm/latency-tests.json new file mode 100644 index 00000000..9e9f15f8 --- /dev/null +++ b/vllm-benchmarks/benchmarks/rocm/latency-tests.json @@ -0,0 +1,54 @@ +[ + { + "test_name": "latency_llama8B_tp1", + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + }, + { + "test_name": "latency_llama70B_tp4", + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15 + } + }, + { + "test_name": "latency_mixtral8x7B_tp2", + "parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15 + } + }, + { + "test_name": "latency_llama4_scout_tp4", + "parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15, + "max_model_len": 8192 + } + }, + { + "test_name": "latency_llama4_maverick_fp8_tp8", + "parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15, + "max_model_len": 8192 + } + } +] diff --git a/vllm-benchmarks/benchmarks/rocm/serving-tests.json b/vllm-benchmarks/benchmarks/rocm/serving-tests.json new 
file mode 100644 index 00000000..9456bb88 --- /dev/null +++ b/vllm-benchmarks/benchmarks/rocm/serving-tests.json @@ -0,0 +1,121 @@ +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama70B_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_mixtral8x7B_tp2_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama70B_tp4_sharegpt_specdecode", + "qps_list": [2], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "disable_log_requests": "", + "tensor_parallel_size": 4, + "swap_space": 16, + "speculative_config": { + "model": "turboderp/Qwama-0.5B-Instruct", + "num_speculative_tokens": 4, + "draft_tensor_parallel_size": 1 + } + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama4_scout_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama4_maverick_fp8_tp8", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + } +] diff --git a/vllm-benchmarks/benchmarks/rocm/throughput-tests.json 
b/vllm-benchmarks/benchmarks/rocm/throughput-tests.json new file mode 100644 index 00000000..647ac2f3 --- /dev/null +++ b/vllm-benchmarks/benchmarks/rocm/throughput-tests.json @@ -0,0 +1,59 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_llama70B_tp4", + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_mixtral8x7B_tp2", + "parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_llama4_scout_tp4", + "parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm", + "max_model_len": 8192 + } + }, + { + "test_name": "throughput_llama4_maverick_fp8_tp8", + "parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm", + "max_model_len": 8192 + } + } +] From 8a4dd29204acf4faaf28cb8a4a8b1daa99467b08 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 10 Jul 2025 20:40:17 -0700 Subject: [PATCH 13/26] Use python3 Signed-off-by: Huy Do --- .github/workflows/vllm-benchmark.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index 95d161e4..d70ddf9a 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -213,7 +213,7 @@ jobs: popd # Set the list of benchmarks we want to cover in this runner - python .github/scripts/setup_vllm_benchmark.py \ + python3 .github/scripts/setup_vllm_benchmark.py \ --from-benchmark-configs-dir vllm-benchmarks/benchmarks \ --to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \ --models "${MODELS}" \ @@ -281,7 +281,7 @@ jobs: sudo chown -R ${UID} "${BENCHMARK_RESULTS}" ls -lah "${BENCHMARK_RESULTS}" - python .github/scripts/upload_benchmark_results.py \ + python3 .github/scripts/upload_benchmark_results.py \ --repo vllm-benchmarks/vllm \ --benchmark-name "vLLM benchmark" \ --benchmark-results "${BENCHMARK_RESULTS}" \ From da3c105ce26891af5c9e3750e955c42160830f45 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 10 Jul 2025 22:17:45 -0700 Subject: [PATCH 14/26] Testing 1 2 3 Signed-off-by: Huy Do --- .github/workflows/vllm-benchmark.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index d70ddf9a..c23999d6 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -53,7 +53,8 @@ jobs: shell: bash env: MODELS: ${{ inputs.models || '' }} - RUNNERS: ${{ inputs.runners || '' }} + # TESTING: TO BE REMOVED + RUNNERS: ${{ inputs.runners || 'spr' }} run: | set -eux @@ 
-259,7 +260,7 @@ jobs: -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ -w /tmp/workspace \ "${DOCKER_IMAGE}" \ - bash -xc "cd vllm-benchmarks/vllm && bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh" + bash -xc "cd vllm-benchmarks/vllm && /bin/bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh" - name: Authenticate with AWS # AWS CUDA runners already have access to the bucket via its runner IAM role From d07183fad2a5d51eb97953c7c1ca696bb2c464de Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 10 Jul 2025 22:46:34 -0700 Subject: [PATCH 15/26] Does this work? Signed-off-by: Huy Do --- .github/workflows/vllm-benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index c23999d6..450ef22c 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -260,7 +260,7 @@ jobs: -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ -w /tmp/workspace \ "${DOCKER_IMAGE}" \ - bash -xc "cd vllm-benchmarks/vllm && /bin/bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh" + sh -c "cd vllm-benchmarks/vllm && bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh" - name: Authenticate with AWS # AWS CUDA runners already have access to the bucket via its runner IAM role From 710a3b8d9eaac11aafaf90769fcdf340d9ae9553 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 10 Jul 2025 23:02:20 -0700 Subject: [PATCH 16/26] Debug Signed-off-by: Huy Do --- .github/workflows/vllm-benchmark.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index 450ef22c..ce8f88ad 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -260,7 +260,12 @@ jobs: -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ -w /tmp/workspace \ "${DOCKER_IMAGE}" \ - sh -c "cd vllm-benchmarks/vllm && bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh" + /bin/bash -c "cd vllm-benchmarks/vllm && bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh" + + - name: DEBUG + if: always() + run: | + slepp 3600 - name: Authenticate with AWS # AWS CUDA runners already have access to the bucket via its runner IAM role From 3ca24f0eb97faec7a71710268721c7c7e1a673ab Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 10 Jul 2025 23:13:57 -0700 Subject: [PATCH 17/26] Typo Signed-off-by: Huy Do --- .github/workflows/vllm-benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index ce8f88ad..c7072c2f 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -265,7 +265,7 @@ jobs: - name: DEBUG if: always() run: | - slepp 3600 + sleep 3600 - name: Authenticate with AWS # AWS CUDA runners already have access to the bucket via its runner IAM role From 1779078e123a75ed3ef5cd6690f50ce9ced41064 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Thu, 10 Jul 2025 23:48:47 -0700 Subject: [PATCH 18/26] Fix Docker usage Signed-off-by: Huy Do --- .github/workflows/vllm-benchmark.yml | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index c7072c2f..08472b09 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -243,7 +243,7 @@ jobs: ON_CPU=0 fi - 
docker run \ + container_name=$(docker run \ ${GPU_FLAG:-} \ ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \ -e SCCACHE_BUCKET \ @@ -256,16 +256,13 @@ jobs: -e ON_CPU \ --ipc=host \ --tty \ + --detach \ --security-opt seccomp=unconfined \ -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ -w /tmp/workspace \ - "${DOCKER_IMAGE}" \ - /bin/bash -c "cd vllm-benchmarks/vllm && bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh" - - - name: DEBUG - if: always() - run: | - sleep 3600 + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" bash -c "cd vllm-benchmarks/vllm && bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh" - name: Authenticate with AWS # AWS CUDA runners already have access to the bucket via its runner IAM role From f9f7bb6cc063922545dcb1afd6f7f26603294d32 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 11 Jul 2025 00:03:47 -0700 Subject: [PATCH 19/26] Missing ON_CPU? Signed-off-by: Huy Do --- .github/workflows/vllm-benchmark.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index 08472b09..686b4474 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -235,7 +235,7 @@ jobs: ENGINE_VERSION: v1 SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 run: | - set -x + set -eux if [[ "${DEVICE_NAME}" == "cpu" ]]; then ON_CPU=1 @@ -253,7 +253,7 @@ jobs: -e HF_TOKEN \ -e ENGINE_VERSION \ -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ - -e ON_CPU \ + -e ON_CPU="${ON_CPU}" \ --ipc=host \ --tty \ --detach \ From a2ce16f6b9f96b6297f0c5abe65a2a2f9dcfb895 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 11 Jul 2025 00:10:51 -0700 Subject: [PATCH 20/26] Testing 1 2 3 Signed-off-by: Huy Do --- .github/workflows/vllm-benchmark.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index 686b4474..e31aee40 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -53,8 +53,7 @@ jobs: shell: bash env: MODELS: ${{ inputs.models || '' }} - # TESTING: TO BE REMOVED - RUNNERS: ${{ inputs.runners || 'spr' }} + RUNNERS: ${{ inputs.runners || '' }} run: | set -eux From aa1f75836111158f2b5d4270ce189535d8460597 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 11 Jul 2025 02:13:08 -0700 Subject: [PATCH 21/26] Fix the upload script Signed-off-by: Huy Do --- .github/scripts/upload_benchmark_results.py | 59 ++++++++++++++------- .github/workflows/vllm-benchmark.yml | 3 +- 2 files changed, 41 insertions(+), 21 deletions(-) diff --git a/.github/scripts/upload_benchmark_results.py b/.github/scripts/upload_benchmark_results.py index 751461bc..39c5b11a 100755 --- a/.github/scripts/upload_benchmark_results.py +++ b/.github/scripts/upload_benchmark_results.py @@ -88,10 +88,16 @@ def parse_args() -> Any: # Device info parser.add_argument( - "--device", + "--device-name", type=str, required=True, - help="the name of the GPU device coming from nvidia-smi or amd-smi", + help="the name of the benchmark device", + ) + parser.add_argument( + "--device-type", + type=str, + required=True, + help="the type of the benchmark device coming from nvidia-smi, amd-smi, or lscpu", ) # Optional suffix @@ -112,7 +118,9 @@ def get_git_metadata(repo_dir: str) -> Tuple[str, str]: repo = Repo(repo_dir) # Git metadata, an example remote URL is https://github.com/vllm-project/vllm.git # and we want the vllm-project/vllm part - repo_name = 
repo.remotes.origin.url.split(".git")[0].replace("https://github.com/", "") + repo_name = repo.remotes.origin.url.split(".git")[0].replace( + "https://github.com/", "" + ) hexsha = repo.head.object.hexsha committed_date = repo.head.object.committed_date @@ -144,25 +152,34 @@ def get_benchmark_metadata( } -def get_runner_info() -> Dict[str, Any]: - if torch.cuda.is_available() and torch.version.hip: - name = "rocm" - elif torch.cuda.is_available() and torch.version.cuda: - name = "cuda" +def get_runner_info(device_name: str, device_type: str) -> Dict[str, Any]: + if torch.cuda.is_available(): + if torch.version.hip: + name = "rocm" + elif torch.version.cuda: + name = "cuda" + type = torch.cuda.get_device_name() + gpu_info = torch.cuda.get_device_name() + gpu_count = torch.cuda.device_count() + avail_gpu_mem_in_gb = int( + torch.cuda.get_device_properties(0).total_memory / (1024 * 1024 * 1024) + ) else: - name = "unknown" + name = device_name + type = device_type + gpu_info = "" + gpu_count = 0 + avail_gpu_mem_in_gb = 0 return { "name": name, - "type": torch.cuda.get_device_name(), + "type": type, "cpu_info": platform.processor(), "cpu_count": psutil.cpu_count(), "avail_mem_in_gb": int(psutil.virtual_memory().total / (1024 * 1024 * 1024)), - "gpu_info": torch.cuda.get_device_name(), - "gpu_count": torch.cuda.device_count(), - "avail_gpu_mem_in_gb": int( - torch.cuda.get_device_properties(0).total_memory / (1024 * 1024 * 1024) - ), + "gpu_info": gpu_info, + "gpu_count": gpu_count, + "avail_gpu_mem_in_gb": avail_gpu_mem_in_gb, "extra_info": { "hostname": socket.gethostname(), }, @@ -270,12 +287,12 @@ def upload( head_branch: str, head_sha: str, aggregated_results: List[Dict[str, Any]], - device: str, + device_type: str, model: str, dry_run: bool = True, ) -> None: model_suffix = f"_{model}" if model else "" - s3_path = f"v3/{repo_name}/{head_branch}/{head_sha}/{device}/benchmark_results{model_suffix}.json" + s3_path = f"v3/{repo_name}/{head_branch}/{head_sha}/{device_type}/benchmark_results{model_suffix}.json" info(f"Upload benchmark results to {s3_path}") if not dry_run: @@ -301,7 +318,9 @@ def main() -> None: repo_name, head_branch, head_sha, timestamp = get_git_metadata(args.repo) else: if not args.head_branch or not args.head_sha: - warning(f"Need to set --head-branch and --head-sha when manually setting --repo-name") + warning( + "Need to set --head-branch and --head-sha when manually setting --repo-name" + ) sys.exit(1) repo_name, head_branch, head_sha, timestamp = ( @@ -315,7 +334,7 @@ def main() -> None: metadata = get_benchmark_metadata( repo_name, head_branch, head_sha, timestamp, args.benchmark_name ) - runner = get_runner_info() + runner = get_runner_info(args.device_name, args.device_type) # Extract and aggregate the benchmark results aggregated_results = aggregate(metadata, runner, load(args.benchmark_results)) @@ -328,7 +347,7 @@ def main() -> None: head_branch, head_sha, aggregated_results, - args.device, + args.device_type, args.model, args.dry_run, ) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index e31aee40..bdf99223 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -287,5 +287,6 @@ jobs: --repo vllm-benchmarks/vllm \ --benchmark-name "vLLM benchmark" \ --benchmark-results "${BENCHMARK_RESULTS}" \ - --device "${DEVICE_TYPE// /_}" \ + --device-name "${DEVICE_NAME}" \ + --device-type "${DEVICE_TYPE// /_}" \ --model "${MODELS//\//_}" From 
139a95d8772038294cebb93d04a82e8cb6aecba1 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 11 Jul 2025 10:23:27 -0700 Subject: [PATCH 22/26] Update .github/workflows/vllm-benchmark.yml Co-authored-by: Louie Tsai --- .github/workflows/vllm-benchmark.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index bdf99223..fc0b7e74 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -259,6 +259,7 @@ jobs: --security-opt seccomp=unconfined \ -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ -w /tmp/workspace \ + --shm-size=4g \ "${DOCKER_IMAGE}" ) docker exec -t "${container_name}" bash -c "cd vllm-benchmarks/vllm && bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh" From a8263e481305ee494d0ecf1bc93a4fcdb866019a Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 11 Jul 2025 14:52:05 -0700 Subject: [PATCH 23/26] Typo Signed-off-by: Huy Do --- .github/workflows/vllm-benchmark.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index fc0b7e74..f17a6c15 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -257,9 +257,9 @@ jobs: --tty \ --detach \ --security-opt seccomp=unconfined \ + --shm-size=4g \ -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ -w /tmp/workspace \ - --shm-size=4g \ "${DOCKER_IMAGE}" ) docker exec -t "${container_name}" bash -c "cd vllm-benchmarks/vllm && bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh" From b30152b3762a44d0a1107d1929cd00574b543e31 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 11 Jul 2025 16:22:32 -0700 Subject: [PATCH 24/26] Sanitize the device type Signed-off-by: Huy Do --- .github/workflows/vllm-benchmark.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index f17a6c15..bd01ceea 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -284,10 +284,11 @@ jobs: sudo chown -R ${UID} "${BENCHMARK_RESULTS}" ls -lah "${BENCHMARK_RESULTS}" + SANITIZED_DEVICE_TYPE=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alpha:].-]//g") python3 .github/scripts/upload_benchmark_results.py \ --repo vllm-benchmarks/vllm \ --benchmark-name "vLLM benchmark" \ --benchmark-results "${BENCHMARK_RESULTS}" \ --device-name "${DEVICE_NAME}" \ - --device-type "${DEVICE_TYPE// /_}" \ + --device-type SANITIZED_DEVICE_TYPE \ --model "${MODELS//\//_}" From c52ac2348daa63ef621d3b371e83c2eb41d9fcbc Mon Sep 17 00:00:00 2001 From: Huy Do Date: Fri, 11 Jul 2025 17:07:55 -0700 Subject: [PATCH 25/26] Wrong variable Signed-off-by: Huy Do --- .github/workflows/vllm-benchmark.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index bd01ceea..743f575c 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -284,11 +284,11 @@ jobs: sudo chown -R ${UID} "${BENCHMARK_RESULTS}" ls -lah "${BENCHMARK_RESULTS}" - SANITIZED_DEVICE_TYPE=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alpha:].-]//g") + SANITIZED_DEVICE_TYPE=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alpha:].-]/_/g") python3 .github/scripts/upload_benchmark_results.py \ --repo vllm-benchmarks/vllm \ --benchmark-name "vLLM benchmark" \ --benchmark-results "${BENCHMARK_RESULTS}" \ --device-name "${DEVICE_NAME}" 
\ - --device-type SANITIZED_DEVICE_TYPE \ + --device-type "${SANITIZED_DEVICE_TYPE}" \ --model "${MODELS//\//_}" From 90bce79eb96a21737ad386dab5d5f735699b9db8 Mon Sep 17 00:00:00 2001 From: Huy Do Date: Sun, 13 Jul 2025 01:29:19 -0700 Subject: [PATCH 26/26] c7i.metal-24xl has only 1 NUMA node Co-authored-by: Louie Tsai --- .github/scripts/generate_vllm_benchmark_matrix.py | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py index 024f7262..5ad78e5c 100755 --- a/.github/scripts/generate_vllm_benchmark_matrix.py +++ b/.github/scripts/generate_vllm_benchmark_matrix.py @@ -23,7 +23,6 @@ 2: [ "linux.aws.h100.4", "linux.rocm.gpu.mi300.2", - "linux.24xl.spr-metal", ], 4: [ "linux.aws.h100.4",
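
Below is a minimal, self-contained sketch of the platform-aware runner selection that the patches above converge on. It is illustrative only: the mapping values and the select_runners helper are assumptions made for the example, and the real logic lives in .github/scripts/generate_vllm_benchmark_matrix.py.

# Runners grouped by the tensor-parallel size they can serve (assumed values).
TP_TO_RUNNER_MAPPING = {
    1: ["linux.aws.h100", "linux.rocm.gpu.mi300.2", "linux.24xl.spr-metal"],
    2: ["linux.aws.h100.4", "linux.rocm.gpu.mi300.2"],
}

# Each runner belongs to exactly one platform (assumed values).
RUNNER_TO_PLATFORM_MAPPING = {
    "linux.aws.h100": "cuda",
    "linux.aws.h100.4": "cuda",
    "linux.rocm.gpu.mi300.2": "rocm",
    "linux.24xl.spr-metal": "cpu",
}


def select_runners(tp: int, platform: str, requested: list[str]) -> list[str]:
    """Pick runners for a tensor-parallel size, keeping only the given platform.

    `requested` mirrors the --runners workflow input: an empty list means
    "use all runners" for that platform.
    """
    use_all_runners = not requested
    selected = []
    for runner in TP_TO_RUNNER_MAPPING.get(tp, []):
        # Skip runners that belong to a different platform (cpu/cuda/rocm).
        if RUNNER_TO_PLATFORM_MAPPING.get(runner) != platform:
            continue
        # Keep the runner only if it matches one of the requested substrings,
        # e.g. "spr" or "h100", unless no filter was given at all.
        found_runner = any(r and r.lower() in runner for r in requested)
        if not found_runner and not use_all_runners:
            continue
        selected.append(runner)
    return selected


if __name__ == "__main__":
    # With no explicit runner filter, every cuda runner that supports tp=1 is kept.
    print(select_runners(1, "cuda", []))      # ['linux.aws.h100']
    # A substring filter such as "spr" narrows the cpu selection.
    print(select_runners(1, "cpu", ["spr"]))  # ['linux.24xl.spr-metal']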