25 changes: 15 additions & 10 deletions .github/scripts/generate_vllm_benchmark_matrix.py
@@ -26,6 +26,7 @@
     4: [
         "linux.aws.h100.4",
         "linux.rocm.gpu.mi300.4",
+        "intel-cpu-emr",
     ],
     8: [
         "linux.aws.h100.8",
@@ -76,10 +77,11 @@ def parse_args() -> Any:
         help="the comma-separated list of models to benchmark",
     )
     parser.add_argument(
-        "--gpus",
+        "--platforms",
         type=str,
         default="",
-        help="the comma-separated list of GPUs to benchmark",
+        help="the comma-separated list of platforms to benchmark",
+        required=True,
     )
 
     return parser.parse_args()
@@ -107,18 +109,21 @@ def set_output(name: str, val: Any) -> None:
 
 
 def generate_benchmark_matrix(
-    benchmark_configs_dir: str, models: List[str], gpus: List[str]
+    benchmark_configs_dir: str, models: List[str], platforms: List[str]
 ) -> Dict[str, Any]:
     """
     Parse all the JSON files in vLLM benchmark configs directory to get the
-    model name and tensor parallel size (aka number of GPUs)
+    model name and tensor parallel size (aka number of GPUs or CPU NUMA nodes)
     """
-    use_all_gpus = True if not gpus else False
 
+    use_all_platforms = True if not platforms else False
+
     benchmark_matrix: Dict[str, Any] = {
         "include": [],
     }
 
+    selected_models = []
+
     for file in glob.glob(f"{benchmark_configs_dir}/*.json"):
         with open(file) as f:
             try:
@@ -155,12 +160,12 @@ def generate_benchmark_matrix(
 
         for runner in RUNNERS_MAPPING[tp]:
             found_runner = False
-            for gpu in gpus:
-                if gpu.lower() in runner:
+            for platform in platforms:
+                if platform.lower() in runner:
                     found_runner = True
                     break
 
-            if found_runner or use_all_gpus:
+            if found_runner or use_all_platforms:
                 benchmark_matrix["include"].append(
                     {
                         "runner": runner,
@@ -176,11 +181,11 @@
 def main() -> None:
     args = parse_args()
     models = [m.strip().lower() for m in args.models.split(",") if m.strip()]
-    gpus = [m.strip().lower() for m in args.gpus.split(",") if m.strip()]
+    platforms = [m.strip().lower() for m in args.platforms.split(",") if m.strip()]
     benchmark_matrix = generate_benchmark_matrix(
         args.benchmark_configs_dir,
         models,
-        gpus,
+        platforms,
     )
     set_output("benchmark_matrix", benchmark_matrix)

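The platform filter above is a plain substring match against runner labels. Here is a minimal, self-contained sketch of that selection logic; the trimmed RUNNERS_MAPPING is illustrative, while the real mapping lives at the top of generate_vllm_benchmark_matrix.py.

# Illustrative sketch (trimmed mapping; the real one lives in the script).
RUNNERS_MAPPING = {
    1: ["linux.aws.h100", "linux.rocm.gpu.mi300.2"],
    4: ["linux.aws.h100.4", "linux.rocm.gpu.mi300.4", "intel-cpu-emr"],
}

def select_runners(tp, platforms):
    # An empty platform list selects every runner for this tensor parallel
    # size, mirroring use_all_platforms above.
    use_all_platforms = not platforms
    return [
        runner
        for runner in RUNNERS_MAPPING[tp]
        if use_all_platforms
        or any(platform.lower() in runner for platform in platforms)
    ]

# "h100" substring-matches "linux.aws.h100.4"; "emr" matches "intel-cpu-emr".
assert select_runners(4, ["h100", "emr"]) == ["linux.aws.h100.4", "intel-cpu-emr"]
assert select_runners(4, []) == RUNNERS_MAPPING[4]

Because the match is a substring test, a platform such as mi300 selects every ROCm runner whose label contains it, and an empty list keeps the old run-everything behavior.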
12 changes: 10 additions & 2 deletions .github/scripts/setup_vllm_benchmark.py
@@ -61,17 +61,24 @@ def parse_args() -> Any:
         help="the list of models to benchmark",
         required=True,
     )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="",
+        help="device for the runner",
+        required=True,
+    )
 
     return parser.parse_args()
 
 
 def setup_benchmark_configs(
-    from_benchmark_configs_dir: str, to_benchmark_configs_dir: str, models: List[str]
+    from_benchmark_configs_dir: str, to_benchmark_configs_dir: str, models: List[str], device: str
 ) -> None:
     """
     Setup the benchmark configs to run on this runner
     """
-    for file in glob.glob(f"{from_benchmark_configs_dir}/*.json"):
+    for file in glob.glob(f"{from_benchmark_configs_dir}/*{device}.json"):
         filename = os.path.basename(file)
         benchmark_configs = []

@@ -108,6 +115,7 @@ def main() -> None:
         args.from_benchmark_configs_dir,
         args.to_benchmark_configs_dir,
         args.models.split(","),
+        args.device,
     )


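The new --device flag narrows the glob from *.json to *{device}.json. A quick sketch of that matching behavior, using hypothetical file names that mirror vllm-benchmarks/benchmarks:

from fnmatch import fnmatch

# Hypothetical names mirroring vllm-benchmarks/benchmarks.
files = ["latency-tests.json", "throughput-tests.json", "latency-tests-cpu.json"]

def matched(device):
    # Same pattern the script builds: f"*{device}.json"
    return [f for f in files if fnmatch(f, f"*{device}.json")]

print(matched("cpu"))   # ['latency-tests-cpu.json']
print(matched(""))      # all three files: an empty device degrades to "*.json"
print(matched("cuda"))  # [] -- no file carries a "cuda" suffix

The last case is exactly the bug flagged in the review thread further down: cuda and rocm produce a suffix that no config file carries.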
46 changes: 30 additions & 16 deletions .github/workflows/vllm-benchmark.yml
@@ -20,9 +20,9 @@ on:
           A comma-separated list of models to benchmark, leave empty to run everything
         required: false
         type: string
-      gpus:
+      platforms:
         description: |
-          A comma-separated list of GPUs to benchmark, i.e. h100, mi300
+          A comma-separated list of platforms to benchmark, i.e. h100, mi300, emr
         required: true
         type: string
         default: h100,mi300
@@ -53,15 +53,15 @@ jobs:
         shell: bash
         env:
           MODELS: ${{ inputs.models || '' }}
-          GPUS: ${{ inputs.gpus || '' }}
+          PLATFORMS: ${{ inputs.platforms || '' }}
         run: |
           set -eux
 
           # The generated matrix is grouped by model and runner
           python .github/scripts/generate_vllm_benchmark_matrix.py \
             --benchmark-configs-dir vllm-benchmarks/benchmarks \
             --models "${MODELS}" \
-            --gpus "${GPUS}"
+            --platforms "${PLATFORMS}"
 
   benchmarks:
     name: Run vLLM benchmarks
@@ -103,8 +103,9 @@
             DEVICE_NAME=rocm
             rocm-smi
           else
-            echo "Only CUDA and ROCm benchmarks are supported at the moment"
-            exit 1
+            echo "No accelerators. Use CPU instead"
+            DEVICE_NAME=cpu
+            lscpu
           fi
           echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV

@@ -118,6 +119,8 @@
           DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}')
         elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
           DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs)
+        elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
+          DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
         fi
         echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV

@@ -126,22 +129,25 @@
         run: |
           set -eux
 
-          if [[ "${DEVICE_NAME}" == "cuda" ]]; then
-            pip install -r .github/scripts/requirements.txt
-          elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
+          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
             pip install -r .github/scripts/requirements.txt \
               --extra-index-url https://download.pytorch.org/whl/rocm6.3
+          else
+            pip install -r .github/scripts/requirements.txt
           fi
 
       - name: Set Docker registry
         shell: bash
         run: |
-          if [[ "${DEVICE_NAME}" == "cuda" ]]; then
-            DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo
-          elif [[ "${DEVICE_NAME}" == "rocm" ]]; then
+          DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo
+          DOCKER_IMAGE_POSTFIX=""
+          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
             DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci
+          elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
+            DOCKER_IMAGE_POSTFIX=-cpu
           fi
           echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV
+          echo "DOCKER_IMAGE_POSTFIX=$DOCKER_IMAGE_POSTFIX" >> $GITHUB_ENV

       - name: Check for last benchmark commit
         working-directory: vllm-benchmarks
@@ -160,7 +166,7 @@
           # Check if the image is there, if it doesn't then check an older one
           # because the commit is too recent
           HEAD_SHA=$(git rev-parse --verify HEAD~${i})
-          DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}"
+          DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_POSTFIX}"
 
           # No Docker image available yet because the commit is too recent
           if ! docker manifest inspect "${DOCKER_IMAGE}"; then
@@ -197,6 +203,7 @@
       - name: Setup benchmark tests
         env:
           MODELS: ${{ matrix.models }}
+          ARCH: ${{ inputs.arch || '' }}
         run: |
           set -eux

@@ -209,7 +216,8 @@
           python .github/scripts/setup_vllm_benchmark.py \
             --from-benchmark-configs-dir vllm-benchmarks/benchmarks \
             --to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \
-            --models "${MODELS}"
+            --models "${MODELS}" \
+            --device "${DEVICE_NAME// /_}"
Review comment on the --device flag:

huydhn (Contributor) commented on Jul 9, 2025:

There is a bug here where DEVICE_NAME is set to cuda or rocm for the non-CPU cases. In those cases, the logic in .github/scripts/setup_vllm_benchmark.py will fail to find the JSON benchmark suites, because those files don't have a _cuda or _rocm suffix; only the CPU suite has a _cpu suffix. DEVICE_NAME should just be empty in these cases.

You can see that https://github.com/pytorch/pytorch-integration-testing/actions/runs/16163751659/job/45620654542#step:13:71 found no JSON file.

The author (Collaborator) replied:

@huydhn you are right. Made a quick change; hopefully it fixes the issue.
           pushd vllm-benchmarks/vllm
           ls -lah .buildkite/nightly-benchmarks/tests
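Per the review thread above, the fix amounts to mapping accelerator devices to an empty suffix so the glob in setup_vllm_benchmark.py degrades back to *.json. A hedged sketch of that mapping; the helper name is illustrative, not necessarily the committed change:

def device_suffix(device_name):
    # Only the CPU suite carries a filename suffix (latency-tests-cpu.json);
    # CUDA and ROCm suites are plain *.json, so those devices map to "".
    return device_name if device_name == "cpu" else ""

assert device_suffix("cuda") == ""     # glob stays *.json
assert device_suffix("rocm") == ""     # glob stays *.json
assert device_suffix("cpu") == "cpu"   # glob becomes *cpu.json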
@@ -221,13 +229,18 @@
           SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
           SCCACHE_REGION: us-east-1
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
-          DOCKER_IMAGE: ${{ env.DOCKER_IMAGE_PREFIX }}:${{ env.HEAD_SHA }}
+          DOCKER_IMAGE: ${{ env.DOCKER_IMAGE_PREFIX }}:${{ env.HEAD_SHA }}${{ env.DOCKER_IMAGE_POSTFIX }}
           # vLLM-related environment variables
           ENGINE_VERSION: v1
           SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
+          ARCH: ${{ env.DEVICE_NAME }}
         run: |
           set -x
 
+          if [[ "$ARCH" == "cpu" ]]; then
+            on_cpu=1
+          else
+            on_cpu=0
+          fi
           docker run \
             ${GPU_FLAG:-} \
             ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
@@ -238,6 +251,7 @@
             -e HF_TOKEN \
             -e ENGINE_VERSION \
             -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
+            -e ON_CPU="${on_cpu}" \
             --ipc=host \
             --tty \
             --security-opt seccomp=unconfined \
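Across this workflow, the final image reference is assembled from a registry prefix, the commit SHA, and an optional -cpu postfix. A small sketch of the composition; the tag values are illustrative:

def image_ref(prefix, head_sha, postfix=""):
    # Mirrors DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_POSTFIX}"
    return f"{prefix}:{head_sha}{postfix}"

# CUDA keeps the default ECR prefix and an empty postfix...
print(image_ref("public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo", "abc123"))
# ...ROCm swaps the prefix, and CPU appends -cpu to the default one.
print(image_ref("docker.io/rocm/vllm-ci", "abc123"))
print(image_ref("public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo", "abc123", "-cpu"))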
30 changes: 30 additions & 0 deletions vllm-benchmarks/benchmarks/latency-tests-cpu.json
@@ -0,0 +1,30 @@
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "environment_variables": {
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    },
+    {
+        "test_name": "latency_llama8B_tp4",
+        "environment_variables": {
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    }
+]
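Each entry above follows the shape that generate_vllm_benchmark_matrix.py parses: it reads the model name and tensor_parallel_size to pick a runner bucket. A minimal sketch of that parse, assuming the JSON layout shown here:

import json

# Load a benchmark suite the way the matrix generator consumes it: the
# tensor parallel size keys into RUNNERS_MAPPING (1 maps to single-device
# runners, 4 to the bucket that now includes intel-cpu-emr).
with open("vllm-benchmarks/benchmarks/latency-tests-cpu.json") as f:
    for config in json.load(f):
        params = config["parameters"]
        print(f'{config["test_name"]}: {params["model"]} '
              f'needs a {params["tensor_parallel_size"]}-way runner')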