
Commit 497b78b

Add Linux Aarch64 G3 runners to vLLM bms

1 parent 2b89299

File tree

7 files changed: +242, -12 lines


.github/scripts/generate_vllm_benchmark_matrix.py

Lines changed: 4 additions & 2 deletions
@@ -19,6 +19,7 @@
     "linux.rocm.gpu.gfx942.1",
     "linux.24xl.spr-metal",
     "linux.24xl.gnr",
+    "linux.arm64.m7g.4xlarge",
     "linux.dgx.b200",
     "linux.hpu.gaudi3.8",
 ],
@@ -57,6 +58,7 @@
     "linux.rocm.gpu.gfx942.8": "rocm",
     "linux.24xl.spr-metal": "cpu",
     "linux.24xl.gnr": "cpu",
+    "linux.arm64.m7g.4xlarge": "cpu",
     "linux.hpu.gaudi3.8": "hpu",
 }
 
@@ -227,8 +229,8 @@ def generate_benchmark_matrix(
 ) -> Dict[str, Any]:
     """
     Parse all the JSON files in vLLM benchmark configs directory to get the
-    model name and tensor parallel size (aka number of GPUs or CPU NUMA nodes)
-    """
+    model name and tensor parallel size (aka number of GPUs, CPU NUMA nodes - Intel
+    or CPUs - ARM)"""
     benchmark_matrix: Dict[str, Any] = {
         "include": [],
     }
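How a short keyword such as "m7g" maps onto the new runner label is not part of this hunk, so the following Python sketch is an assumption (the RUNNERS subset and select_runners helper are illustrative, not the script's real API); it reproduces the behavior the updated tests below expect, where passing "m7g" selects linux.arm64.m7g.4xlarge:

# Hypothetical sketch of keyword-based runner selection; the real logic
# lives in generate_vllm_benchmark_matrix.py and is not shown in this diff.
RUNNERS = [
    "linux.rocm.gpu.gfx942.1",
    "linux.24xl.spr-metal",
    "linux.24xl.gnr",
    "linux.arm64.m7g.4xlarge",
    "linux.dgx.b200",
    "linux.hpu.gaudi3.8",
]

def select_runners(keywords):
    """Keep every runner label containing one of the given keywords."""
    if not keywords:
        return list(RUNNERS)  # empty filter: run everything
    return [r for r in RUNNERS if any(k in r for k in keywords)]

print(select_runners(["spr", "m7g"]))
# -> ['linux.24xl.spr-metal', 'linux.arm64.m7g.4xlarge']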

.github/scripts/test_generate_vllm_benchmark_matrix.py

Lines changed: 30 additions & 2 deletions
@@ -21,6 +21,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -80,6 +84,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -110,6 +118,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -145,6 +157,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -172,6 +188,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -225,7 +245,7 @@ def test_generate_benchmark_matrix():
 
     # Select multiple runners
     models = []
-    runners = ["h100", "spr"]
+    runners = ["h100", "spr", "m7g"]
     output = json.dumps(
         generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2
     )
@@ -234,6 +254,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"
@@ -356,7 +380,7 @@ def test_generate_benchmark_matrix():
         "meta-llama/meta-llama-3.1-8b-instruct",
         "mistralai/mixtral-8x7b-instruct-v0.1",
     ]
-    runners = ["rocm", "spr"]
+    runners = ["rocm", "spr", "m7g"]
     output = json.dumps(
         generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2
     )
@@ -365,6 +389,10 @@ def test_generate_benchmark_matrix():
         """\
 {
   "include": [
+    {
+      "runner": "linux.arm64.m7g.4xlarge",
+      "models": "meta-llama/meta-llama-3.1-8b-instruct"
+    },
     {
       "runner": "linux.24xl.spr-metal",
       "models": "meta-llama/meta-llama-3.1-8b-instruct"

.github/workflows/vllm-benchmark.yml

Lines changed: 22 additions & 8 deletions
@@ -25,7 +25,7 @@ on:
           A comma-separated list of runners from .github/scripts/generate_vllm_benchmark_matrix.py to run the benchmark (optional, default to run everything)
         required: true
         type: string
-        default: h100,rocm,spr,gnr,b200,gaudi3
+        default: h100,rocm,spr,gnr,m7g,b200,gaudi3
   pull_request:
     paths:
       - .github/workflows/vllm-benchmark.yml
@@ -111,8 +111,17 @@ jobs:
           elif command -v hl-smi; then
             DEVICE_NAME=hpu
             hl-smi
-          else
-            DEVICE_NAME=cpu
+          else
+            arch=$(uname -m)
+
+            case "$arch" in
+              aarch64|arm64)
+                DEVICE_NAME=arm64-cpu
+                ;;
+              *)
+                DEVICE_NAME=cpu
+                ;;
+            esac
             lscpu
           fi
           echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV
@@ -131,6 +140,8 @@
             DEVICE_TYPE="Intel Gaudi3 "$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
           elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
             DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
+          elif [[ "${DEVICE_NAME}" == "arm64-cpu" ]]; then
+            DEVICE_TYPE=$(lscpu | grep 'Vendor ID' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ")
           fi
           echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV
 
@@ -171,6 +182,8 @@
             DOCKER_IMAGE_SUFFIX=-hpu
           elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
             DOCKER_IMAGE_SUFFIX=-cpu
+          elif [[ "${DEVICE_NAME}" == "arm64-cpu" ]]; then
+            DOCKER_IMAGE_SUFFIX=-arm64-cpu
           fi
           echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV
           echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV
@@ -277,11 +290,12 @@
         run: |
           set -eux
 
-          if [[ "${DEVICE_NAME}" == "cpu" ]]; then
-            ON_CPU=1
-          else
-            ON_CPU=0
-          fi
+          ON_CPU=0
+
+          case "$DEVICE_NAME" in
+            cpu) ON_CPU=1 ;;
+            arm64-cpu) ON_CPU=1 ;;
+          esac
 
           container_name=$(docker run \
             ${GPU_FLAG:-} \
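Taken together, these workflow hunks give aarch64 hosts a distinct identity: when no accelerator tool (nvidia-smi, rocm-smi, hl-smi) is found, uname -m decides between arm64-cpu and cpu; arm64-cpu then picks the -arm64-cpu Docker image suffix and still counts as a CPU run (ON_CPU=1). A minimal Python mirror of that fallback branch, for illustration only (the workflow itself does this in shell):

import platform

def device_name():
    # Mirrors the workflow's fallback: no accelerator was detected,
    # so the machine architecture decides the device label.
    arch = platform.machine().lower()
    return "arm64-cpu" if arch in ("aarch64", "arm64") else "cpu"

print(device_name())  # "arm64-cpu" on a Graviton/aarch64 host, "cpu" on x86_64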

LICENSE

Lines changed: 3 additions & 0 deletions
@@ -2,6 +2,9 @@ MIT License
 
 Copyright (c) Facebook, Inc. and its affiliates.
 
+All contributions by Arm:
+Copyright (c) 2025 Arm Limited and/or its affiliates
+
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+[
+  {
+    "test_name": "latency_llama8B_tp1",
+    "environment_variables": {
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 1,
+      "load_format": "dummy",
+      "num_iters_warmup": 5,
+      "num_iters": 15
+    }
+  },
+  {
+    "test_name": "latency_llama8B_tp4",
+    "environment_variables": {
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 4,
+      "load_format": "dummy",
+      "num_iters_warmup": 5,
+      "num_iters": 15
+    }
+  }
+]
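Each entry in this new latency config pairs environment variables with benchmark parameters. The harness that consumes these files is not part of this commit, so the sketch below only assumes the apparent convention (underscores become dashes, an empty string marks a bare flag); the latency-tests.json filename and benchmark_latency.py target are illustrative:

import json
import shlex

def to_command(test):
    """Render one config entry as an env-prefixed benchmark invocation."""
    env = " ".join(f"{k}={v}" for k, v in test["environment_variables"].items())
    flags = []
    for key, value in test["parameters"].items():
        flag = "--" + key.replace("_", "-")
        # An empty string value denotes a boolean switch with no argument.
        flags.append(flag if value == "" else f"{flag} {shlex.quote(str(value))}")
    return f"{env} python benchmark_latency.py {' '.join(flags)}"

with open("latency-tests.json") as f:  # illustrative filename
    for test in json.load(f):
        print(test["test_name"], "->", to_command(test))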
Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
+[
+  {
+    "test_name": "serving_llama8B_tp1_sharegpt",
+    "qps_list": [1, 4, 16, "inf"],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "server_parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 1,
+      "device": "cpu",
+      "dtype": "bfloat16",
+      "distributed_executor_backend": "mp",
+      "block_size": 16,
+      "trust_remote_code": "",
+      "disable_log_stats": "",
+      "disable_log_requests": "",
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "backend": "vllm",
+      "dataset_name": "sharegpt",
+      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200
+    }
+  },
+  {
+    "test_name": "serving_llama8B_tp2_sharegpt",
+    "qps_list": [1, 4, 16, "inf"],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "server_parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 2,
+      "device": "cpu",
+      "dtype": "bfloat16",
+      "distributed_executor_backend": "mp",
+      "block_size": 16,
+      "trust_remote_code": "",
+      "disable_log_stats": "",
+      "disable_log_requests": "",
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "backend": "vllm",
+      "dataset_name": "sharegpt",
+      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200
+    }
+  },
+  {
+    "test_name": "serving_llama8B_tp4_sharegpt",
+    "qps_list": [1, 4, 16, "inf"],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "server_parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 4,
+      "device": "cpu",
+      "dtype": "bfloat16",
+      "distributed_executor_backend": "mp",
+      "block_size": 16,
+      "trust_remote_code": "",
+      "disable_log_stats": "",
+      "disable_log_requests": "",
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "backend": "vllm",
+      "dataset_name": "sharegpt",
+      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200
+    }
+  },
+  {
+    "test_name": "serving_llama8B_tp4_random_1024_128",
+    "qps_list": [1, 4, 16, "inf"],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "server_parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 4,
+      "device": "cpu",
+      "dtype": "bfloat16",
+      "distributed_executor_backend": "mp",
+      "block_size": 16,
+      "trust_remote_code": "",
+      "enable_chunked_prefill": "",
+      "disable_log_stats": "",
+      "disable_log_requests": "",
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "backend": "vllm",
+      "dataset_name": "random",
+      "random-input-len": 1024,
+      "random-output-len": 128,
+      "ignore-eos": "",
+      "num_prompts": 100
+    }
+  }
+]
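One detail worth noting in these serving tests: qps_list mixes integers with the string "inf". A plausible reading (the consuming client is not in this commit) is that "inf" disables request pacing, so all prompts are issued at once:

import math

def request_rate(qps):
    # "inf" -> no pacing: fire all requests immediately.
    return math.inf if qps == "inf" else float(qps)

print([request_rate(q) for q in [1, 4, 16, "inf"]])  # [1.0, 4.0, 16.0, inf]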
Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+[
+  {
+    "test_name": "throughput_llama8B_tp1",
+    "environment_variables": {
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 1,
+      "load_format": "dummy",
+      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200,
+      "backend": "vllm"
+    }
+  },
+  {
+    "test_name": "throughput_llama8B_tp4",
+    "environment_variables": {
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 4,
+      "load_format": "dummy",
+      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200,
+      "backend": "vllm"
+    }
+  }
+]
