Skip to content

Commit ec0ac36

Browse files
committed
first draft to enable CPU benchmark
1 parent 319ad22 commit ec0ac36

File tree

6 files changed

+217
-6
lines changed

6 files changed

+217
-6
lines changed

.github/scripts/generate_vllm_benchmark_matrix.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@
3131
"linux.aws.h100.8",
3232
"linux.rocm.gpu.mi300.8",
3333
],
34+
2: [
35+
"intel-cpu-emr",
36+
],
3437
}
3538

3639
# All the different names vLLM uses to refer to their benchmark configs
@@ -81,6 +84,14 @@ def parse_args() -> Any:
8184
default="",
8285
help="the comma-separated list of GPUs to benchmark",
8386
)
87+
parser.add_argument(
88+
"--arch",
89+
type=str,
90+
default="",
91+
action=ValidateDir,
92+
    help="architecture of the runner",
93+
required=True,
94+
)
8495

8596
return parser.parse_args()
8697

@@ -107,7 +118,7 @@ def set_output(name: str, val: Any) -> None:
107118

108119

109120
def generate_benchmark_matrix(
110-
benchmark_configs_dir: str, models: List[str], gpus: List[str]
121+
benchmark_configs_dir: str, models: List[str], gpus: List[str], arch: str
111122
) -> Dict[str, Any]:
112123
"""
113124
Parse all the JSON files in vLLM benchmark configs directory to get the
@@ -119,8 +130,7 @@ def generate_benchmark_matrix(
119130
benchmark_matrix: Dict[str, Any] = {
120131
"include": [],
121132
}
122-
123-
for file in glob.glob(f"{benchmark_configs_dir}/*.json"):
133+
for file in glob.glob(f"{benchmark_configs_dir}/*{arch}.json"):
124134
with open(file) as f:
125135
try:
126136
configs = json.load(f)
@@ -180,6 +190,7 @@ def main() -> None:
180190
args.benchmark_configs_dir,
181191
models,
182192
gpus,
193+
args.arch,
183194
)
184195
set_output("benchmark_matrix", benchmark_matrix)
185196

.github/scripts/setup_vllm_benchmark.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,17 +61,25 @@ def parse_args() -> Any:
6161
help="the list of models to benchmark",
6262
required=True,
6363
)
64+
parser.add_argument(
65+
"--arch",
66+
type=str,
67+
default="",
68+
action=ValidateDir,
69+
    help="architecture of the runner",
70+
required=True,
71+
)
6472

6573
return parser.parse_args()
6674

6775

6876
def setup_benchmark_configs(
69-
from_benchmark_configs_dir: str, to_benchmark_configs_dir: str, models: List[str]
77+
from_benchmark_configs_dir: str, to_benchmark_configs_dir: str, models: List[str], arch: str
7078
) -> None:
7179
"""
7280
Setup the benchmark configs to run on this runner
7381
"""
74-
for file in glob.glob(f"{from_benchmark_configs_dir}/*.json"):
82+
for file in glob.glob(f"{from_benchmark_configs_dir}/*{arch}.json"):
7583
filename = os.path.basename(file)
7684
benchmark_configs = []
7785

@@ -108,6 +116,7 @@ def main() -> None:
108116
args.from_benchmark_configs_dir,
109117
args.to_benchmark_configs_dir,
110118
args.models.split(","),
119+
args.arch,
111120
)
112121

113122

.github/workflows/vllm-benchmark.yml

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ jobs:
5454
env:
5555
MODELS: ${{ inputs.models || '' }}
5656
GPUS: ${{ inputs.gpus || '' }}
57+
ARCH: ${{ inputs.arch || '' }}
5758
run: |
5859
set -eux
5960
@@ -62,6 +63,7 @@ jobs:
6263
--benchmark-configs-dir vllm-benchmarks/benchmarks \
6364
--models "${MODELS}" \
6465
    --gpus "${GPUS}" \
66+
--arch "${ARCH}"
6567
6668
benchmarks:
6769
name: Run vLLM benchmarks
@@ -197,6 +199,7 @@ jobs:
197199
- name: Setup benchmark tests
198200
env:
199201
MODELS: ${{ matrix.models }}
202+
ARCH: ${{ inputs.arch || '' }}
200203
run: |
201204
set -eux
202205
@@ -210,6 +213,7 @@ jobs:
210213
--from-benchmark-configs-dir vllm-benchmarks/benchmarks \
211214
--to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \
212215
    --models "${MODELS}" \
216+
--arch "${ARCH}"
213217
214218
pushd vllm-benchmarks/vllm
215219
ls -lah .buildkite/nightly-benchmarks/tests
@@ -225,9 +229,12 @@ jobs:
225229
# vLLM-related environment variables
226230
ENGINE_VERSION: v1
227231
SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1
232+
ARCH: ${{ inputs.arch || '' }}
228233
run: |
229234
set -x
230-
235+
if [[ "$ARCH" == "cpu" ]]; then
236+
on_cpu=1
237+
fi
231238
docker run \
232239
${GPU_FLAG:-} \
233240
${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
@@ -238,6 +245,7 @@ jobs:
238245
-e HF_TOKEN \
239246
-e ENGINE_VERSION \
240247
-e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \
248+
-e ON_CPU="${on_cpu}" \
241249
--ipc=host \
242250
--tty \
243251
--security-opt seccomp=unconfined \
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
[
2+
{
3+
"test_name": "latency_llama8B_tp1",
4+
"environment_variables": {
5+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
6+
"VLLM_CPU_KVCACHE_SPACE": 40
7+
},
8+
"parameters": {
9+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
10+
"tensor_parallel_size": 1,
11+
"load_format": "dummy",
12+
"num_iters_warmup": 5,
13+
"num_iters": 15
14+
}
15+
},
16+
{
17+
"test_name": "latency_llama8B_tp4",
18+
"environment_variables": {
19+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
20+
"VLLM_CPU_KVCACHE_SPACE": 40
21+
},
22+
"parameters": {
23+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
24+
"tensor_parallel_size": 4,
25+
"load_format": "dummy",
26+
"num_iters_warmup": 5,
27+
"num_iters": 15
28+
}
29+
}
30+
]
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
[
2+
{
3+
"test_name": "serving_llama8B_tp1_sharegpt",
4+
"qps_list": [1, 4, 16, "inf"],
5+
"server_environment_variables": {
6+
"VLLM_RPC_TIMEOUT": 100000,
7+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
8+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
9+
"VLLM_CPU_KVCACHE_SPACE": 40
10+
},
11+
"server_parameters": {
12+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
13+
"tensor_parallel_size": 1,
14+
"device": "cpu",
15+
"dtype": "bfloat16",
16+
"distributed_executor_backend": "mp",
17+
"block_size": 128,
18+
"trust_remote_code": "",
19+
"disable_log_stats": "",
20+
"disable_log_requests": "",
21+
"load_format": "dummy"
22+
},
23+
"client_parameters": {
24+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
25+
"backend": "vllm",
26+
"dataset_name": "sharegpt",
27+
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
28+
"num_prompts": 200
29+
}
30+
},
31+
{
32+
"test_name": "serving_llama8B_tp2_sharegpt",
33+
"qps_list": [1, 4, 16, "inf"],
34+
"server_environment_variables": {
35+
"VLLM_RPC_TIMEOUT": 100000,
36+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
37+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
38+
"VLLM_CPU_KVCACHE_SPACE": 40
39+
},
40+
"server_parameters": {
41+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
42+
"tensor_parallel_size": 2,
43+
"device": "cpu",
44+
"dtype": "bfloat16",
45+
"distributed_executor_backend": "mp",
46+
"block_size": 128,
47+
"trust_remote_code": "",
48+
"disable_log_stats": "",
49+
"disable_log_requests": "",
50+
"load_format": "dummy"
51+
},
52+
"client_parameters": {
53+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
54+
"backend": "vllm",
55+
"dataset_name": "sharegpt",
56+
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
57+
"num_prompts": 200
58+
}
59+
},
60+
{
61+
"test_name": "serving_llama8B_tp4_sharegpt",
62+
"qps_list": [1, 4, 16, "inf"],
63+
"server_environment_variables": {
64+
"VLLM_RPC_TIMEOUT": 100000,
65+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
66+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
67+
"VLLM_CPU_KVCACHE_SPACE": 40
68+
},
69+
"server_parameters": {
70+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
71+
"tensor_parallel_size": 4,
72+
"device": "cpu",
73+
"dtype": "bfloat16",
74+
"distributed_executor_backend": "mp",
75+
"block_size": 128,
76+
"trust_remote_code": "",
77+
"disable_log_stats": "",
78+
"disable_log_requests": "",
79+
"load_format": "dummy"
80+
},
81+
"client_parameters": {
82+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
83+
"backend": "vllm",
84+
"dataset_name": "sharegpt",
85+
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
86+
"num_prompts": 200
87+
}
88+
},
89+
{
90+
"test_name": "serving_llama8B_tp4_random_1024_128",
91+
"qps_list": [1, 4, 16, "inf"],
92+
"server_environment_variables": {
93+
"VLLM_RPC_TIMEOUT": 100000,
94+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
95+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
96+
"VLLM_CPU_KVCACHE_SPACE": 40
97+
},
98+
"server_parameters": {
99+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
100+
"tensor_parallel_size": 4,
101+
"device": "cpu",
102+
"dtype": "bfloat16",
103+
"distributed_executor_backend": "mp",
104+
"block_size": 128,
105+
"trust_remote_code": "",
106+
"enable_chunked_prefill": "",
107+
"disable_log_stats": "",
108+
"disable_log_requests": "",
109+
"load_format": "dummy"
110+
},
111+
"client_parameters": {
112+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
113+
"backend": "vllm",
114+
"dataset_name": "random",
115+
"random-input-len": 1024,
116+
"random-output-len": 128,
117+
"ignore-eos": "",
118+
"num_prompts": 100
119+
}
120+
}
121+
]
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
[
2+
{
3+
"test_name": "throughput_llama8B_tp1",
4+
"environment_variables": {
5+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
6+
"VLLM_CPU_KVCACHE_SPACE": 40
7+
},
8+
"parameters": {
9+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
10+
"tensor_parallel_size": 1,
11+
"load_format": "dummy",
12+
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
13+
"num_prompts": 200,
14+
"backend": "vllm"
15+
}
16+
},
17+
{
18+
"test_name": "throughput_llama8B_tp4",
19+
"environment_variables": {
20+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
21+
"VLLM_CPU_KVCACHE_SPACE": 40
22+
},
23+
"parameters": {
24+
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
25+
"tensor_parallel_size": 4,
26+
"load_format": "dummy",
27+
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
28+
"num_prompts": 200,
29+
"backend": "vllm"
30+
}
31+
}
32+
]

0 commit comments

Comments
 (0)