
Commit 81c4dc6

Add benchmarks for pytorch models (#90)
* Add latency benchmarks for pytorch models
* Install torchao
* Add torchao quantization param
* Update load data format to auto, dummy not supported in torchao
* Add fbgemm install
* Update
* Remove CUDA-specific installation step for torchao
  Removed conditional installation of 'torchao' for CUDA.
* Add torch and torchao
* pin torch and torchao
* add torchao install to bash
* fix tokenizer
* fix fbgemm download
* updates
* updates
1 parent: adef44d

File tree

5 files changed: +176 −1 lines changed

  .github/scripts/requirements.txt
  .github/workflows/vllm-benchmark.yml
  vllm-benchmarks/benchmarks/cuda/latency-tests.json
  vllm-benchmarks/benchmarks/cuda/serving-tests.json
  vllm-benchmarks/benchmarks/cuda/throughput-tests.json

.github/scripts/requirements.txt

Lines changed: 1 addition & 1 deletion

  @@ -4,4 +4,4 @@ psutil==7.0.0
   pynvml==12.0.0
   boto3==1.36.21
   awscli==1.37.21
  -torch==2.7.1
  +torch==2.9.0
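
The pin moves CI from torch 2.7.1 to 2.9.0 alongside the torchao-based tests added below ("pin torch and torchao" in the commit message). To reproduce the CI environment locally, a minimal sketch using the same file path as in this repo:

  # Install the pinned CI dependencies, including the updated torch==2.9.0
  pip install -r .github/scripts/requirements.txt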

.github/workflows/vllm-benchmark.yml

Lines changed: 3 additions & 0 deletions

  @@ -303,6 +303,9 @@ jobs:
             -w /tmp/workspace \
             "${DOCKER_IMAGE}"
           )
  +        if [[ "${DEVICE_NAME}" == "cuda" ]]; then
  +          docker exec -t "${container_name}" bash -c "pip install torchao fbgemm-gpu-genai"
  +        fi
           docker exec -t "${container_name}" bash -c "cd vllm-benchmarks/vllm && bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh"
   
       - name: Authenticate with AWS
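
The torchao and fbgemm-gpu-genai wheels are installed into the already-running container (CUDA only) rather than baked into the Docker image. A hypothetical sanity check, not part of this commit, that could run before the benchmark script to confirm the install took effect:

  # Illustrative only: verify torchao is importable inside the benchmark
  # container; container_name is set by the workflow step above.
  docker exec -t "${container_name}" bash -c "python -c 'import torchao; print(torchao.__version__)'"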

vllm-benchmarks/benchmarks/cuda/latency-tests.json

Lines changed: 40 additions & 0 deletions

  @@ -138,5 +138,45 @@
         "num_iters": 15,
         "max_model_len": 8192
       }
  +  },
  +  {
  +    "test_name": "latency_gemma3_12b_it_fp8_torchao",
  +    "parameters": {
  +      "model": "pytorch/gemma-3-12b-it-FP8",
  +      "quantization": "torchao",
  +      "load_format": "auto",
  +      "num_iters_warmup": 5,
  +      "num_iters": 15
  +    }
  +  },
  +  {
  +    "test_name": "latency_gemma3_12b_it_int4_torchao",
  +    "parameters": {
  +      "model": "pytorch/gemma-3-12b-it-INT4",
  +      "quantization": "torchao",
  +      "load_format": "auto",
  +      "num_iters_warmup": 5,
  +      "num_iters": 15
  +    }
  +  },
  +  {
  +    "test_name": "latency_gemma3_27b_it_fp8_torchao",
  +    "parameters": {
  +      "model": "pytorch/gemma-3-27b-it-FP8",
  +      "quantization": "torchao",
  +      "load_format": "auto",
  +      "num_iters_warmup": 5,
  +      "num_iters": 15
  +    }
  +  },
  +  {
  +    "test_name": "latency_gemma3_27b_it_int4_torchao",
  +    "parameters": {
  +      "model": "pytorch/gemma-3-27b-it-INT4",
  +      "quantization": "torchao",
  +      "load_format": "auto",
  +      "num_iters_warmup": 5,
  +      "num_iters": 15
  +    }
     }
   ]
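
Each key under "parameters" is turned into a CLI flag by the benchmark harness. A hand-expanded sketch of the first new entry, assuming the standard vLLM latency benchmark script and the usual key-to-flag mapping (illustrative, not part of this commit):

  # Roughly what latency_gemma3_12b_it_fp8_torchao expands to
  python benchmarks/benchmark_latency.py \
      --model pytorch/gemma-3-12b-it-FP8 \
      --quantization torchao \
      --load-format auto \
      --num-iters-warmup 5 \
      --num-iters 15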

vllm-benchmarks/benchmarks/cuda/serving-tests.json

Lines changed: 88 additions & 0 deletions

  @@ -552,5 +552,93 @@
         "random_input_len": 5250,
         "random_output_len": 8250
       }
  +  },
  +  {
  +    "test_name": "serving_gemma3_12b_it_fp8_torchao",
  +    "qps_list": [1, 4, 16, "inf"],
  +    "server_parameters": {
  +      "model": "pytorch/gemma-3-12b-it-FP8",
  +      "tokenizer": "google/gemma-3-12b-it",
  +      "quantization": "torchao",
  +      "tensor_parallel_size": 1,
  +      "swap_space": 16,
  +      "disable_log_stats": "",
  +      "disable_log_requests": "",
  +      "load_format": "auto"
  +    },
  +    "client_parameters": {
  +      "model": "pytorch/gemma-3-12b-it-FP8",
  +      "backend": "vllm",
  +      "dataset_name": "random",
  +      "num_prompts": 200,
  +      "random_input_len": 1024,
  +      "random_output_len": 2048
  +    }
  +  },
  +  {
  +    "test_name": "serving_gemma3_12b_it_int4_torchao",
  +    "qps_list": [1, 4, 16, "inf"],
  +    "server_parameters": {
  +      "model": "pytorch/gemma-3-12b-it-INT4",
  +      "tokenizer": "google/gemma-3-12b-it",
  +      "quantization": "torchao",
  +      "tensor_parallel_size": 1,
  +      "swap_space": 16,
  +      "disable_log_stats": "",
  +      "disable_log_requests": "",
  +      "load_format": "auto"
  +    },
  +    "client_parameters": {
  +      "model": "pytorch/gemma-3-12b-it-INT4",
  +      "backend": "vllm",
  +      "dataset_name": "random",
  +      "num_prompts": 200,
  +      "random_input_len": 1024,
  +      "random_output_len": 2048
  +    }
  +  },
  +  {
  +    "test_name": "serving_gemma3_27b_it_fp8_torchao",
  +    "qps_list": [1, 4, 16, "inf"],
  +    "server_parameters": {
  +      "model": "pytorch/gemma-3-27b-it-FP8",
  +      "tokenizer": "google/gemma-3-27b-it",
  +      "quantization": "torchao",
  +      "tensor_parallel_size": 1,
  +      "swap_space": 16,
  +      "disable_log_stats": "",
  +      "disable_log_requests": "",
  +      "load_format": "auto"
  +    },
  +    "client_parameters": {
  +      "model": "pytorch/gemma-3-27b-it-FP8",
  +      "backend": "vllm",
  +      "dataset_name": "random",
  +      "num_prompts": 200,
  +      "random_input_len": 1024,
  +      "random_output_len": 2048
  +    }
  +  },
  +  {
  +    "test_name": "serving_gemma3_27b_it_int4_torchao",
  +    "qps_list": [1, 4, 16, "inf"],
  +    "server_parameters": {
  +      "model": "pytorch/gemma-3-27b-it-INT4",
  +      "tokenizer": "google/gemma-3-27b-it",
  +      "quantization": "torchao",
  +      "tensor_parallel_size": 1,
  +      "swap_space": 16,
  +      "disable_log_stats": "",
  +      "disable_log_requests": "",
  +      "load_format": "auto"
  +    },
  +    "client_parameters": {
  +      "model": "pytorch/gemma-3-27b-it-INT4",
  +      "backend": "vllm",
  +      "dataset_name": "random",
  +      "num_prompts": 200,
  +      "random_input_len": 1024,
  +      "random_output_len": 2048
  +    }
     }
   ]
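
For each serving test, the harness launches a vLLM server from "server_parameters", then drives it with a client built from "client_parameters" once per rate in "qps_list". A hand-expanded sketch of the 12B FP8 entry, assuming the usual key-to-flag mapping (illustrative, not part of this commit):

  # Server side, derived from server_parameters
  vllm serve pytorch/gemma-3-12b-it-FP8 \
      --tokenizer google/gemma-3-12b-it \
      --quantization torchao \
      --tensor-parallel-size 1 \
      --swap-space 16 \
      --disable-log-stats \
      --disable-log-requests \
      --load-format auto

  # Client side, derived from client_parameters; repeated for each
  # request rate in qps_list (1, 4, 16, inf)
  python benchmarks/benchmark_serving.py \
      --backend vllm \
      --model pytorch/gemma-3-12b-it-FP8 \
      --dataset-name random \
      --num-prompts 200 \
      --random-input-len 1024 \
      --random-output-len 2048 \
      --request-rate 1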

vllm-benchmarks/benchmarks/cuda/throughput-tests.json

Lines changed: 44 additions & 0 deletions

  @@ -151,5 +151,49 @@
         "backend": "vllm",
         "max_model_len": 8192
       }
  +  },
  +  {
  +    "test_name": "throughput_gemma3_12b_it_fp8_torchao",
  +    "parameters": {
  +      "model": "pytorch/gemma-3-12b-it-FP8",
  +      "quantization": "torchao",
  +      "load_format": "auto",
  +      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
  +      "num_prompts": 200,
  +      "backend": "vllm"
  +    }
  +  },
  +  {
  +    "test_name": "throughput_gemma3_12b_it_int4_torchao",
  +    "parameters": {
  +      "model": "pytorch/gemma-3-12b-it-INT4",
  +      "quantization": "torchao",
  +      "load_format": "auto",
  +      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
  +      "num_prompts": 200,
  +      "backend": "vllm"
  +    }
  +  },
  +  {
  +    "test_name": "throughput_gemma3_27b_it_fp8_torchao",
  +    "parameters": {
  +      "model": "pytorch/gemma-3-27b-it-FP8",
  +      "quantization": "torchao",
  +      "load_format": "auto",
  +      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
  +      "num_prompts": 200,
  +      "backend": "vllm"
  +    }
  +  },
  +  {
  +    "test_name": "throughput_gemma3_27b_it_int4_torchao",
  +    "parameters": {
  +      "model": "pytorch/gemma-3-27b-it-INT4",
  +      "quantization": "torchao",
  +      "load_format": "auto",
  +      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
  +      "num_prompts": 200,
  +      "backend": "vllm"
  +    }
     }
   ]
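
The throughput tests replay 200 ShareGPT prompts through the offline engine. A hand-expanded sketch of the 12B FP8 entry under the same assumptions as above (illustrative, not part of this commit):

  # Roughly what throughput_gemma3_12b_it_fp8_torchao expands to; the
  # ShareGPT dataset file is expected in the working directory.
  python benchmarks/benchmark_throughput.py \
      --backend vllm \
      --model pytorch/gemma-3-12b-it-FP8 \
      --quantization torchao \
      --load-format auto \
      --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \
      --num-prompts 200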
