
Commit 81c4dc6

Add benchmarks for pytorch models (#90)
* Add latency benchmarks for pytorch models
* Install torchao
* Add torchao quantization param
* Update load data format to auto, dummy not supported in torchao
* Add fbgemm install
* Update
* Remove CUDA-specific installation step for torchao
  Removed conditional installation of 'torchao' for CUDA.
* Add torch and torchao
* pin torch and torchao
* add torchao install to bash
* fix tokenizer
* fix fbgemm download
* updates
* updates
1 parent: adef44d

File tree

5 files changed: +176 −1 lines changed

  .github/scripts/requirements.txt
  .github/workflows/vllm-benchmark.yml
  vllm-benchmarks/benchmarks/cuda/latency-tests.json
  vllm-benchmarks/benchmarks/cuda/serving-tests.json
  vllm-benchmarks/benchmarks/cuda/throughput-tests.json

.github/scripts/requirements.txt

Lines changed: 1 addition & 1 deletion

  @@ -4,4 +4,4 @@ psutil==7.0.0
   pynvml==12.0.0
   boto3==1.36.21
   awscli==1.37.21
  -torch==2.7.1
  +torch==2.9.0
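
The pin moves CI from torch 2.7.1 to 2.9.0 alongside the torchao-based tests added below ("pin torch and torchao" in the commit message). To reproduce the CI environment locally, a minimal sketch using the same file path as in this repo:

  # Install the pinned CI dependencies, including the updated torch==2.9.0
  pip install -r .github/scripts/requirements.txt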

.github/workflows/vllm-benchmark.yml

Lines changed: 3 additions & 0 deletions

  @@ -303,6 +303,9 @@ jobs:
             -w /tmp/workspace \
             "${DOCKER_IMAGE}"
           )
  +        if [[ "${DEVICE_NAME}" == "cuda" ]]; then
  +          docker exec -t "${container_name}" bash -c "pip install torchao fbgemm-gpu-genai"
  +        fi
           docker exec -t "${container_name}" bash -c "cd vllm-benchmarks/vllm && bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh"
   
       - name: Authenticate with AWS
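
The torchao and fbgemm-gpu-genai wheels are installed into the already-running container (CUDA only) rather than baked into the Docker image. A hypothetical sanity check, not part of this commit, that could run before the benchmark script to confirm the install took effect:

  # Illustrative only: verify torchao is importable inside the benchmark
  # container; container_name is set by the workflow step above.
  docker exec -t "${container_name}" bash -c "python -c 'import torchao; print(torchao.__version__)'"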

vllm-benchmarks/benchmarks/cuda/latency-tests.json

Lines changed: 40 additions & 0 deletions

  @@ -138,5 +138,45 @@
         "num_iters": 15,
         "max_model_len": 8192
       }
  +  },
  +  {
  +    "test_name": "latency_gemma3_12b_it_fp8_torchao",
  +    "parameters": {
  +      "model": "pytorch/gemma-3-12b-it-FP8",
  +      "quantization": "torchao",
  +      "load_format": "auto",
  +      "num_iters_warmup": 5,
  +      "num_iters": 15
  +    }
  +  },
  +  {
  +    "test_name": "latency_gemma3_12b_it_int4_torchao",
  +    "parameters": {
  +      "model": "pytorch/gemma-3-12b-it-INT4",
  +      "quantization": "torchao",
  +      "load_format": "auto",
  +      "num_iters_warmup": 5,
  +      "num_iters": 15
  +    }
  +  },
  +  {
  +    "test_name": "latency_gemma3_27b_it_fp8_torchao",
  +    "parameters": {
  +      "model": "pytorch/gemma-3-27b-it-FP8",
  +      "quantization": "torchao",
  +      "load_format": "auto",
  +      "num_iters_warmup": 5,
  +      "num_iters": 15
  +    }
  +  },
  +  {
  +    "test_name": "latency_gemma3_27b_it_int4_torchao",
  +    "parameters": {
  +      "model": "pytorch/gemma-3-27b-it-INT4",
  +      "quantization": "torchao",
  +      "load_format": "auto",
  +      "num_iters_warmup": 5,
  +      "num_iters": 15
  +    }
     }
   ]
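
Each key under "parameters" is turned into a CLI flag by the benchmark harness. A hand-expanded sketch of the first new entry, assuming the standard vLLM latency benchmark script and the usual key-to-flag mapping (illustrative, not part of this commit):

  # Roughly what latency_gemma3_12b_it_fp8_torchao expands to
  python benchmarks/benchmark_latency.py \
      --model pytorch/gemma-3-12b-it-FP8 \
      --quantization torchao \
      --load-format auto \
      --num-iters-warmup 5 \
      --num-iters 15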

vllm-benchmarks/benchmarks/cuda/serving-tests.json

Lines changed: 88 additions & 0 deletions

  @@ -552,5 +552,93 @@
         "random_input_len": 5250,
         "random_output_len": 8250
       }
  +  },
  +  {
  +    "test_name": "serving_gemma3_12b_it_fp8_torchao",
  +    "qps_list": [1, 4, 16, "inf"],
  +    "server_parameters": {
  +      "model": "pytorch/gemma-3-12b-it-FP8",
  +      "tokenizer": "google/gemma-3-12b-it",
  +      "quantization": "torchao",
  +      "tensor_parallel_size": 1,
  +      "swap_space": 16,
  +      "disable_log_stats": "",
  +      "disable_log_requests": "",
  +      "load_format": "auto"
  +    },
  +    "client_parameters": {
  +      "model": "pytorch/gemma-3-12b-it-FP8",
  +      "backend": "vllm",
  +      "dataset_name": "random",
  +      "num_prompts": 200,
  +      "random_input_len": 1024,
  +      "random_output_len": 2048
  +    }
  +  },
  +  {
  +    "test_name": "serving_gemma3_12b_it_int4_torchao",
  +    "qps_list": [1, 4, 16, "inf"],
  +    "server_parameters": {
  +      "model": "pytorch/gemma-3-12b-it-INT4",
  +      "tokenizer": "google/gemma-3-12b-it",
  +      "quantization": "torchao",
  +      "tensor_parallel_size": 1,
  +      "swap_space": 16,
  +      "disable_log_stats": "",
  +      "disable_log_requests": "",
  +      "load_format": "auto"
  +    },
  +    "client_parameters": {
  +      "model": "pytorch/gemma-3-12b-it-INT4",
  +      "backend": "vllm",
  +      "dataset_name": "random",
  +      "num_prompts": 200,
  +      "random_input_len": 1024,
  +      "random_output_len": 2048
  +    }
  +  },
  +  {
  +    "test_name": "serving_gemma3_27b_it_fp8_torchao",
  +    "qps_list": [1, 4, 16, "inf"],
  +    "server_parameters": {
  +      "model": "pytorch/gemma-3-27b-it-FP8",
  +      "tokenizer": "google/gemma-3-27b-it",
  +      "quantization": "torchao",
  +      "tensor_parallel_size": 1,
  +      "swap_space": 16,
  +      "disable_log_stats": "",
  +      "disable_log_requests": "",
  +      "load_format": "auto"
  +    },
  +    "client_parameters": {
  +      "model": "pytorch/gemma-3-27b-it-FP8",
  +      "backend": "vllm",
  +      "dataset_name": "random",
  +      "num_prompts": 200,
  +      "random_input_len": 1024,
  +      "random_output_len": 2048
  +    }
  +  },
  +  {
  +    "test_name": "serving_gemma3_27b_it_int4_torchao",
  +    "qps_list": [1, 4, 16, "inf"],
  +    "server_parameters": {
  +      "model": "pytorch/gemma-3-27b-it-INT4",
  +      "tokenizer": "google/gemma-3-27b-it",
  +      "quantization": "torchao",
  +      "tensor_parallel_size": 1,
  +      "swap_space": 16,
  +      "disable_log_stats": "",
  +      "disable_log_requests": "",
  +      "load_format": "auto"
  +    },
  +    "client_parameters": {
  +      "model": "pytorch/gemma-3-27b-it-INT4",
  +      "backend": "vllm",
  +      "dataset_name": "random",
  +      "num_prompts": 200,
  +      "random_input_len": 1024,
  +      "random_output_len": 2048
  +    }
     }
   ]
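
For each serving test, the harness launches a vLLM server from "server_parameters", then drives it with a client built from "client_parameters" once per rate in "qps_list". A hand-expanded sketch of the 12B FP8 entry, assuming the usual key-to-flag mapping (illustrative, not part of this commit):

  # Server side, derived from server_parameters
  vllm serve pytorch/gemma-3-12b-it-FP8 \
      --tokenizer google/gemma-3-12b-it \
      --quantization torchao \
      --tensor-parallel-size 1 \
      --swap-space 16 \
      --disable-log-stats \
      --disable-log-requests \
      --load-format auto

  # Client side, derived from client_parameters; repeated for each
  # request rate in qps_list (1, 4, 16, inf)
  python benchmarks/benchmark_serving.py \
      --backend vllm \
      --model pytorch/gemma-3-12b-it-FP8 \
      --dataset-name random \
      --num-prompts 200 \
      --random-input-len 1024 \
      --random-output-len 2048 \
      --request-rate 1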

vllm-benchmarks/benchmarks/cuda/throughput-tests.json

Lines changed: 44 additions & 0 deletions

  @@ -151,5 +151,49 @@
         "backend": "vllm",
         "max_model_len": 8192
       }
  +  },
  +  {
  +    "test_name": "throughput_gemma3_12b_it_fp8_torchao",
  +    "parameters": {
  +      "model": "pytorch/gemma-3-12b-it-FP8",
  +      "quantization": "torchao",
  +      "load_format": "auto",
  +      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
  +      "num_prompts": 200,
  +      "backend": "vllm"
  +    }
  +  },
  +  {
  +    "test_name": "throughput_gemma3_12b_it_int4_torchao",
  +    "parameters": {
  +      "model": "pytorch/gemma-3-12b-it-INT4",
  +      "quantization": "torchao",
  +      "load_format": "auto",
  +      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
  +      "num_prompts": 200,
  +      "backend": "vllm"
  +    }
  +  },
  +  {
  +    "test_name": "throughput_gemma3_27b_it_fp8_torchao",
  +    "parameters": {
  +      "model": "pytorch/gemma-3-27b-it-FP8",
  +      "quantization": "torchao",
  +      "load_format": "auto",
  +      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
  +      "num_prompts": 200,
  +      "backend": "vllm"
  +    }
  +  },
  +  {
  +    "test_name": "throughput_gemma3_27b_it_int4_torchao",
  +    "parameters": {
  +      "model": "pytorch/gemma-3-27b-it-INT4",
  +      "quantization": "torchao",
  +      "load_format": "auto",
  +      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
  +      "num_prompts": 200,
  +      "backend": "vllm"
  +    }
     }
   ]
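
The throughput tests replay 200 ShareGPT prompts through the offline engine. A hand-expanded sketch of the 12B FP8 entry under the same assumptions as above (illustrative, not part of this commit):

  # Roughly what throughput_gemma3_12b_it_fp8_torchao expands to; the
  # ShareGPT dataset file is expected in the working directory.
  python benchmarks/benchmark_throughput.py \
      --backend vllm \
      --model pytorch/gemma-3-12b-it-FP8 \
      --quantization torchao \
      --load-format auto \
      --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \
      --num-prompts 200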
