diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py index b211049a..5ad78e5c 100755 --- a/.github/scripts/generate_vllm_benchmark_matrix.py +++ b/.github/scripts/generate_vllm_benchmark_matrix.py @@ -12,10 +12,11 @@ logging.basicConfig(level=logging.INFO) # Those are H100 runners from https://github.com/pytorch-labs/pytorch-gha-infra/blob/main/multi-tenant/inventory/manual_inventory # while ROCm runner are provided by AMD -RUNNERS_MAPPING = { +TP_TO_RUNNER_MAPPING = { 1: [ "linux.aws.h100", "linux.rocm.gpu.mi300.2", # No single ROCm GPU? + "linux.24xl.spr-metal", ], # NB: There is no 2xH100 runner at the momement, so let's use the next one # in the list here which is 4xH100 @@ -26,6 +27,8 @@ 4: [ "linux.aws.h100.4", "linux.rocm.gpu.mi300.4", + # TODO (huydhn): Enable this when Intel's runners are ready + # "intel-cpu-emr", ], 8: [ "linux.aws.h100.8", @@ -33,6 +36,17 @@ ], } +# This mapping is needed to find out the platform of the runner +RUNNER_TO_PLATFORM_MAPPING = { + "linux.aws.h100": "cuda", + "linux.aws.h100.4": "cuda", + "linux.aws.h100.8": "cuda", + "linux.rocm.gpu.mi300.2": "rocm", + "linux.rocm.gpu.mi300.4": "rocm", + "linux.rocm.gpu.mi300.8": "rocm", + "linux.24xl.spr-metal": "cpu", +} + # All the different names vLLM uses to refer to their benchmark configs VLLM_BENCHMARK_CONFIGS_PARAMETER = set( [ @@ -76,10 +90,11 @@ def parse_args() -> Any: help="the comma-separated list of models to benchmark", ) parser.add_argument( - "--gpus", + "--runners", type=str, default="", - help="the comma-separated list of GPUs to benchmark", + help="the comma-separated list of runners to run the benchmark", + required=True, ) return parser.parse_args() @@ -107,60 +122,76 @@ def set_output(name: str, val: Any) -> None: def generate_benchmark_matrix( - benchmark_configs_dir: str, models: List[str], gpus: List[str] + benchmark_configs_dir: str, models: List[str], runners: List[str] ) -> Dict[str, Any]: """ Parse all the JSON files in vLLM benchmark configs directory to get the - model name and tensor parallel size (aka number of GPUs) + model name and tensor parallel size (aka number of GPUs or CPU NUMA nodes) """ - use_all_gpus = True if not gpus else False benchmark_matrix: Dict[str, Any] = { "include": [], } - selected_models = [] - for file in glob.glob(f"{benchmark_configs_dir}/*.json"): - with open(file) as f: - try: - configs = json.load(f) - except json.JSONDecodeError as e: - warning(f"Fail to load {file}: {e}") - continue - - for config in configs: - param = list(VLLM_BENCHMARK_CONFIGS_PARAMETER & set(config.keys())) - assert len(param) == 1 - - benchmark_config = config[param[0]] - if "model" not in benchmark_config: - warning(f"Model name is not set in {benchmark_config}, skipping...") - continue - model = benchmark_config["model"].lower() - - # Dedup - if model in selected_models: - continue - # and only choose the selected model: - if models and model not in models: - continue - selected_models.append(model) - - if "tensor_parallel_size" in benchmark_config: - tp = benchmark_config["tensor_parallel_size"] - elif "tp" in benchmark_config: - tp = benchmark_config["tp"] - else: - tp = 8 - assert tp in RUNNERS_MAPPING - - for runner in RUNNERS_MAPPING[tp]: - found_runner = False - for gpu in gpus: - if gpu.lower() in runner: - found_runner = True - break - - if found_runner or use_all_gpus: + platforms = set() + if not runners: + use_all_runners = True + platforms = set(v for v in 
RUNNER_TO_PLATFORM_MAPPING.values()) + else: + use_all_runners = False + for k, v in RUNNER_TO_PLATFORM_MAPPING.items(): + for r in runners: + if r.lower() in k: + platforms.add(v) + + # Gather all possible benchmarks + for platform in sorted(platforms): + selected_models = [] + for file in glob.glob(f"{benchmark_configs_dir}/{platform}/*.json"): + with open(file) as f: + try: + configs = json.load(f) + except json.JSONDecodeError as e: + warning(f"Fail to load {file}: {e}") + continue + + for config in configs: + param = list(VLLM_BENCHMARK_CONFIGS_PARAMETER & set(config.keys())) + assert len(param) == 1 + + benchmark_config = config[param[0]] + if "model" not in benchmark_config: + warning(f"Model name is not set in {benchmark_config}, skipping...") + continue + model = benchmark_config["model"].lower() + + # Dedup + if model in selected_models: + continue + # and only choose the selected model: + if models and model not in models: + continue + selected_models.append(model) + + if "tensor_parallel_size" in benchmark_config: + tp = benchmark_config["tensor_parallel_size"] + elif "tp" in benchmark_config: + tp = benchmark_config["tp"] + else: + tp = 8 + assert tp in TP_TO_RUNNER_MAPPING + + for runner in TP_TO_RUNNER_MAPPING[tp]: + # Wrong platform + if ( + runner not in RUNNER_TO_PLATFORM_MAPPING + or RUNNER_TO_PLATFORM_MAPPING[runner] != platform + ): + continue + + found_runner = any([r and r.lower() in runner for r in runners]) + if not found_runner and not use_all_runners: + continue + benchmark_matrix["include"].append( { "runner": runner, @@ -176,11 +207,11 @@ def generate_benchmark_matrix( def main() -> None: args = parse_args() models = [m.strip().lower() for m in args.models.split(",") if m.strip()] - gpus = [m.strip().lower() for m in args.gpus.split(",") if m.strip()] + runners = [m.strip().lower() for m in args.runners.split(",") if m.strip()] benchmark_matrix = generate_benchmark_matrix( args.benchmark_configs_dir, models, - gpus, + runners, ) set_output("benchmark_matrix", benchmark_matrix) diff --git a/.github/scripts/setup_vllm_benchmark.py b/.github/scripts/setup_vllm_benchmark.py index 98bfa17d..e1edc30a 100755 --- a/.github/scripts/setup_vllm_benchmark.py +++ b/.github/scripts/setup_vllm_benchmark.py @@ -61,17 +61,27 @@ def parse_args() -> Any: help="the list of models to benchmark", required=True, ) + parser.add_argument( + "--device", + type=str, + default="", + help="device for the runner", + required=True, + ) return parser.parse_args() def setup_benchmark_configs( - from_benchmark_configs_dir: str, to_benchmark_configs_dir: str, models: List[str] + from_benchmark_configs_dir: str, + to_benchmark_configs_dir: str, + models: List[str], + device: str, ) -> None: """ Setup the benchmark configs to run on this runner """ - for file in glob.glob(f"{from_benchmark_configs_dir}/*.json"): + for file in glob.glob(f"{from_benchmark_configs_dir}/{device}/*.json"): filename = os.path.basename(file) benchmark_configs = [] @@ -108,6 +118,7 @@ def main() -> None: args.from_benchmark_configs_dir, args.to_benchmark_configs_dir, args.models.split(","), + args.device, ) diff --git a/.github/scripts/test_generate_vllm_benchmark_matrix.py b/.github/scripts/test_generate_vllm_benchmark_matrix.py new file mode 100644 index 00000000..73c7672c --- /dev/null +++ b/.github/scripts/test_generate_vllm_benchmark_matrix.py @@ -0,0 +1,420 @@ +import os +import json + +from expecttest import assert_expected_inline +from generate_vllm_benchmark_matrix import generate_benchmark_matrix + 
+BENCHMARK_CONFIG_DIRS = os.path.join( + os.path.dirname(__file__), "..", "..", "vllm-benchmarks", "benchmarks" +) + + +def test_generate_benchmark_matrix(): + # All combinations, no duplication + models = [] + runners = [] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.24xl.spr-metal", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/meta-llama-3.1-70b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/llama-4-scout-17b-16e-instruct" + }, + { + "runner": "linux.aws.h100.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.4", + "models": "meta-llama/meta-llama-3.1-70b-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + }, + { + "runner": "linux.rocm.gpu.mi300.4", + "models": "meta-llama/llama-4-scout-17b-16e-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + } + ] +}""", + ) + + # Select a model + models = ["meta-llama/meta-llama-3.1-8b-instruct"] + runners = [] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.24xl.spr-metal", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + } + ] +}""", + ) + + # Select multiple models + models = [ + "meta-llama/meta-llama-3.1-8b-instruct", + "meta-llama/llama-4-maverick-17b-128e-instruct-fp8", + ] + runners = [] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.24xl.spr-metal", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + } + ] +}""", + ) + + # Select non-existing models + models = ["meta-llama/meta-llama-3.1-8b-instruct", "do-not-exist"] + runners = [] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.24xl.spr-metal", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + } + ] +}""", + ) + + # Select non-existing models + models = 
["meta-llama/meta-llama-3.1-8b-instruct", ""] + runners = [] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.24xl.spr-metal", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + } + ] +}""", + ) + + # Select a runner + models = [] + runners = ["h100"] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/meta-llama-3.1-70b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/llama-4-scout-17b-16e-instruct" + }, + { + "runner": "linux.aws.h100.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + } + ] +}""", + ) + + # Select multiple runners + models = [] + runners = ["h100", "spr"] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.24xl.spr-metal", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/meta-llama-3.1-70b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/llama-4-scout-17b-16e-instruct" + }, + { + "runner": "linux.aws.h100.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + } + ] +}""", + ) + + # Select non-existing runners + models = [] + runners = ["h100", "do-not-exist"] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/meta-llama-3.1-70b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/llama-4-scout-17b-16e-instruct" + }, + { + "runner": "linux.aws.h100.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + } + ] +}""", + ) + + # Select non-existing runners + models = [] + runners = ["h100", ""] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/meta-llama-3.1-70b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/llama-4-scout-17b-16e-instruct" + }, + { + "runner": "linux.aws.h100.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + } + ] +}""", + ) + + # 
Select a model and a runner + models = ["meta-llama/meta-llama-3.1-8b-instruct"] + runners = ["h100"] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + } + ] +}""", + ) + + # Select multiple models and runners + models = [ + "meta-llama/meta-llama-3.1-8b-instruct", + "mistralai/mixtral-8x7b-instruct-v0.1", + ] + runners = ["rocm", "spr"] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.24xl.spr-metal", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + } + ] +}""", + ) + + # Select non-existing models and runners + models = ["meta-llama/meta-llama-3.1-8b-instruct", "do-not-exist"] + runners = ["rocm", "do-not-exist"] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + } + ] +}""", + ) + + # Select non-existing models and runners + models = ["meta-llama/meta-llama-3.1-8b-instruct", ""] + runners = ["rocm", ""] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + } + ] +}""", + ) diff --git a/.github/scripts/upload_benchmark_results.py b/.github/scripts/upload_benchmark_results.py index 751461bc..39c5b11a 100755 --- a/.github/scripts/upload_benchmark_results.py +++ b/.github/scripts/upload_benchmark_results.py @@ -88,10 +88,16 @@ def parse_args() -> Any: # Device info parser.add_argument( - "--device", + "--device-name", type=str, required=True, - help="the name of the GPU device coming from nvidia-smi or amd-smi", + help="the name of the benchmark device", + ) + parser.add_argument( + "--device-type", + type=str, + required=True, + help="the type of the benchmark device coming from nvidia-smi, amd-smi, or lscpu", ) # Optional suffix @@ -112,7 +118,9 @@ def get_git_metadata(repo_dir: str) -> Tuple[str, str]: repo = Repo(repo_dir) # Git metadata, an example remote URL is https://github.com/vllm-project/vllm.git # and we want the vllm-project/vllm part - repo_name = repo.remotes.origin.url.split(".git")[0].replace("https://github.com/", "") + repo_name = repo.remotes.origin.url.split(".git")[0].replace( + "https://github.com/", "" + ) hexsha = repo.head.object.hexsha committed_date = repo.head.object.committed_date @@ -144,25 +152,34 @@ def get_benchmark_metadata( } -def get_runner_info() -> Dict[str, Any]: - if torch.cuda.is_available() and torch.version.hip: - name = "rocm" - elif torch.cuda.is_available() and torch.version.cuda: - name = "cuda" +def get_runner_info(device_name: str, device_type: str) -> Dict[str, Any]: + if torch.cuda.is_available(): + if torch.version.hip: + name = "rocm" + elif torch.version.cuda: + name = "cuda" + type = torch.cuda.get_device_name() + gpu_info = 
torch.cuda.get_device_name() + gpu_count = torch.cuda.device_count() + avail_gpu_mem_in_gb = int( + torch.cuda.get_device_properties(0).total_memory / (1024 * 1024 * 1024) + ) else: - name = "unknown" + name = device_name + type = device_type + gpu_info = "" + gpu_count = 0 + avail_gpu_mem_in_gb = 0 return { "name": name, - "type": torch.cuda.get_device_name(), + "type": type, "cpu_info": platform.processor(), "cpu_count": psutil.cpu_count(), "avail_mem_in_gb": int(psutil.virtual_memory().total / (1024 * 1024 * 1024)), - "gpu_info": torch.cuda.get_device_name(), - "gpu_count": torch.cuda.device_count(), - "avail_gpu_mem_in_gb": int( - torch.cuda.get_device_properties(0).total_memory / (1024 * 1024 * 1024) - ), + "gpu_info": gpu_info, + "gpu_count": gpu_count, + "avail_gpu_mem_in_gb": avail_gpu_mem_in_gb, "extra_info": { "hostname": socket.gethostname(), }, @@ -270,12 +287,12 @@ def upload( head_branch: str, head_sha: str, aggregated_results: List[Dict[str, Any]], - device: str, + device_type: str, model: str, dry_run: bool = True, ) -> None: model_suffix = f"_{model}" if model else "" - s3_path = f"v3/{repo_name}/{head_branch}/{head_sha}/{device}/benchmark_results{model_suffix}.json" + s3_path = f"v3/{repo_name}/{head_branch}/{head_sha}/{device_type}/benchmark_results{model_suffix}.json" info(f"Upload benchmark results to {s3_path}") if not dry_run: @@ -301,7 +318,9 @@ def main() -> None: repo_name, head_branch, head_sha, timestamp = get_git_metadata(args.repo) else: if not args.head_branch or not args.head_sha: - warning(f"Need to set --head-branch and --head-sha when manually setting --repo-name") + warning( + "Need to set --head-branch and --head-sha when manually setting --repo-name" + ) sys.exit(1) repo_name, head_branch, head_sha, timestamp = ( @@ -315,7 +334,7 @@ def main() -> None: metadata = get_benchmark_metadata( repo_name, head_branch, head_sha, timestamp, args.benchmark_name ) - runner = get_runner_info() + runner = get_runner_info(args.device_name, args.device_type) # Extract and aggregate the benchmark results aggregated_results = aggregate(metadata, runner, load(args.benchmark_results)) @@ -328,7 +347,7 @@ def main() -> None: head_branch, head_sha, aggregated_results, - args.device, + args.device_type, args.model, args.dry_run, ) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index 1483be8b..743f575c 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -20,12 +20,12 @@ on: A comma-separated list of models to benchmark, leave empty to run everything required: false type: string - gpus: + runners: description: | - A comma-separated list of GPUs to benchmark, i.e. h100, mi300 + A comma-separated list of runners to run the benchmark, i.e. 
h100, mi300, spr, emr required: true type: string - default: h100,mi300 + default: h100,mi300,spr pull_request: paths: - .github/workflows/vllm-benchmark.yml @@ -53,7 +53,7 @@ jobs: shell: bash env: MODELS: ${{ inputs.models || '' }} - GPUS: ${{ inputs.gpus || '' }} + RUNNERS: ${{ inputs.runners || '' }} run: | set -eux @@ -61,7 +61,7 @@ jobs: python .github/scripts/generate_vllm_benchmark_matrix.py \ --benchmark-configs-dir vllm-benchmarks/benchmarks \ --models "${MODELS}" \ - --gpus "${GPUS}" + --runners "${RUNNERS}" benchmarks: name: Run vLLM benchmarks @@ -87,6 +87,8 @@ jobs: fetch-depth: 0 - uses: actions/setup-python@v5 + # Amazon Linux fails on this step + continue-on-error: true with: python-version: '3.12' cache: 'pip' @@ -103,8 +105,8 @@ jobs: DEVICE_NAME=rocm rocm-smi else - echo "Only CUDA and ROCm benchmarks are supported at the moment" - exit 1 + DEVICE_NAME=cpu + lscpu fi echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV @@ -118,6 +120,8 @@ jobs: DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}') elif [[ "${DEVICE_NAME}" == "rocm" ]]; then DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs) + elif [[ "${DEVICE_NAME}" == "cpu" ]]; then + DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ") fi echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV @@ -126,22 +130,25 @@ jobs: run: | set -eux - if [[ "${DEVICE_NAME}" == "cuda" ]]; then - pip install -r .github/scripts/requirements.txt - elif [[ "${DEVICE_NAME}" == "rocm" ]]; then + if [[ "${DEVICE_NAME}" == "rocm" ]]; then pip install -r .github/scripts/requirements.txt \ --extra-index-url https://download.pytorch.org/whl/rocm6.3 + else + pip install -r .github/scripts/requirements.txt fi - name: Set Docker registry shell: bash run: | - if [[ "${DEVICE_NAME}" == "cuda" ]]; then - DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo - elif [[ "${DEVICE_NAME}" == "rocm" ]]; then + DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo + DOCKER_IMAGE_SUFFIX="" + if [[ "${DEVICE_NAME}" == "rocm" ]]; then DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci + elif [[ "${DEVICE_NAME}" == "cpu" ]]; then + DOCKER_IMAGE_SUFFIX=-cpu fi echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV + echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV - name: Check for last benchmark commit working-directory: vllm-benchmarks @@ -160,7 +167,7 @@ jobs: # Check if the image is there, if it doesn't then check an older one # because the commit is too recent HEAD_SHA=$(git rev-parse --verify HEAD~${i}) - DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}" + DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}" # No Docker image available yet because the commit is too recent if ! 
docker manifest inspect "${DOCKER_IMAGE}"; then @@ -206,10 +213,11 @@ jobs: popd # Set the list of benchmarks we want to cover in this runner - python .github/scripts/setup_vllm_benchmark.py \ + python3 .github/scripts/setup_vllm_benchmark.py \ --from-benchmark-configs-dir vllm-benchmarks/benchmarks \ --to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \ - --models "${MODELS}" + --models "${MODELS}" \ + --device "${DEVICE_NAME}" pushd vllm-benchmarks/vllm ls -lah .buildkite/nightly-benchmarks/tests @@ -221,14 +229,20 @@ jobs: SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 SCCACHE_REGION: us-east-1 HF_TOKEN: ${{ secrets.HF_TOKEN }} - DOCKER_IMAGE: ${{ env.DOCKER_IMAGE_PREFIX }}:${{ env.HEAD_SHA }} + DOCKER_IMAGE: ${{ env.DOCKER_IMAGE_PREFIX }}:${{ env.HEAD_SHA }}${{ env.DOCKER_IMAGE_SUFFIX }} # vLLM-related environment variables ENGINE_VERSION: v1 SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 run: | - set -x + set -eux + + if [[ "${DEVICE_NAME}" == "cpu" ]]; then + ON_CPU=1 + else + ON_CPU=0 + fi - docker run \ + container_name=$(docker run \ ${GPU_FLAG:-} \ ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \ -e SCCACHE_BUCKET \ @@ -238,13 +252,17 @@ jobs: -e HF_TOKEN \ -e ENGINE_VERSION \ -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ + -e ON_CPU="${ON_CPU}" \ --ipc=host \ --tty \ + --detach \ --security-opt seccomp=unconfined \ + --shm-size=4g \ -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ -w /tmp/workspace \ - "${DOCKER_IMAGE}" \ - bash -xc "cd vllm-benchmarks/vllm && bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh" + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" bash -c "cd vllm-benchmarks/vllm && bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh" - name: Authenticate with AWS # AWS CUDA runners already have access to the bucket via its runner IAM role @@ -266,9 +284,11 @@ jobs: sudo chown -R ${UID} "${BENCHMARK_RESULTS}" ls -lah "${BENCHMARK_RESULTS}" - python .github/scripts/upload_benchmark_results.py \ + SANITIZED_DEVICE_TYPE=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alpha:].-]/_/g") + python3 .github/scripts/upload_benchmark_results.py \ --repo vllm-benchmarks/vllm \ --benchmark-name "vLLM benchmark" \ --benchmark-results "${BENCHMARK_RESULTS}" \ - --device "${DEVICE_TYPE// /_}" \ + --device-name "${DEVICE_NAME}" \ + --device-type "${SANITIZED_DEVICE_TYPE}" \ --model "${MODELS//\//_}" diff --git a/vllm-benchmarks/benchmarks/README.md b/vllm-benchmarks/benchmarks/README.md index e06d262d..6d73caae 100644 --- a/vllm-benchmarks/benchmarks/README.md +++ b/vllm-benchmarks/benchmarks/README.md @@ -1,7 +1,3 @@ This directory mirrors the list of benchmarks from [vLLM](https://github.com/vllm-project/vllm/tree/main/.buildkite/nightly-benchmarks/tests), but it includes only models that we want to cover in PyTorch infra. - -Another note is that speculative decoding is not yet supported in v1 -with the exception of ngram, so its corresponding benchmarks is -currently removed from the list. 
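Note on the new per-platform layout below: both generate_vllm_benchmark_matrix.py and setup_vllm_benchmark.py now glob <configs-dir>/<platform>/*.json, so the CPU configs that follow are only copied into the vLLM checkout when the runner's DEVICE_NAME resolves to cpu. A minimal sketch of that step, reusing the same paths and an example model from the workflow invocation above (the model choice here is illustrative, not prescribed by the patch):

    # Copies only vllm-benchmarks/benchmarks/cpu/*.json into the vLLM checkout
    # on a CPU runner such as linux.24xl.spr-metal.
    python3 .github/scripts/setup_vllm_benchmark.py \
      --from-benchmark-configs-dir vllm-benchmarks/benchmarks \
      --to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \
      --models "meta-llama/meta-llama-3.1-8b-instruct" \
      --device cpu
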
diff --git a/vllm-benchmarks/benchmarks/cpu/latency-tests-cpu.json b/vllm-benchmarks/benchmarks/cpu/latency-tests-cpu.json new file mode 100644 index 00000000..da93fdd1 --- /dev/null +++ b/vllm-benchmarks/benchmarks/cpu/latency-tests-cpu.json @@ -0,0 +1,30 @@ +[ + { + "test_name": "latency_llama8B_tp1", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + }, + { + "test_name": "latency_llama8B_tp4", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + } +] diff --git a/vllm-benchmarks/benchmarks/cpu/serving-tests-cpu.json b/vllm-benchmarks/benchmarks/cpu/serving-tests-cpu.json new file mode 100644 index 00000000..cb6df159 --- /dev/null +++ b/vllm-benchmarks/benchmarks/cpu/serving-tests-cpu.json @@ -0,0 +1,121 @@ +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "device": "cpu", + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp2_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "device": "cpu", + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "device": "cpu", + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": 
"./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp4_random_1024_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "device": "cpu", + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 1024, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 100 + } + } +] diff --git a/vllm-benchmarks/benchmarks/cpu/throughput-tests-cpu.json b/vllm-benchmarks/benchmarks/cpu/throughput-tests-cpu.json new file mode 100644 index 00000000..f159c306 --- /dev/null +++ b/vllm-benchmarks/benchmarks/cpu/throughput-tests-cpu.json @@ -0,0 +1,32 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_llama8B_tp4", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + } +] diff --git a/vllm-benchmarks/benchmarks/latency-tests.json b/vllm-benchmarks/benchmarks/cuda/latency-tests.json similarity index 100% rename from vllm-benchmarks/benchmarks/latency-tests.json rename to vllm-benchmarks/benchmarks/cuda/latency-tests.json diff --git a/vllm-benchmarks/benchmarks/serving-tests.json b/vllm-benchmarks/benchmarks/cuda/serving-tests.json similarity index 100% rename from vllm-benchmarks/benchmarks/serving-tests.json rename to vllm-benchmarks/benchmarks/cuda/serving-tests.json diff --git a/vllm-benchmarks/benchmarks/throughput-tests.json b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json similarity index 100% rename from vllm-benchmarks/benchmarks/throughput-tests.json rename to vllm-benchmarks/benchmarks/cuda/throughput-tests.json diff --git a/vllm-benchmarks/benchmarks/rocm/latency-tests.json b/vllm-benchmarks/benchmarks/rocm/latency-tests.json new file mode 100644 index 00000000..9e9f15f8 --- /dev/null +++ b/vllm-benchmarks/benchmarks/rocm/latency-tests.json @@ -0,0 +1,54 @@ +[ + { + "test_name": "latency_llama8B_tp1", + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + }, + { + "test_name": "latency_llama70B_tp4", + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15 + } + }, + { + "test_name": "latency_mixtral8x7B_tp2", + "parameters": { + "model": 
"mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15 + } + }, + { + "test_name": "latency_llama4_scout_tp4", + "parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15, + "max_model_len": 8192 + } + }, + { + "test_name": "latency_llama4_maverick_fp8_tp8", + "parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15, + "max_model_len": 8192 + } + } +] diff --git a/vllm-benchmarks/benchmarks/rocm/serving-tests.json b/vllm-benchmarks/benchmarks/rocm/serving-tests.json new file mode 100644 index 00000000..9456bb88 --- /dev/null +++ b/vllm-benchmarks/benchmarks/rocm/serving-tests.json @@ -0,0 +1,121 @@ +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama70B_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_mixtral8x7B_tp2_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama70B_tp4_sharegpt_specdecode", + "qps_list": [2], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "disable_log_requests": "", + "tensor_parallel_size": 4, + "swap_space": 16, + "speculative_config": { + "model": "turboderp/Qwama-0.5B-Instruct", + "num_speculative_tokens": 4, + "draft_tensor_parallel_size": 1 + } + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama4_scout_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "backend": "vllm", + "dataset_name": 
"sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama4_maverick_fp8_tp8", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + } +] diff --git a/vllm-benchmarks/benchmarks/rocm/throughput-tests.json b/vllm-benchmarks/benchmarks/rocm/throughput-tests.json new file mode 100644 index 00000000..647ac2f3 --- /dev/null +++ b/vllm-benchmarks/benchmarks/rocm/throughput-tests.json @@ -0,0 +1,59 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_llama70B_tp4", + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_mixtral8x7B_tp2", + "parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_llama4_scout_tp4", + "parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm", + "max_model_len": 8192 + } + }, + { + "test_name": "throughput_llama4_maverick_fp8_tp8", + "parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm", + "max_model_len": 8192 + } + } +]