diff --git a/.github/scripts/generate_vllm_benchmark_matrix.py b/.github/scripts/generate_vllm_benchmark_matrix.py index b211049a..5ad78e5c 100755 --- a/.github/scripts/generate_vllm_benchmark_matrix.py +++ b/.github/scripts/generate_vllm_benchmark_matrix.py @@ -12,10 +12,11 @@ logging.basicConfig(level=logging.INFO) # Those are H100 runners from https://github.com/pytorch-labs/pytorch-gha-infra/blob/main/multi-tenant/inventory/manual_inventory # while ROCm runner are provided by AMD -RUNNERS_MAPPING = { +TP_TO_RUNNER_MAPPING = { 1: [ "linux.aws.h100", "linux.rocm.gpu.mi300.2", # No single ROCm GPU? + "linux.24xl.spr-metal", ], # NB: There is no 2xH100 runner at the momement, so let's use the next one # in the list here which is 4xH100 @@ -26,6 +27,8 @@ 4: [ "linux.aws.h100.4", "linux.rocm.gpu.mi300.4", + # TODO (huydhn): Enable this when Intel's runners are ready + # "intel-cpu-emr", ], 8: [ "linux.aws.h100.8", @@ -33,6 +36,17 @@ ], } +# This mapping is needed to find out the platform of the runner +RUNNER_TO_PLATFORM_MAPPING = { + "linux.aws.h100": "cuda", + "linux.aws.h100.4": "cuda", + "linux.aws.h100.8": "cuda", + "linux.rocm.gpu.mi300.2": "rocm", + "linux.rocm.gpu.mi300.4": "rocm", + "linux.rocm.gpu.mi300.8": "rocm", + "linux.24xl.spr-metal": "cpu", +} + # All the different names vLLM uses to refer to their benchmark configs VLLM_BENCHMARK_CONFIGS_PARAMETER = set( [ @@ -76,10 +90,11 @@ def parse_args() -> Any: help="the comma-separated list of models to benchmark", ) parser.add_argument( - "--gpus", + "--runners", type=str, default="", - help="the comma-separated list of GPUs to benchmark", + help="the comma-separated list of runners to run the benchmark", + required=True, ) return parser.parse_args() @@ -107,60 +122,76 @@ def set_output(name: str, val: Any) -> None: def generate_benchmark_matrix( - benchmark_configs_dir: str, models: List[str], gpus: List[str] + benchmark_configs_dir: str, models: List[str], runners: List[str] ) -> Dict[str, Any]: """ Parse all the JSON files in vLLM benchmark configs directory to get the - model name and tensor parallel size (aka number of GPUs) + model name and tensor parallel size (aka number of GPUs or CPU NUMA nodes) """ - use_all_gpus = True if not gpus else False benchmark_matrix: Dict[str, Any] = { "include": [], } - selected_models = [] - for file in glob.glob(f"{benchmark_configs_dir}/*.json"): - with open(file) as f: - try: - configs = json.load(f) - except json.JSONDecodeError as e: - warning(f"Fail to load {file}: {e}") - continue - - for config in configs: - param = list(VLLM_BENCHMARK_CONFIGS_PARAMETER & set(config.keys())) - assert len(param) == 1 - - benchmark_config = config[param[0]] - if "model" not in benchmark_config: - warning(f"Model name is not set in {benchmark_config}, skipping...") - continue - model = benchmark_config["model"].lower() - - # Dedup - if model in selected_models: - continue - # and only choose the selected model: - if models and model not in models: - continue - selected_models.append(model) - - if "tensor_parallel_size" in benchmark_config: - tp = benchmark_config["tensor_parallel_size"] - elif "tp" in benchmark_config: - tp = benchmark_config["tp"] - else: - tp = 8 - assert tp in RUNNERS_MAPPING - - for runner in RUNNERS_MAPPING[tp]: - found_runner = False - for gpu in gpus: - if gpu.lower() in runner: - found_runner = True - break - - if found_runner or use_all_gpus: + platforms = set() + if not runners: + use_all_runners = True + platforms = set(v for v in 
RUNNER_TO_PLATFORM_MAPPING.values()) + else: + use_all_runners = False + for k, v in RUNNER_TO_PLATFORM_MAPPING.items(): + for r in runners: + if r.lower() in k: + platforms.add(v) + + # Gather all possible benchmarks + for platform in sorted(platforms): + selected_models = [] + for file in glob.glob(f"{benchmark_configs_dir}/{platform}/*.json"): + with open(file) as f: + try: + configs = json.load(f) + except json.JSONDecodeError as e: + warning(f"Fail to load {file}: {e}") + continue + + for config in configs: + param = list(VLLM_BENCHMARK_CONFIGS_PARAMETER & set(config.keys())) + assert len(param) == 1 + + benchmark_config = config[param[0]] + if "model" not in benchmark_config: + warning(f"Model name is not set in {benchmark_config}, skipping...") + continue + model = benchmark_config["model"].lower() + + # Dedup + if model in selected_models: + continue + # and only choose the selected model: + if models and model not in models: + continue + selected_models.append(model) + + if "tensor_parallel_size" in benchmark_config: + tp = benchmark_config["tensor_parallel_size"] + elif "tp" in benchmark_config: + tp = benchmark_config["tp"] + else: + tp = 8 + assert tp in TP_TO_RUNNER_MAPPING + + for runner in TP_TO_RUNNER_MAPPING[tp]: + # Wrong platform + if ( + runner not in RUNNER_TO_PLATFORM_MAPPING + or RUNNER_TO_PLATFORM_MAPPING[runner] != platform + ): + continue + + found_runner = any([r and r.lower() in runner for r in runners]) + if not found_runner and not use_all_runners: + continue + benchmark_matrix["include"].append( { "runner": runner, @@ -176,11 +207,11 @@ def generate_benchmark_matrix( def main() -> None: args = parse_args() models = [m.strip().lower() for m in args.models.split(",") if m.strip()] - gpus = [m.strip().lower() for m in args.gpus.split(",") if m.strip()] + runners = [m.strip().lower() for m in args.runners.split(",") if m.strip()] benchmark_matrix = generate_benchmark_matrix( args.benchmark_configs_dir, models, - gpus, + runners, ) set_output("benchmark_matrix", benchmark_matrix) diff --git a/.github/scripts/setup_vllm_benchmark.py b/.github/scripts/setup_vllm_benchmark.py index 98bfa17d..e1edc30a 100755 --- a/.github/scripts/setup_vllm_benchmark.py +++ b/.github/scripts/setup_vllm_benchmark.py @@ -61,17 +61,27 @@ def parse_args() -> Any: help="the list of models to benchmark", required=True, ) + parser.add_argument( + "--device", + type=str, + default="", + help="device for the runner", + required=True, + ) return parser.parse_args() def setup_benchmark_configs( - from_benchmark_configs_dir: str, to_benchmark_configs_dir: str, models: List[str] + from_benchmark_configs_dir: str, + to_benchmark_configs_dir: str, + models: List[str], + device: str, ) -> None: """ Setup the benchmark configs to run on this runner """ - for file in glob.glob(f"{from_benchmark_configs_dir}/*.json"): + for file in glob.glob(f"{from_benchmark_configs_dir}/{device}/*.json"): filename = os.path.basename(file) benchmark_configs = [] @@ -108,6 +118,7 @@ def main() -> None: args.from_benchmark_configs_dir, args.to_benchmark_configs_dir, args.models.split(","), + args.device, ) diff --git a/.github/scripts/test_generate_vllm_benchmark_matrix.py b/.github/scripts/test_generate_vllm_benchmark_matrix.py new file mode 100644 index 00000000..73c7672c --- /dev/null +++ b/.github/scripts/test_generate_vllm_benchmark_matrix.py @@ -0,0 +1,420 @@ +import os +import json + +from expecttest import assert_expected_inline +from generate_vllm_benchmark_matrix import generate_benchmark_matrix + 
+BENCHMARK_CONFIG_DIRS = os.path.join( + os.path.dirname(__file__), "..", "..", "vllm-benchmarks", "benchmarks" +) + + +def test_generate_benchmark_matrix(): + # All combinations, no duplication + models = [] + runners = [] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.24xl.spr-metal", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/meta-llama-3.1-70b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/llama-4-scout-17b-16e-instruct" + }, + { + "runner": "linux.aws.h100.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.4", + "models": "meta-llama/meta-llama-3.1-70b-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + }, + { + "runner": "linux.rocm.gpu.mi300.4", + "models": "meta-llama/llama-4-scout-17b-16e-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + } + ] +}""", + ) + + # Select a model + models = ["meta-llama/meta-llama-3.1-8b-instruct"] + runners = [] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.24xl.spr-metal", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + } + ] +}""", + ) + + # Select multiple models + models = [ + "meta-llama/meta-llama-3.1-8b-instruct", + "meta-llama/llama-4-maverick-17b-128e-instruct-fp8", + ] + runners = [] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.24xl.spr-metal", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + } + ] +}""", + ) + + # Select non-existing models + models = ["meta-llama/meta-llama-3.1-8b-instruct", "do-not-exist"] + runners = [] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.24xl.spr-metal", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + } + ] +}""", + ) + + # Select non-existing models + models = 
["meta-llama/meta-llama-3.1-8b-instruct", ""] + runners = [] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.24xl.spr-metal", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + } + ] +}""", + ) + + # Select a runner + models = [] + runners = ["h100"] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/meta-llama-3.1-70b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/llama-4-scout-17b-16e-instruct" + }, + { + "runner": "linux.aws.h100.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + } + ] +}""", + ) + + # Select multiple runners + models = [] + runners = ["h100", "spr"] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.24xl.spr-metal", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/meta-llama-3.1-70b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/llama-4-scout-17b-16e-instruct" + }, + { + "runner": "linux.aws.h100.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + } + ] +}""", + ) + + # Select non-existing runners + models = [] + runners = ["h100", "do-not-exist"] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/meta-llama-3.1-70b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/llama-4-scout-17b-16e-instruct" + }, + { + "runner": "linux.aws.h100.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + } + ] +}""", + ) + + # Select non-existing runners + models = [] + runners = ["h100", ""] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/meta-llama-3.1-70b-instruct" + }, + { + "runner": "linux.aws.h100.4", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + }, + { + "runner": "linux.aws.h100.4", + "models": "meta-llama/llama-4-scout-17b-16e-instruct" + }, + { + "runner": "linux.aws.h100.8", + "models": "meta-llama/llama-4-maverick-17b-128e-instruct-fp8" + } + ] +}""", + ) + + # 
Select a model and a runner + models = ["meta-llama/meta-llama-3.1-8b-instruct"] + runners = ["h100"] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.aws.h100", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + } + ] +}""", + ) + + # Select multiple models and runners + models = [ + "meta-llama/meta-llama-3.1-8b-instruct", + "mistralai/mixtral-8x7b-instruct-v0.1", + ] + runners = ["rocm", "spr"] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.24xl.spr-metal", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + }, + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "mistralai/mixtral-8x7b-instruct-v0.1" + } + ] +}""", + ) + + # Select non-existing models and runners + models = ["meta-llama/meta-llama-3.1-8b-instruct", "do-not-exist"] + runners = ["rocm", "do-not-exist"] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + } + ] +}""", + ) + + # Select non-existing models and runners + models = ["meta-llama/meta-llama-3.1-8b-instruct", ""] + runners = ["rocm", ""] + output = json.dumps( + generate_benchmark_matrix(BENCHMARK_CONFIG_DIRS, models, runners), indent=2 + ) + assert_expected_inline( + output, + """\ +{ + "include": [ + { + "runner": "linux.rocm.gpu.mi300.2", + "models": "meta-llama/meta-llama-3.1-8b-instruct" + } + ] +}""", + ) diff --git a/.github/scripts/upload_benchmark_results.py b/.github/scripts/upload_benchmark_results.py index 751461bc..39c5b11a 100755 --- a/.github/scripts/upload_benchmark_results.py +++ b/.github/scripts/upload_benchmark_results.py @@ -88,10 +88,16 @@ def parse_args() -> Any: # Device info parser.add_argument( - "--device", + "--device-name", type=str, required=True, - help="the name of the GPU device coming from nvidia-smi or amd-smi", + help="the name of the benchmark device", + ) + parser.add_argument( + "--device-type", + type=str, + required=True, + help="the type of the benchmark device coming from nvidia-smi, amd-smi, or lscpu", ) # Optional suffix @@ -112,7 +118,9 @@ def get_git_metadata(repo_dir: str) -> Tuple[str, str]: repo = Repo(repo_dir) # Git metadata, an example remote URL is https://github.com/vllm-project/vllm.git # and we want the vllm-project/vllm part - repo_name = repo.remotes.origin.url.split(".git")[0].replace("https://github.com/", "") + repo_name = repo.remotes.origin.url.split(".git")[0].replace( + "https://github.com/", "" + ) hexsha = repo.head.object.hexsha committed_date = repo.head.object.committed_date @@ -144,25 +152,34 @@ def get_benchmark_metadata( } -def get_runner_info() -> Dict[str, Any]: - if torch.cuda.is_available() and torch.version.hip: - name = "rocm" - elif torch.cuda.is_available() and torch.version.cuda: - name = "cuda" +def get_runner_info(device_name: str, device_type: str) -> Dict[str, Any]: + if torch.cuda.is_available(): + if torch.version.hip: + name = "rocm" + elif torch.version.cuda: + name = "cuda" + type = torch.cuda.get_device_name() + gpu_info = 
torch.cuda.get_device_name() + gpu_count = torch.cuda.device_count() + avail_gpu_mem_in_gb = int( + torch.cuda.get_device_properties(0).total_memory / (1024 * 1024 * 1024) + ) else: - name = "unknown" + name = device_name + type = device_type + gpu_info = "" + gpu_count = 0 + avail_gpu_mem_in_gb = 0 return { "name": name, - "type": torch.cuda.get_device_name(), + "type": type, "cpu_info": platform.processor(), "cpu_count": psutil.cpu_count(), "avail_mem_in_gb": int(psutil.virtual_memory().total / (1024 * 1024 * 1024)), - "gpu_info": torch.cuda.get_device_name(), - "gpu_count": torch.cuda.device_count(), - "avail_gpu_mem_in_gb": int( - torch.cuda.get_device_properties(0).total_memory / (1024 * 1024 * 1024) - ), + "gpu_info": gpu_info, + "gpu_count": gpu_count, + "avail_gpu_mem_in_gb": avail_gpu_mem_in_gb, "extra_info": { "hostname": socket.gethostname(), }, @@ -270,12 +287,12 @@ def upload( head_branch: str, head_sha: str, aggregated_results: List[Dict[str, Any]], - device: str, + device_type: str, model: str, dry_run: bool = True, ) -> None: model_suffix = f"_{model}" if model else "" - s3_path = f"v3/{repo_name}/{head_branch}/{head_sha}/{device}/benchmark_results{model_suffix}.json" + s3_path = f"v3/{repo_name}/{head_branch}/{head_sha}/{device_type}/benchmark_results{model_suffix}.json" info(f"Upload benchmark results to {s3_path}") if not dry_run: @@ -301,7 +318,9 @@ def main() -> None: repo_name, head_branch, head_sha, timestamp = get_git_metadata(args.repo) else: if not args.head_branch or not args.head_sha: - warning(f"Need to set --head-branch and --head-sha when manually setting --repo-name") + warning( + "Need to set --head-branch and --head-sha when manually setting --repo-name" + ) sys.exit(1) repo_name, head_branch, head_sha, timestamp = ( @@ -315,7 +334,7 @@ def main() -> None: metadata = get_benchmark_metadata( repo_name, head_branch, head_sha, timestamp, args.benchmark_name ) - runner = get_runner_info() + runner = get_runner_info(args.device_name, args.device_type) # Extract and aggregate the benchmark results aggregated_results = aggregate(metadata, runner, load(args.benchmark_results)) @@ -328,7 +347,7 @@ def main() -> None: head_branch, head_sha, aggregated_results, - args.device, + args.device_type, args.model, args.dry_run, ) diff --git a/.github/workflows/vllm-benchmark.yml b/.github/workflows/vllm-benchmark.yml index 1483be8b..743f575c 100644 --- a/.github/workflows/vllm-benchmark.yml +++ b/.github/workflows/vllm-benchmark.yml @@ -20,12 +20,12 @@ on: A comma-separated list of models to benchmark, leave empty to run everything required: false type: string - gpus: + runners: description: | - A comma-separated list of GPUs to benchmark, i.e. h100, mi300 + A comma-separated list of runners to run the benchmark, i.e. 
h100, mi300, spr, emr required: true type: string - default: h100,mi300 + default: h100,mi300,spr pull_request: paths: - .github/workflows/vllm-benchmark.yml @@ -53,7 +53,7 @@ jobs: shell: bash env: MODELS: ${{ inputs.models || '' }} - GPUS: ${{ inputs.gpus || '' }} + RUNNERS: ${{ inputs.runners || '' }} run: | set -eux @@ -61,7 +61,7 @@ jobs: python .github/scripts/generate_vllm_benchmark_matrix.py \ --benchmark-configs-dir vllm-benchmarks/benchmarks \ --models "${MODELS}" \ - --gpus "${GPUS}" + --runners "${RUNNERS}" benchmarks: name: Run vLLM benchmarks @@ -87,6 +87,8 @@ jobs: fetch-depth: 0 - uses: actions/setup-python@v5 + # Amazon Linux fails on this step + continue-on-error: true with: python-version: '3.12' cache: 'pip' @@ -103,8 +105,8 @@ jobs: DEVICE_NAME=rocm rocm-smi else - echo "Only CUDA and ROCm benchmarks are supported at the moment" - exit 1 + DEVICE_NAME=cpu + lscpu fi echo "DEVICE_NAME=$DEVICE_NAME" >> $GITHUB_ENV @@ -118,6 +120,8 @@ jobs: DEVICE_TYPE=$(nvidia-smi -i 0 --query-gpu=name --format=csv,noheader | awk '{print $2}') elif [[ "${DEVICE_NAME}" == "rocm" ]]; then DEVICE_TYPE=$(rocminfo | grep "Marketing Name" | tail -n1 | awk -F':' '{print $2}' | xargs) + elif [[ "${DEVICE_NAME}" == "cpu" ]]; then + DEVICE_TYPE=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1' | cut -f 2 -d " ") fi echo "DEVICE_TYPE=$DEVICE_TYPE" >> $GITHUB_ENV @@ -126,22 +130,25 @@ jobs: run: | set -eux - if [[ "${DEVICE_NAME}" == "cuda" ]]; then - pip install -r .github/scripts/requirements.txt - elif [[ "${DEVICE_NAME}" == "rocm" ]]; then + if [[ "${DEVICE_NAME}" == "rocm" ]]; then pip install -r .github/scripts/requirements.txt \ --extra-index-url https://download.pytorch.org/whl/rocm6.3 + else + pip install -r .github/scripts/requirements.txt fi - name: Set Docker registry shell: bash run: | - if [[ "${DEVICE_NAME}" == "cuda" ]]; then - DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo - elif [[ "${DEVICE_NAME}" == "rocm" ]]; then + DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo + DOCKER_IMAGE_SUFFIX="" + if [[ "${DEVICE_NAME}" == "rocm" ]]; then DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci + elif [[ "${DEVICE_NAME}" == "cpu" ]]; then + DOCKER_IMAGE_SUFFIX=-cpu fi echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> $GITHUB_ENV + echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> $GITHUB_ENV - name: Check for last benchmark commit working-directory: vllm-benchmarks @@ -160,7 +167,7 @@ jobs: # Check if the image is there, if it doesn't then check an older one # because the commit is too recent HEAD_SHA=$(git rev-parse --verify HEAD~${i}) - DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}" + DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}" # No Docker image available yet because the commit is too recent if ! 
docker manifest inspect "${DOCKER_IMAGE}"; then @@ -206,10 +213,11 @@ jobs: popd # Set the list of benchmarks we want to cover in this runner - python .github/scripts/setup_vllm_benchmark.py \ + python3 .github/scripts/setup_vllm_benchmark.py \ --from-benchmark-configs-dir vllm-benchmarks/benchmarks \ --to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \ - --models "${MODELS}" + --models "${MODELS}" \ + --device "${DEVICE_NAME}" pushd vllm-benchmarks/vllm ls -lah .buildkite/nightly-benchmarks/tests @@ -221,14 +229,20 @@ jobs: SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 SCCACHE_REGION: us-east-1 HF_TOKEN: ${{ secrets.HF_TOKEN }} - DOCKER_IMAGE: ${{ env.DOCKER_IMAGE_PREFIX }}:${{ env.HEAD_SHA }} + DOCKER_IMAGE: ${{ env.DOCKER_IMAGE_PREFIX }}:${{ env.HEAD_SHA }}${{ env.DOCKER_IMAGE_SUFFIX }} # vLLM-related environment variables ENGINE_VERSION: v1 SAVE_TO_PYTORCH_BENCHMARK_FORMAT: 1 run: | - set -x + set -eux + + if [[ "${DEVICE_NAME}" == "cpu" ]]; then + ON_CPU=1 + else + ON_CPU=0 + fi - docker run \ + container_name=$(docker run \ ${GPU_FLAG:-} \ ${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \ -e SCCACHE_BUCKET \ @@ -238,13 +252,17 @@ jobs: -e HF_TOKEN \ -e ENGINE_VERSION \ -e SAVE_TO_PYTORCH_BENCHMARK_FORMAT \ + -e ON_CPU="${ON_CPU}" \ --ipc=host \ --tty \ + --detach \ --security-opt seccomp=unconfined \ + --shm-size=4g \ -v "${GITHUB_WORKSPACE}:/tmp/workspace" \ -w /tmp/workspace \ - "${DOCKER_IMAGE}" \ - bash -xc "cd vllm-benchmarks/vllm && bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh" + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" bash -c "cd vllm-benchmarks/vllm && bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh" - name: Authenticate with AWS # AWS CUDA runners already have access to the bucket via its runner IAM role @@ -266,9 +284,11 @@ jobs: sudo chown -R ${UID} "${BENCHMARK_RESULTS}" ls -lah "${BENCHMARK_RESULTS}" - python .github/scripts/upload_benchmark_results.py \ + SANITIZED_DEVICE_TYPE=$(echo "${DEVICE_TYPE// /_}" | sed "s/[^[:alpha:].-]/_/g") + python3 .github/scripts/upload_benchmark_results.py \ --repo vllm-benchmarks/vllm \ --benchmark-name "vLLM benchmark" \ --benchmark-results "${BENCHMARK_RESULTS}" \ - --device "${DEVICE_TYPE// /_}" \ + --device-name "${DEVICE_NAME}" \ + --device-type "${SANITIZED_DEVICE_TYPE}" \ --model "${MODELS//\//_}" diff --git a/vllm-benchmarks/benchmarks/README.md b/vllm-benchmarks/benchmarks/README.md index e06d262d..6d73caae 100644 --- a/vllm-benchmarks/benchmarks/README.md +++ b/vllm-benchmarks/benchmarks/README.md @@ -1,7 +1,3 @@ This directory mirrors the list of benchmarks from [vLLM](https://github.com/vllm-project/vllm/tree/main/.buildkite/nightly-benchmarks/tests), but it includes only models that we want to cover in PyTorch infra. - -Another note is that speculative decoding is not yet supported in v1 -with the exception of ngram, so its corresponding benchmarks is -currently removed from the list. 
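Note on the new per-platform layout below: both generate_vllm_benchmark_matrix.py and setup_vllm_benchmark.py now glob <configs-dir>/<platform>/*.json, so the CPU configs that follow are only copied into the vLLM checkout when the runner's DEVICE_NAME resolves to cpu. A minimal sketch of that step, reusing the same paths and an example model from the workflow invocation above (the model choice here is illustrative, not prescribed by the patch):

    # Copies only vllm-benchmarks/benchmarks/cpu/*.json into the vLLM checkout
    # on a CPU runner such as linux.24xl.spr-metal.
    python3 .github/scripts/setup_vllm_benchmark.py \
      --from-benchmark-configs-dir vllm-benchmarks/benchmarks \
      --to-benchmark-configs-dir vllm-benchmarks/vllm/.buildkite/nightly-benchmarks/tests \
      --models "meta-llama/meta-llama-3.1-8b-instruct" \
      --device cpu
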
diff --git a/vllm-benchmarks/benchmarks/cpu/latency-tests-cpu.json b/vllm-benchmarks/benchmarks/cpu/latency-tests-cpu.json new file mode 100644 index 00000000..da93fdd1 --- /dev/null +++ b/vllm-benchmarks/benchmarks/cpu/latency-tests-cpu.json @@ -0,0 +1,30 @@ +[ + { + "test_name": "latency_llama8B_tp1", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + }, + { + "test_name": "latency_llama8B_tp4", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + } +] diff --git a/vllm-benchmarks/benchmarks/cpu/serving-tests-cpu.json b/vllm-benchmarks/benchmarks/cpu/serving-tests-cpu.json new file mode 100644 index 00000000..cb6df159 --- /dev/null +++ b/vllm-benchmarks/benchmarks/cpu/serving-tests-cpu.json @@ -0,0 +1,121 @@ +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "device": "cpu", + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp2_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 2, + "device": "cpu", + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "device": "cpu", + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": 
"./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama8B_tp4_random_1024_128", + "qps_list": [1, 4, 16, "inf"], + "server_environment_variables": { + "VLLM_RPC_TIMEOUT": 100000, + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "device": "cpu", + "dtype": "bfloat16", + "distributed_executor_backend": "mp", + "block_size": 128, + "trust_remote_code": "", + "enable_chunked_prefill": "", + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "random", + "random-input-len": 1024, + "random-output-len": 128, + "ignore-eos": "", + "num_prompts": 100 + } + } +] diff --git a/vllm-benchmarks/benchmarks/cpu/throughput-tests-cpu.json b/vllm-benchmarks/benchmarks/cpu/throughput-tests-cpu.json new file mode 100644 index 00000000..f159c306 --- /dev/null +++ b/vllm-benchmarks/benchmarks/cpu/throughput-tests-cpu.json @@ -0,0 +1,32 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_llama8B_tp4", + "environment_variables": { + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "VLLM_CPU_KVCACHE_SPACE": 40 + }, + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + } +] diff --git a/vllm-benchmarks/benchmarks/latency-tests.json b/vllm-benchmarks/benchmarks/cuda/latency-tests.json similarity index 100% rename from vllm-benchmarks/benchmarks/latency-tests.json rename to vllm-benchmarks/benchmarks/cuda/latency-tests.json diff --git a/vllm-benchmarks/benchmarks/serving-tests.json b/vllm-benchmarks/benchmarks/cuda/serving-tests.json similarity index 100% rename from vllm-benchmarks/benchmarks/serving-tests.json rename to vllm-benchmarks/benchmarks/cuda/serving-tests.json diff --git a/vllm-benchmarks/benchmarks/throughput-tests.json b/vllm-benchmarks/benchmarks/cuda/throughput-tests.json similarity index 100% rename from vllm-benchmarks/benchmarks/throughput-tests.json rename to vllm-benchmarks/benchmarks/cuda/throughput-tests.json diff --git a/vllm-benchmarks/benchmarks/rocm/latency-tests.json b/vllm-benchmarks/benchmarks/rocm/latency-tests.json new file mode 100644 index 00000000..9e9f15f8 --- /dev/null +++ b/vllm-benchmarks/benchmarks/rocm/latency-tests.json @@ -0,0 +1,54 @@ +[ + { + "test_name": "latency_llama8B_tp1", + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15 + } + }, + { + "test_name": "latency_llama70B_tp4", + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15 + } + }, + { + "test_name": "latency_mixtral8x7B_tp2", + "parameters": { + "model": 
"mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "load_format": "dummy", + "num-iters-warmup": 5, + "num-iters": 15 + } + }, + { + "test_name": "latency_llama4_scout_tp4", + "parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15, + "max_model_len": 8192 + } + }, + { + "test_name": "latency_llama4_maverick_fp8_tp8", + "parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "load_format": "dummy", + "num_iters_warmup": 5, + "num_iters": 15, + "max_model_len": 8192 + } + } +] diff --git a/vllm-benchmarks/benchmarks/rocm/serving-tests.json b/vllm-benchmarks/benchmarks/rocm/serving-tests.json new file mode 100644 index 00000000..9456bb88 --- /dev/null +++ b/vllm-benchmarks/benchmarks/rocm/serving-tests.json @@ -0,0 +1,121 @@ +[ + { + "test_name": "serving_llama8B_tp1_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama70B_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_mixtral8x7B_tp2_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy" + }, + "client_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama70B_tp4_sharegpt_specdecode", + "qps_list": [2], + "server_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "disable_log_requests": "", + "tensor_parallel_size": 4, + "swap_space": 16, + "speculative_config": { + "model": "turboderp/Qwama-0.5B-Instruct", + "num_speculative_tokens": 4, + "draft_tensor_parallel_size": 1 + } + }, + "client_parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama4_scout_tp4_sharegpt", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "tensor_parallel_size": 4, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "backend": "vllm", + "dataset_name": 
"sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + }, + { + "test_name": "serving_llama4_maverick_fp8_tp8", + "qps_list": [1, 4, 16, "inf"], + "server_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "swap_space": 16, + "disable_log_stats": "", + "disable_log_requests": "", + "load_format": "dummy", + "max_model_len": 8192 + }, + "client_parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 + } + } +] diff --git a/vllm-benchmarks/benchmarks/rocm/throughput-tests.json b/vllm-benchmarks/benchmarks/rocm/throughput-tests.json new file mode 100644 index 00000000..647ac2f3 --- /dev/null +++ b/vllm-benchmarks/benchmarks/rocm/throughput-tests.json @@ -0,0 +1,59 @@ +[ + { + "test_name": "throughput_llama8B_tp1", + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-8B-Instruct", + "tensor_parallel_size": 1, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_llama70B_tp4", + "parameters": { + "model": "meta-llama/Meta-Llama-3.1-70B-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_mixtral8x7B_tp2", + "parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm" + } + }, + { + "test_name": "throughput_llama4_scout_tp4", + "parameters": { + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "tensor_parallel_size": 4, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm", + "max_model_len": 8192 + } + }, + { + "test_name": "throughput_llama4_maverick_fp8_tp8", + "parameters": { + "model": "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + "tensor_parallel_size": 8, + "load_format": "dummy", + "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "backend": "vllm", + "max_model_len": 8192 + } + } +]