Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 57 additions & 12 deletions build/rocm/run_multi_gpu.sh
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -52,19 +52,64 @@ run_tests() {
# Create the log directory if it doesn't exist.
mkdir -p "$LOG_DIR"

python3 -m pytest \
--html="${LOG_DIR}/multi_gpu_pmap_test_log.html" \
--json-report \
--json-report-file="${LOG_DIR}/multi_gpu_pmap_test_log.json" \
--reruns 3 \
tests/pmap_test.py
# Multi-GPU test files
MULTI_GPU_TESTS=(
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I presume this variable must have the same definition in both runners, right?
If so, perhaps you could refactor it into a separate script that both runners source to get the same value?
That would make the tests simpler and more robust to maintain.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, the second runner is actually a Python script... That makes things more annoying, but it is still doable. I fear the definitions could easily go out of sync if left as is. Even now it's hard to verify that they are in sync.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any thoughts on this, @gulsumgudukbay ?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

working on it

"tests/multiprocess_gpu_test.py"
"tests/debug_info_test.py"
"tests/checkify_test.py"
"tests/mosaic/gpu_test.py"
"tests/random_test.py"
"tests/jax_jit_test.py"
"tests/mesh_utils_test.py"
"tests/pjit_test.py"
"tests/linalg_sharding_test.py"
"tests/multi_device_test.py"
"tests/distributed_test.py"
"tests/shard_alike_test.py"
"tests/api_test.py"
"tests/ragged_collective_test.py"
"tests/batching_test.py"
"tests/scaled_matmul_stablehlo_test.py"
"tests/export_harnesses_multi_platform_test.py"
"tests/pickle_test.py"
"tests/roofline_test.py"
"tests/profiler_test.py"
"tests/error_check_test.py"
"tests/debug_nans_test.py"
"tests/shard_map_test.py"
"tests/colocated_python_test.py"
"tests/cudnn_fusion_test.py"
"tests/compilation_cache_test.py"
"tests/export_back_compat_test.py"
"tests/pgle_test.py"
"tests/ffi_test.py"
"tests/lax_control_flow_test.py"
"tests/fused_attention_stablehlo_test.py"
"tests/layout_test.py"
"tests/pmap_test.py"
"tests/aot_test.py"
"tests/mock_gpu_topology_test.py"
"tests/ann_test.py"
"tests/debugging_primitives_test.py"
"tests/array_test.py"
"tests/export_test.py"
"tests/memories_test.py"
"tests/debugger_test.py"
"tests/python_callback_test.py"
)

python3 -m pytest \
--html="${LOG_DIR}/multi_gpu_multi_device_test_log.html" \
--json-report \
--json-report-file="${LOG_DIR}/multi_gpu_multi_device_test_log.json" \
--reruns 3 \
tests/multi_device_test.py
# Run each multi-GPU test
for test_file in "${MULTI_GPU_TESTS[@]}"; do
test_name=$(basename "$test_file" .py)
echo "Running multi-GPU test: $test_file"

python3 -m pytest \
--html="${LOG_DIR}/multi_gpu_${test_name}_log.html" \
--json-report \
--json-report-file="${LOG_DIR}/multi_gpu_${test_name}_log.json" \
--reruns 3 \
"$test_file"
done

# Merge individual HTML reports into one.
python3 -m pytest_html_merger \
Expand Down
66 changes: 63 additions & 3 deletions build/rocm/run_single_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,52 @@
LAST_CODE = 0
base_dir = "./logs"

# Multi-GPU test modules that collect_testmodules() leaves out of single-GPU
# runs.
#
# Each entry is compared for exact equality with os.path.relpath(<collected
# path>), so it must be spelled exactly as that call returns it: relative to
# the current working directory, with "/" separators.
# NOTE(review): presumably the script is launched from the repository root so
# that collected paths resolve to "tests/..." -- confirm against the CI
# invocation.
#
# NOTE(review): build/rocm/run_multi_gpu.sh defines a bash array of the same
# name with the same entries (currently in the same order). Nothing enforces
# that the two stay identical, so any addition or removal has to be made in
# both files.
MULTI_GPU_TESTS = {
"tests/multiprocess_gpu_test.py",
"tests/debug_info_test.py",
"tests/checkify_test.py",
"tests/mosaic/gpu_test.py",
"tests/random_test.py",
"tests/jax_jit_test.py",
"tests/mesh_utils_test.py",
"tests/pjit_test.py",
"tests/linalg_sharding_test.py",
"tests/multi_device_test.py",
"tests/distributed_test.py",
"tests/shard_alike_test.py",
"tests/api_test.py",
"tests/ragged_collective_test.py",
"tests/batching_test.py",
"tests/scaled_matmul_stablehlo_test.py",
"tests/export_harnesses_multi_platform_test.py",
"tests/pickle_test.py",
"tests/roofline_test.py",
"tests/profiler_test.py",
"tests/error_check_test.py",
"tests/debug_nans_test.py",
"tests/shard_map_test.py",
"tests/colocated_python_test.py",
"tests/cudnn_fusion_test.py",
"tests/compilation_cache_test.py",
"tests/export_back_compat_test.py",
"tests/pgle_test.py",
"tests/ffi_test.py",
"tests/lax_control_flow_test.py",
"tests/fused_attention_stablehlo_test.py",
"tests/layout_test.py",
"tests/pmap_test.py",
"tests/aot_test.py",
"tests/mock_gpu_topology_test.py",
"tests/ann_test.py",
"tests/debugging_primitives_test.py",
"tests/array_test.py",
"tests/export_test.py",
"tests/memories_test.py",
"tests/debugger_test.py",
"tests/python_callback_test.py"
}


def extract_filename(path):
base_name = os.path.basename(path)
Expand Down Expand Up @@ -111,10 +157,24 @@ def collect_testmodules():
exit(return_code)
print("---------- collected test modules ----------")
test_files = parse_test_log(log_file)
print("Found %d test modules." % (len(test_files)))

# Filter out multi-GPU tests
filtered_test_files = set()
excluded_count = 0
for test_file in test_files:
# Convert absolute path to relative path for comparison
relative_path = os.path.relpath(test_file)
if relative_path not in MULTI_GPU_TESTS:
filtered_test_files.add(test_file)
else:
excluded_count += 1
print(f"Excluding multi-GPU test: {relative_path}")

print("Found %d test modules." % (len(filtered_test_files)))
print("Excluded %d multi-GPU test modules." % excluded_count)
print("--------------------------------------------")
print("\n".join(test_files))
return test_files
print("\n".join(filtered_test_files))
return filtered_test_files


def run_test(testmodule, gpu_tokens, continue_on_fail):
Expand Down