Skip to content

Commit f8168d2

Browse files
authored
[IntegTest][develop] For RHEL8, in test_slurm_memory_based_scheduling allocate more memory to avoid OOM kills in isolated regions
* For RHEL8, in isolated regions, allocate more memory to avoid OOM kills.
* Add a FIXME comment noting that this is a short-term solution.
1 parent 087916e commit f8168d2

File tree

1 file changed

+12
-2
lines changed

1 file changed

+12
-2
lines changed

tests/integration-tests/tests/schedulers/test_slurm.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -724,13 +724,15 @@ def test_slurm_custom_config_parameters(
724724
assert "4100" == slurm_commands.get_node_attribute("q1-dy-cr2-1", "Memory")
725725

726726

727-
@pytest.mark.usefixtures("region", "os", "instance", "scheduler")
727+
@pytest.mark.usefixtures("instance", "scheduler")
728728
@pytest.mark.slurm_memory_based_scheduling
729729
def test_slurm_memory_based_scheduling(
730730
pcluster_config_reader,
731731
clusters_factory,
732732
test_datadir,
733733
scheduler_commands_factory,
734+
os,
735+
region,
734736
):
735737
cluster_config = pcluster_config_reader()
736738
cluster = clusters_factory(cluster_config)
@@ -757,6 +759,8 @@ def test_slurm_memory_based_scheduling(
757759
remote_command_executor,
758760
slurm_commands,
759761
test_datadir,
762+
os,
763+
region,
760764
)
761765

762766
_test_memory_based_scheduling_with_multiple_instance_types(slurm_commands)
@@ -2448,6 +2452,8 @@ def _test_memory_based_scheduling_enabled_true(
24482452
remote_command_executor,
24492453
slurm_commands,
24502454
test_datadir,
2455+
os,
2456+
region,
24512457
):
24522458
"""Test Slurm with memory-based scheduling feature enabled"""
24532459

@@ -2517,12 +2523,16 @@ def _test_memory_based_scheduling_enabled_true(
25172523
"raise_on_error": False,
25182524
}
25192525
)
2526+
2527+
# FIXME: This is a short-term workaround to unblock the test in isolated regions; the conditions causing the OOM kills are not yet fully understood.
2528+
# For RHEL8, allocate more memory to avoid OOM kills in isolated regions
2529+
mem_allocation = "3000" if os == "rhel8" and "us-iso" in region else "2500"
25202530
job_id_2 = slurm_commands.submit_command_and_assert_job_accepted(
25212531
submit_command_args={
25222532
"nodes": 1,
25232533
"slots": 1,
25242534
"command": "srun ./a.out 2000000000",
2525-
"other_options": "--mem=2500 -w queue1-st-ondemand1-i1-1",
2535+
"other_options": f"--mem={mem_allocation} -w queue1-st-ondemand1-i1-1",
25262536
"raise_on_error": False,
25272537
}
25282538
)

0 commit comments

Comments
 (0)