Skip to content

Commit 9be84e0

Browse files
authored
Reduce fsspec usage in tests (#776)
* Reduce use of fsspec in tests. Remove the code that uses fsspec to delete data from cloud buckets. This is not needed because the cloud bucket is configured to automatically delete all data more than 24 hours old. * Add references to obstore as well as fsspec in docs.
1 parent 0ebb206 commit 9be84e0

File tree

6 files changed

+117
-153
lines changed

6 files changed

+117
-153
lines changed

cubed/core/array.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -375,7 +375,7 @@ def measure_reserved_mem(
375375
reports peak memory, such as Lithops or Modal.
376376
377377
work_dir : str or None, optional
378-
The directory path (specified as an fsspec URL) used for storing intermediate data.
378+
The directory path (specified as an fsspec or obstore URL) used for storing intermediate data.
379379
This is required when using a cloud runtime.
380380
381381
kwargs

cubed/spec.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@ def __init__(
2929
Parameters
3030
----------
3131
work_dir : str or None
32-
The directory path (specified as an fsspec URL) used for storing intermediate data.
32+
The directory path (specified as an fsspec or obstore URL) used for storing intermediate data.
3333
allowed_mem : int or str, optional
3434
The total memory available to a worker for running a task, in bytes.
3535
@@ -68,7 +68,7 @@ def __init__(
6868

6969
@property
7070
def work_dir(self) -> Optional[str]:
71-
"""The directory path (specified as an fsspec URL) used for storing intermediate data."""
71+
"""The directory path (specified as an fsspec or obstore URL) used for storing intermediate data."""
7272
return self._work_dir
7373

7474
@property

cubed/tests/runtime/test_modal.py

Lines changed: 59 additions & 81 deletions
Original file line number | Diff line number | Diff line change
@@ -6,13 +6,11 @@
66

77
import asyncio
88

9-
import fsspec
10-
119
from cubed.runtime.asyncio import async_map_unordered
1210
from cubed.runtime.executors.modal import modal_create_futures_func
1311
from cubed.tests.runtime.utils import check_invocation_counts, deterministic_failure
1412

15-
tmp_path = "s3://cubed-unittest/map_unordered"
13+
BASE_PATH = "s3://cubed-unittest/map_unordered"
1614
region = "us-east-1" # S3 region for above bucket
1715

1816
app = modal.App("cubed-test-app", include_source=True)
@@ -104,24 +102,20 @@ async def run_test(app_function, input, use_backups=False, batch_size=None, **kw
104102
# fmt: on
105103
@pytest.mark.parametrize("use_backups", [False, True])
106104
@pytest.mark.cloud
107-
def test_success(timing_map, n_tasks, retries, use_backups):
108-
try:
109-
outputs = asyncio.run(
110-
run_test(
111-
app_function=deterministic_failure_modal,
112-
input=range(n_tasks),
113-
use_backups=use_backups,
114-
path=tmp_path,
115-
timing_map=timing_map,
116-
)
105+
def test_success(tmp_path, timing_map, n_tasks, retries, use_backups):
106+
path = f"{BASE_PATH}/{tmp_path.name}"
107+
outputs = asyncio.run(
108+
run_test(
109+
app_function=deterministic_failure_modal,
110+
input=range(n_tasks),
111+
use_backups=use_backups,
112+
path=path,
113+
timing_map=timing_map,
117114
)
115+
)
118116

119-
assert outputs == set(range(n_tasks))
120-
check_invocation_counts(tmp_path, timing_map, n_tasks, retries)
121-
122-
finally:
123-
fs = fsspec.open(tmp_path).fs
124-
fs.rm(tmp_path, recursive=True)
117+
assert outputs == set(range(n_tasks))
118+
check_invocation_counts(path, timing_map, n_tasks, retries)
125119

126120

127121
# fmt: off
@@ -135,24 +129,20 @@ def test_success(timing_map, n_tasks, retries, use_backups):
135129
# fmt: on
136130
@pytest.mark.parametrize("use_backups", [False, True])
137131
@pytest.mark.cloud
138-
def test_failure(timing_map, n_tasks, retries, use_backups):
139-
try:
140-
with pytest.raises(RuntimeError):
141-
asyncio.run(
142-
run_test(
143-
app_function=deterministic_failure_modal,
144-
input=range(n_tasks),
145-
use_backups=use_backups,
146-
path=tmp_path,
147-
timing_map=timing_map,
148-
)
132+
def test_failure(tmp_path, timing_map, n_tasks, retries, use_backups):
133+
path = f"{BASE_PATH}/{tmp_path.name}"
134+
with pytest.raises(RuntimeError):
135+
asyncio.run(
136+
run_test(
137+
app_function=deterministic_failure_modal,
138+
input=range(n_tasks),
139+
use_backups=use_backups,
140+
path=path,
141+
timing_map=timing_map,
149142
)
143+
)
150144

151-
check_invocation_counts(tmp_path, timing_map, n_tasks, retries)
152-
153-
finally:
154-
fs = fsspec.open(tmp_path).fs
155-
fs.rm(tmp_path, recursive=True)
145+
check_invocation_counts(path, timing_map, n_tasks, retries)
156146

157147

158148
# fmt: off
@@ -165,24 +155,20 @@ def test_failure(timing_map, n_tasks, retries, use_backups):
165155
# fmt: on
166156
@pytest.mark.parametrize("use_backups", [False, True])
167157
@pytest.mark.cloud
168-
def test_large_number_of_tasks(timing_map, n_tasks, retries, use_backups):
169-
try:
170-
outputs = asyncio.run(
171-
run_test(
172-
app_function=deterministic_failure_modal,
173-
input=range(n_tasks),
174-
use_backups=use_backups,
175-
path=tmp_path,
176-
timing_map=timing_map
177-
)
158+
def test_large_number_of_tasks(tmp_path, timing_map, n_tasks, retries, use_backups):
159+
path = f"{BASE_PATH}/{tmp_path.name}"
160+
outputs = asyncio.run(
161+
run_test(
162+
app_function=deterministic_failure_modal,
163+
input=range(n_tasks),
164+
use_backups=use_backups,
165+
path=path,
166+
timing_map=timing_map
178167
)
168+
)
179169

180-
assert outputs == set(range(n_tasks))
181-
check_invocation_counts(tmp_path, timing_map, n_tasks, retries)
182-
183-
finally:
184-
fs = fsspec.open(tmp_path).fs
185-
fs.rm(tmp_path, recursive=True)
170+
assert outputs == set(range(n_tasks))
171+
check_invocation_counts(path, timing_map, n_tasks, retries)
186172

187173

188174
# fmt: off
@@ -195,43 +181,35 @@ def test_large_number_of_tasks(timing_map, n_tasks, retries, use_backups):
195181
)
196182
# fmt: on
197183
@pytest.mark.cloud
198-
def test_stragglers(timing_map, n_tasks, retries, expected_invocation_counts_overrides):
199-
try:
200-
outputs = asyncio.run(
201-
run_test(
202-
app_function=deterministic_failure_modal_long_timeout,
203-
input=range(n_tasks),
204-
path=tmp_path,
205-
timing_map=timing_map,
206-
use_backups=True,
207-
)
184+
def test_stragglers(tmp_path, timing_map, n_tasks, retries, expected_invocation_counts_overrides):
185+
path = f"{BASE_PATH}/{tmp_path.name}"
186+
outputs = asyncio.run(
187+
run_test(
188+
app_function=deterministic_failure_modal_long_timeout,
189+
input=range(n_tasks),
190+
path=path,
191+
timing_map=timing_map,
192+
use_backups=True,
208193
)
194+
)
209195

210-
assert outputs == set(range(n_tasks))
211-
check_invocation_counts(tmp_path, timing_map, n_tasks, retries, expected_invocation_counts_overrides)
212-
213-
finally:
214-
fs = fsspec.open(tmp_path).fs
215-
fs.rm(tmp_path, recursive=True)
196+
assert outputs == set(range(n_tasks))
197+
check_invocation_counts(path, timing_map, n_tasks, retries, expected_invocation_counts_overrides)
216198

217199

218200
@pytest.mark.cloud
219201
def test_batch(tmp_path):
220202
# input is unbounded, so if entire input were consumed and not read
221203
# in batches then it would never return, since it would never
222204
# run the first (failing) input
223-
try:
224-
with pytest.raises(RuntimeError):
225-
asyncio.run(
226-
run_test(
227-
app_function=deterministic_failure_modal_no_retries,
228-
input=itertools.count(),
229-
path=tmp_path,
230-
timing_map={0: [-1]},
231-
batch_size=10,
232-
)
205+
path = f"{BASE_PATH}/{tmp_path.name}"
206+
with pytest.raises(RuntimeError):
207+
asyncio.run(
208+
run_test(
209+
app_function=deterministic_failure_modal_no_retries,
210+
input=itertools.count(),
211+
path=path,
212+
timing_map={0: [-1]},
213+
batch_size=10,
233214
)
234-
235-
finally:
236-
fs = fsspec.open(tmp_path).fs
237-
fs.rm(tmp_path, recursive=True)
215+
)

cubed/tests/test_array_api.py

Lines changed: 35 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,3 @@
1-
import fsspec
21
import numpy as np
32
import pytest
43

@@ -430,52 +429,46 @@ def test_matmul(spec, executor):
430429

431430
@pytest.mark.cloud
432431
def test_matmul_cloud(executor):
433-
tmp_path = "gs://barry-zarr-test/matmul"
434-
spec = cubed.Spec(tmp_path, allowed_mem=100000)
435-
try:
436-
a = xp.asarray(
437-
[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]],
438-
chunks=(2, 2),
439-
spec=spec,
440-
)
441-
b = xp.asarray(
442-
[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]],
443-
chunks=(2, 2),
444-
spec=spec,
445-
)
446-
c = xp.matmul(a, b)
447-
x = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]])
448-
y = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]])
449-
expected = np.matmul(x, y)
450-
assert_array_equal(c.compute(executor=executor), expected)
451-
finally:
452-
fs = fsspec.open(tmp_path).fs
453-
fs.rm(tmp_path, recursive=True)
432+
tmp_path = "s3://cubed-unittest/matmul"
433+
spec = cubed.Spec(tmp_path, allowed_mem=100000, storage_options=dict(use_obstore=True))
434+
435+
a = xp.asarray(
436+
[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]],
437+
chunks=(2, 2),
438+
spec=spec,
439+
)
440+
b = xp.asarray(
441+
[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]],
442+
chunks=(2, 2),
443+
spec=spec,
444+
)
445+
c = xp.matmul(a, b)
446+
x = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]])
447+
y = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]])
448+
expected = np.matmul(x, y)
449+
assert_array_equal(c.compute(executor=executor), expected)
454450

455451

456452
@pytest.mark.cloud
457453
def test_matmul_modal(modal_executor):
458454
tmp_path = "s3://cubed-unittest/matmul"
459-
spec = cubed.Spec(tmp_path, allowed_mem=100000)
460-
try:
461-
a = xp.asarray(
462-
[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]],
463-
chunks=(2, 2),
464-
spec=spec,
465-
)
466-
b = xp.asarray(
467-
[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]],
468-
chunks=(2, 2),
469-
spec=spec,
470-
)
471-
c = xp.matmul(a, b)
472-
x = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]])
473-
y = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]])
474-
expected = np.matmul(x, y)
475-
assert_array_equal(c.compute(executor=modal_executor), expected)
476-
finally:
477-
fs = fsspec.open(tmp_path).fs
478-
fs.rm(tmp_path, recursive=True)
455+
spec = cubed.Spec(tmp_path, allowed_mem=100000, storage_options=dict(use_obstore=True))
456+
457+
a = xp.asarray(
458+
[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]],
459+
chunks=(2, 2),
460+
spec=spec,
461+
)
462+
b = xp.asarray(
463+
[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]],
464+
chunks=(2, 2),
465+
spec=spec,
466+
)
467+
c = xp.matmul(a, b)
468+
x = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]])
469+
y = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]])
470+
expected = np.matmul(x, y)
471+
assert_array_equal(c.compute(executor=modal_executor), expected)
479472

480473

481474
def test_outer(spec, executor):

cubed/tests/test_executor_features.py

Lines changed: 19 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@
22
import platform
33
import re
44

5-
import fsspec
65
import numpy as np
76
import psutil
87
import pytest
@@ -152,20 +151,17 @@ def test_callbacks_modal(spec, modal_executor):
152151
task_counter = TaskCounter(check_timestamps=False)
153152
tmp_path = "s3://cubed-unittest/callbacks"
154153
spec = cubed.Spec(tmp_path, allowed_mem=100000)
155-
try:
156-
a = xp.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]], chunks=(2, 2), spec=spec)
157-
b = xp.asarray([[1, 1, 1], [1, 1, 1], [1, 1, 1]], chunks=(2, 2), spec=spec)
158-
c = xp.add(a, b)
159-
assert_array_equal(
160-
c.compute(executor=modal_executor, callbacks=[task_counter]),
161-
np.array([[2, 3, 4], [5, 6, 7], [8, 9, 10]]),
162-
)
163154

164-
num_created_arrays = 1
165-
assert task_counter.value == num_created_arrays + 4
166-
finally:
167-
fs = fsspec.open(tmp_path).fs
168-
fs.rm(tmp_path, recursive=True)
155+
a = xp.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]], chunks=(2, 2), spec=spec)
156+
b = xp.asarray([[1, 1, 1], [1, 1, 1], [1, 1, 1]], chunks=(2, 2), spec=spec)
157+
c = xp.add(a, b)
158+
assert_array_equal(
159+
c.compute(executor=modal_executor, callbacks=[task_counter]),
160+
np.array([[2, 3, 4], [5, 6, 7], [8, 9, 10]]),
161+
)
162+
163+
num_created_arrays = 1
164+
assert task_counter.value == num_created_arrays + 4
169165

170166

171167
@pytest.mark.skipif(
@@ -246,19 +242,16 @@ def test_compute_arrays_in_parallel(spec, any_executor, compute_arrays_in_parall
246242
def test_compute_arrays_in_parallel_modal(modal_executor, compute_arrays_in_parallel):
247243
tmp_path = "s3://cubed-unittest/parallel_pipelines"
248244
spec = cubed.Spec(tmp_path, allowed_mem=100000)
249-
try:
250-
a = cubed.random.random((10, 10), chunks=(5, 5), spec=spec)
251-
b = cubed.random.random((10, 10), chunks=(5, 5), spec=spec)
252-
c = xp.add(a, b)
253245

254-
# note that this merely checks that compute_arrays_in_parallel is accepted
255-
c.compute(
256-
executor=modal_executor,
257-
compute_arrays_in_parallel=compute_arrays_in_parallel,
258-
)
259-
finally:
260-
fs = fsspec.open(tmp_path).fs
261-
fs.rm(tmp_path, recursive=True)
246+
a = cubed.random.random((10, 10), chunks=(5, 5), spec=spec)
247+
b = cubed.random.random((10, 10), chunks=(5, 5), spec=spec)
248+
c = xp.add(a, b)
249+
250+
# note that this merely checks that compute_arrays_in_parallel is accepted
251+
c.compute(
252+
executor=modal_executor,
253+
compute_arrays_in_parallel=compute_arrays_in_parallel,
254+
)
262255

263256

264257
def test_check_runtime_memory_dask(spec, executor):

docs/user-guide/storage.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@ Cubed will delete intermediate data only when the main Python process running th
88

99
## Cloud storage
1010

11-
When using a cloud service, the working directory should be set to a cloud storage directory in the same cloud region that the executor runtimes are in. In this case the directory is specified as a [`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/) URL, such as `s3://cubed-tomwhite-temp`. This is how you would set it using a {py:class}`Spec <cubed.Spec>` object:
11+
When using a cloud service, the working directory should be set to a cloud storage directory in the same cloud region that the executor runtimes are in. In this case the directory is specified as a [`fsspec`](https://filesystem-spec.readthedocs.io/en/latest/) or [`obstore`](https://developmentseed.org/obstore/latest/) URL, such as `s3://cubed-tomwhite-temp`. This is how you would set it using a {py:class}`Spec <cubed.Spec>` object:
1212

1313
```python
1414
import cubed

0 commit comments

Comments (0)