Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
46cfc1d
create empty
dmitriyrepin Sep 29, 2025
8fbc127
Add test_overwrite_behavior
dmitriyrepin Sep 29, 2025
6d1fcea
Fix pre-commit
dmitriyrepin Sep 29, 2025
687dcc8
Merge remote-tracking branch 'upstream' into create_empty
dmitriyrepin Sep 30, 2025
40bd1da
Update API
dmitriyrepin Sep 30, 2025
c73a7ed
move create_empty_mdio
dmitriyrepin Sep 30, 2025
5e36333
Revert TMP -> tmp change
dmitriyrepin Sep 30, 2025
7d0b562
Pre-commit formatting
dmitriyrepin Sep 30, 2025
56aecf6
_create_empty_mdio with template_name
dmitriyrepin Oct 1, 2025
eab14a7
Merge 'upstream/main'
dmitriyrepin Oct 1, 2025
b7f3c40
pre-commit
dmitriyrepin Oct 1, 2025
6c3a03b
Merge branch 'main' into create_empty
BrianMichell Oct 1, 2025
94fd3fe
PR review and test_populate_empty_dataset
dmitriyrepin Oct 2, 2025
67adce2
Merge 'upstream/main' into create_empty
dmitriyrepin Oct 2, 2025
67a03ec
Merge 'origin/create_empty' into create_empty
dmitriyrepin Oct 2, 2025
bf2e41f
Use headers: HeaderSpec
dmitriyrepin Oct 2, 2025
e1e3ce2
Add export to segy to test_populate_empty_dataset
dmitriyrepin Oct 3, 2025
b8d12cf
Merge remote-tracking branch 'upstream/main' into create_empty
dmitriyrepin Oct 6, 2025
903d78a
Update for upstream changes
dmitriyrepin Oct 6, 2025
344cf65
Pre-commit added empty line
dmitriyrepin Oct 6, 2025
83b7717
Use Teapod dimensions
dmitriyrepin Oct 8, 2025
1072b98
Merge branch 'main' into create_empty
BrianMichell Oct 13, 2025
7a65c07
Merge remote-tracking branch 'upstream/main' into create_empty
dmitriyrepin Oct 27, 2025
251e2f6
Merge upstream/main' into create_empty
dmitriyrepin Oct 27, 2025
c7091a8
Merge branch 'origin/create_empty'
dmitriyrepin Oct 27, 2025
325466f
move creators/mdio.py : create_empty() to api/create.py: create_empty(
dmitriyrepin Oct 27, 2025
1a95f82
create_empty_like
dmitriyrepin Oct 27, 2025
26d2c26
Add stats to validate_xr_variable
dmitriyrepin Oct 27, 2025
bade123
fix white space change failure of pre-commit
dmitriyrepin Oct 27, 2025
baa3da3
remove tmp_path_factory
dmitriyrepin Oct 27, 2025
d974125
Ensure test order: create_empty after teapod_roundtrip
dmitriyrepin Oct 28, 2025
2d6e250
Fix createdOn in create_empty_like
dmitriyrepin Oct 28, 2025
1637396
Return xr_dataset from create_empty
dmitriyrepin Oct 28, 2025
cc1f6d7
Fix pre-commit
dmitriyrepin Oct 28, 2025
5d2ef41
Merge branch 'main' into create_empty
BrianMichell Oct 29, 2025
a9128cb
Merge upstream/main'
dmitriyrepin Oct 29, 2025
1da04f9
Adjust to the breaking changes in upstream/main
dmitriyrepin Oct 29, 2025
de35aeb
Merge branch 'origin/create_empty'
dmitriyrepin Oct 29, 2025
62e9be7
Address some of PR review comments
dmitriyrepin Oct 29, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/mdio/api/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,6 @@
"""Public API."""

from mdio.api.create import create_empty
from mdio.api.create import create_empty_like

__all__ = ["create_empty", "create_empty_like"]
168 changes: 168 additions & 0 deletions src/mdio/api/create.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
"""Creating MDIO v1 datasets."""

from __future__ import annotations

from datetime import UTC
from datetime import datetime
from typing import TYPE_CHECKING

from mdio.api.io import _normalize_path
from mdio.api.io import open_mdio
from mdio.api.io import to_mdio
from mdio.builder.template_registry import TemplateRegistry
from mdio.builder.xarray_builder import to_xarray_dataset
from mdio.converters.segy import populate_dim_coordinates
from mdio.converters.type_converter import to_structured_type
from mdio.core.grid import Grid

if TYPE_CHECKING:
from pathlib import Path

from segy.schema import HeaderSpec
from upath import UPath
from xarray import Dataset as xr_Dataset

from mdio.builder.schemas import Dataset
from mdio.builder.templates.base import AbstractDatasetTemplate
from mdio.core.dimension import Dimension


def create_empty(  # noqa PLR0913
    mdio_template: AbstractDatasetTemplate | str,
    dimensions: list[Dimension],
    output_path: UPath | Path | str | None,
    headers: HeaderSpec | None = None,
    overwrite: bool = False,
) -> xr_Dataset:
    """Create an empty MDIO v1 dataset with known dimensions.

    Args:
        mdio_template: The MDIO template or template name to use to define the dataset structure.
        dimensions: The dimensions of the MDIO file.
        output_path: The universal path for the output MDIO v1 file.
            If None, the dataset is built in memory and nothing is written to disk.
        headers: SEG-Y v1.0 trace headers. Defaults to None.
        overwrite: Whether to overwrite the output file if it already exists. Defaults to False.

    Returns:
        The dataset restricted to the variables written eagerly: the names returned by
        `populate_dim_coordinates` plus `trace_mask`.

    Raises:
        FileExistsError: If the output location already exists and overwrite is False.
    """
    # `None` means "do not write", so it must bypass both normalization and the existence check.
    output_path = _normalize_path(output_path) if output_path is not None else None

    if not overwrite and output_path is not None and output_path.exists():
        err = f"Output location '{output_path.as_posix()}' exists. Set `overwrite=True` if intended."
        raise FileExistsError(err)

    header_dtype = to_structured_type(headers.dtype) if headers else None
    grid = Grid(dims=dimensions)
    if isinstance(mdio_template, str):
        # A template name is passed in. Get a unit-unaware template from registry
        mdio_template = TemplateRegistry().get(mdio_template)
    # Build the dataset using the template
    mdio_ds: Dataset = mdio_template.build_dataset(name=mdio_template.name, sizes=grid.shape, header_dtype=header_dtype)

    # Convert to xarray dataset
    xr_dataset: xr_Dataset = to_xarray_dataset(mdio_ds=mdio_ds)

    # Populate coordinates using the grid
    # For empty datasets, we only populate dimension coordinates
    drop_vars_delayed = []
    xr_dataset, drop_vars_delayed = populate_dim_coordinates(xr_dataset, grid, drop_vars_delayed=drop_vars_delayed)

    if headers:
        # Since the headers were provided, the user wants to export to SEG-Y
        # Add a dummy segy_file_header variable used to export to SEG-Y
        xr_dataset["segy_file_header"] = ((), "")

    # Only the dimension coordinates and the trace mask hold real values in an empty dataset
    eager_subset = xr_dataset[drop_vars_delayed + ["trace_mask"]]

    if output_path is not None:
        # First lay out the full Zarr structure without computing any array (compute=False) ...
        to_mdio(xr_dataset, output_path=output_path, mode="w", compute=False)
        # ... then write the dimension coordinates and the trace mask into the existing store
        to_mdio(eager_subset, output_path=output_path, mode="r+", compute=True)

    return eager_subset


def create_empty_like(  # noqa PLR0913
    input_path: UPath | Path | str,
    output_path: UPath | Path | str | None,
    keep_coordinates: bool = False,
    overwrite: bool = False,
) -> xr_Dataset:
    """Create an empty MDIO v1 dataset with the same structure as an existing one.

    Args:
        input_path: The path of the input MDIO file.
        output_path: The path of the output MDIO file.
            If None, the output will not be written to disk.
        keep_coordinates: Whether to keep the coordinates in the output file.
        overwrite: Whether to overwrite the output file if it exists.

    Returns:
        The output MDIO dataset.

    Raises:
        FileExistsError: If the output location already exists and overwrite is False.
    """
    input_path = _normalize_path(input_path)
    output_path = _normalize_path(output_path) if output_path is not None else None

    if not overwrite and output_path is not None and output_path.exists():
        err = f"Output location '{output_path.as_posix()}' exists. Set `overwrite=True` if intended."
        raise FileExistsError(err)

    ds = open_mdio(input_path)

    # Copy the structure and, unless `keep_coordinates` is set, drop the non-index coordinates.
    # NOTE(review): per the xarray docs, `data=None` is the default of `Dataset.copy` and keeps the
    # original variable data rather than blanking it - confirm the output does not carry input samples.
    ds_output = ds.copy(data=None).reset_coords(drop=not keep_coordinates)

    # Dataset attributes: "name" (same as the template name) and "apiVersion" are kept as-is;
    # only the creation timestamp is refreshed.
    ds_output.attrs["createdOn"] = str(datetime.now(UTC))

    # Coordinates
    if not keep_coordinates:
        for coord_name in ds_output.coords:
            ds_output[coord_name].attrs.pop("unitsV1", None)

    # MDIO attributes
    attr = ds_output.attrs["attributes"]
    if attr is not None:
        # Empty dataset should not have gridOverrides; "defaultVariableName", "surveyType"
        # and "gatherType" keep their original values.
        attr.pop("gridOverrides", None)

        # Data variable: its name is only known through the MDIO attributes, so this cleanup
        # is skipped (instead of failing on `None[...]`) when the attributes are absent.
        var = ds_output[attr["defaultVariableName"]]
        var.attrs.pop("statsV1", None)
        if not keep_coordinates:
            var.attrs.pop("unitsV1", None)

    # All traces should be marked as dead in empty dataset
    if "trace_mask" in ds_output.variables:
        ds_output["trace_mask"][:] = False

    # SEG-Y file header: strip the header payload inherited from the input
    if "segy_file_header" in ds_output.variables:
        header_attrs = ds_output["segy_file_header"].attrs
        for key in ("textHeader", "binaryHeader", "rawBinaryHeader"):
            header_attrs.pop(key, None)

    if output_path is not None:
        to_mdio(ds_output, output_path=output_path, mode="w", compute=True)

    return ds_output
3 changes: 2 additions & 1 deletion src/mdio/builder/dataset_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from enum import auto
from typing import Any

from mdio import __version__
from mdio.builder.formatting_html import dataset_builder_repr_html
from mdio.builder.schemas.compressors import ZFP
from mdio.builder.schemas.compressors import Blosc
Expand Down Expand Up @@ -59,6 +58,8 @@ class MDIODatasetBuilder:
"""

def __init__(self, name: str, attributes: dict[str, Any] | None = None):
from mdio import __version__ # noqa: PLC0415 - fixed circular import in mdio package and dataset_builder.py

self._metadata = DatasetMetadata(
name=name,
api_version=__version__,
Expand Down
28 changes: 26 additions & 2 deletions src/mdio/converters/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,30 @@
"""MDIO Data conversion API."""

from mdio.converters.mdio import mdio_to_segy
from mdio.converters.segy import segy_to_mdio
from typing import TYPE_CHECKING
from typing import Any

if TYPE_CHECKING:
from mdio.converters.mdio import mdio_to_segy
from mdio.converters.segy import segy_to_mdio

__all__ = ["mdio_to_segy", "segy_to_mdio"]


def __getattr__(name: str) -> Any:  # noqa: ANN401 - required for dynamic attribute access
    """Resolve the public converter functions on first access.

    The converter modules are imported inside this hook, not at package import time,
    to avoid circular imports.

    Args:
        name: The attribute name requested on this package.

    Returns:
        The converter function matching `name`.

    Raises:
        AttributeError: If `name` is not one of the lazily provided converters.
    """
    # Reject unknown names up front so the branches below only deal with real converters
    if name not in ("mdio_to_segy", "segy_to_mdio"):
        err = f"module {__name__!r} has no attribute {name!r}"
        raise AttributeError(err)

    if name == "mdio_to_segy":
        from mdio.converters.mdio import (  # noqa: PLC0415 - intentionally inside the function to avoid circular imports
            mdio_to_segy as converter,
        )
    else:
        from mdio.converters.segy import (  # noqa: PLC0415 - intentionally inside the function to avoid circular imports
            segy_to_mdio as converter,
        )

    return converter
18 changes: 18 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,18 @@ def zarr_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path:
return tmp_path_factory.mktemp(r"mdio")


@pytest.fixture(scope="session")
def teapot_mdio_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path:
    """Provide a temporary directory, shared across the test session, for the Teapot MDIO output."""
    teapot_dir = tmp_path_factory.mktemp(r"teapot.mdio")
    return teapot_dir


@pytest.fixture(scope="module")
def mdio_4d_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path:
    """Provide a temporary directory, shared within a test module, for the 4D MDIO output."""
    output_dir = tmp_path_factory.mktemp(r"tmp_4d.mdio")
    return output_dir


@pytest.fixture(scope="module")
def zarr_tmp2(tmp_path_factory: pytest.TempPathFactory) -> Path: # pragma: no cover - used by disabled test
"""Make a temp file for the output MDIO."""
Expand All @@ -58,3 +70,9 @@ def segy_export_tmp(tmp_path_factory: pytest.TempPathFactory) -> Path:
"""Make a temp file for the round-trip IBM SEG-Y."""
tmp_dir = tmp_path_factory.mktemp("segy")
return tmp_dir / "teapot_roundtrip.segy"


@pytest.fixture(scope="class")
def empty_mdio_dir(tmp_path_factory: pytest.TempPathFactory) -> Path:
    """Provide a temporary directory, shared within a test class, for empty MDIO testing."""
    scratch_dir = tmp_path_factory.mktemp(r"empty_mdio_dir")
    return scratch_dir
22 changes: 11 additions & 11 deletions tests/integration/test_import_streamer_grid_overrides.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class TestImport4DNonReg: # pragma: no cover - tests is skipped
def test_import_4d_segy( # noqa: PLR0913
self,
segy_mock_4d_shots: dict[StreamerShotGeometryType, Path],
zarr_tmp: Path,
mdio_4d_tmp: Path,
grid_override: dict[str, Any],
chan_header_type: StreamerShotGeometryType,
) -> None:
Expand All @@ -51,7 +51,7 @@ def test_import_4d_segy( # noqa: PLR0913
segy_spec=segy_spec,
mdio_template=TemplateRegistry().get("PreStackShotGathers3DTime"),
input_path=segy_path,
output_path=zarr_tmp,
output_path=mdio_4d_tmp,
overwrite=True,
grid_overrides=grid_override,
)
Expand All @@ -62,7 +62,7 @@ def test_import_4d_segy( # noqa: PLR0913
cables = [0, 101, 201, 301]
receivers_per_cable = [1, 5, 7, 5]

ds = open_mdio(zarr_tmp)
ds = open_mdio(mdio_4d_tmp)

assert ds["segy_file_header"].attrs["binaryHeader"]["samples_per_trace"] == num_samples
assert ds.attrs["attributes"]["gridOverrides"] == grid_override
Expand All @@ -86,7 +86,7 @@ class TestImport4D:
def test_import_4d_segy( # noqa: PLR0913
self,
segy_mock_4d_shots: dict[StreamerShotGeometryType, Path],
zarr_tmp: Path,
mdio_4d_tmp: Path,
grid_override: dict[str, Any],
chan_header_type: StreamerShotGeometryType,
) -> None:
Expand All @@ -98,7 +98,7 @@ def test_import_4d_segy( # noqa: PLR0913
segy_spec=segy_spec,
mdio_template=TemplateRegistry().get("PreStackShotGathers3DTime"),
input_path=segy_path,
output_path=zarr_tmp,
output_path=mdio_4d_tmp,
overwrite=True,
grid_overrides=grid_override,
)
Expand All @@ -109,7 +109,7 @@ def test_import_4d_segy( # noqa: PLR0913
cables = [0, 101, 201, 301]
receivers_per_cable = [1, 5, 7, 5]

ds = open_mdio(zarr_tmp)
ds = open_mdio(mdio_4d_tmp)

assert ds["segy_file_header"].attrs["binaryHeader"]["samples_per_trace"] == num_samples
assert ds.attrs["attributes"].get("gridOverrides", None) == grid_override # may not exist, so default=None
Expand All @@ -134,7 +134,7 @@ class TestImport4DSparse:
def test_import_4d_segy( # noqa: PLR0913
self,
segy_mock_4d_shots: dict[StreamerShotGeometryType, Path],
zarr_tmp: Path,
mdio_4d_tmp: Path,
chan_header_type: StreamerShotGeometryType,
) -> None:
"""Test importing a SEG-Y file to MDIO."""
Expand All @@ -148,7 +148,7 @@ def test_import_4d_segy( # noqa: PLR0913
segy_spec=segy_spec,
mdio_template=TemplateRegistry().get("PreStackShotGathers3DTime"),
input_path=segy_path,
output_path=zarr_tmp,
output_path=mdio_4d_tmp,
overwrite=True,
)

Expand All @@ -167,7 +167,7 @@ class TestImport6D: # pragma: no cover - tests is skipped
def test_import_6d_segy( # noqa: PLR0913
self,
segy_mock_4d_shots: dict[StreamerShotGeometryType, Path],
zarr_tmp: Path,
mdio_4d_tmp: Path,
grid_override: dict[str, Any],
chan_header_type: StreamerShotGeometryType,
) -> None:
Expand All @@ -179,7 +179,7 @@ def test_import_6d_segy( # noqa: PLR0913
segy_spec=segy_spec,
mdio_template=TemplateRegistry().get("XYZ"), # Placeholder for the template
input_path=segy_path,
output_path=zarr_tmp,
output_path=mdio_4d_tmp,
overwrite=True,
grid_overrides=grid_override,
)
Expand All @@ -195,7 +195,7 @@ def test_import_6d_segy( # noqa: PLR0913
guns = [1, 2]
receivers_per_cable = [1, 5, 7, 5]

ds = open_mdio(zarr_tmp)
ds = open_mdio(mdio_4d_tmp)

xrt.assert_duckarray_equal(ds["gun"], guns)
xrt.assert_duckarray_equal(ds["shot_point"], shots)
Expand Down
Loading
Loading