From 54dc0f9dcad0738d3762f98390a71913f0af06bd Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 22 Oct 2025 16:06:19 +0000 Subject: [PATCH 01/21] Working example of DuplicateIndex grid override --- src/mdio/converters/segy.py | 35 ++++++++++++++---- src/mdio/segy/geometry.py | 37 ++++++++++++++++++- src/mdio/segy/utilities.py | 8 +++- .../test_import_streamer_grid_overrides.py | 11 +++--- 4 files changed, 75 insertions(+), 16 deletions(-) diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index 708d4aed1..fcedeb08a 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -143,7 +143,11 @@ def _scan_for_headers( """Extract trace dimensions and index headers from the SEG-Y file. This is an expensive operation. - It scans the SEG-Y file in chunks by using ProcessPoolExecutor + It scans the SEG-Y file in chunks by using ProcessPoolExecutor. + + Note: + If grid_overrides are applied to the template before calling this function, + the chunk_size returned from get_grid_plan should match the template's chunk shape. """ full_chunk_shape = template.full_chunk_shape segy_dimensions, chunk_size, segy_headers = get_grid_plan( @@ -154,13 +158,19 @@ def _scan_for_headers( chunksize=full_chunk_shape, grid_overrides=grid_overrides, ) + + # After applying grid overrides to the template, chunk sizes should match + # If they don't match, it means the template wasn't properly updated if full_chunk_shape != chunk_size: - # TODO(Dmitriy): implement grid overrides - # https://github.com/TGSAI/mdio-python/issues/585 - # The returned 'chunksize' is used only for grid_overrides. We will need to use it when full - # support for grid overrides is implemented - err = "Support for changing full_chunk_shape in grid overrides is not yet implemented" - raise NotImplementedError(err) + logger.warning( + "Chunk shape mismatch: template has %s but grid_plan returned %s. " + "Using grid_plan chunk shape.", + full_chunk_shape, + chunk_size, + ) + # Update the template's chunk shape to match what grid_plan returned + template._var_chunk_shape = chunk_size + return segy_dimensions, segy_headers @@ -552,6 +562,17 @@ def segy_to_mdio( # noqa PLR0913 ) grid = _build_and_check_grid(segy_dimensions, segy_file_info, segy_headers) + # Update template dimensions to match the actual grid dimensions after grid overrides + # The chunk shape was already updated in _scan_for_headers, we just need to fix dimensions + actual_spatial_dims = tuple(grid.dim_names[:-1]) # All dims except the vertical/time dimension + if mdio_template.spatial_dimension_names != actual_spatial_dims: + logger.info( + "Adjusting template dimensions from %s to %s to match grid after overrides", + mdio_template.spatial_dimension_names, + actual_spatial_dims, + ) + mdio_template._dim_names = actual_spatial_dims + (mdio_template.trace_domain,) + _, non_dim_coords = _get_coordinates(grid, segy_headers, mdio_template) header_dtype = to_structured_type(segy_spec.trace.header.dtype) diff --git a/src/mdio/segy/geometry.py b/src/mdio/segy/geometry.py index ed41e42e8..d8053cf24 100644 --- a/src/mdio/segy/geometry.py +++ b/src/mdio/segy/geometry.py @@ -267,7 +267,8 @@ def analyze_non_indexed_headers(index_headers: HeaderArray, dtype: DTypeLike = n header_names = [] for header_key in index_headers.dtype.names: if header_key != "trace": - unique_headers[header_key] = np.sort(np.unique(index_headers[header_key])) + unique_vals = np.sort(np.unique(index_headers[header_key])) + unique_headers[header_key] = unique_vals header_names.append(header_key) total_depth += 1 @@ -382,7 +383,39 @@ def transform( """Perform the grid transform.""" self.validate(index_headers, grid_overrides) - return analyze_non_indexed_headers(index_headers) + # Filter to only include dimension fields, not coordinate fields + # Coordinate fields typically have _x, _y suffixes or specific names like 'gun' + # We want to keep fields like shot_point, cable, channel but exclude source_coord_x, etc. + dimension_fields = [] + coordinate_field_patterns = ['_x', '_y', '_coord', 'gun', 'source', 'group'] + + for field_name in index_headers.dtype.names: + # Skip if it's already trace + if field_name == 'trace': + continue + # Check if it looks like a coordinate field + is_coordinate = any(pattern in field_name for pattern in coordinate_field_patterns) + if not is_coordinate: + dimension_fields.append(field_name) + + # Extract only dimension fields for trace indexing + if dimension_fields: + dimension_headers = index_headers[dimension_fields] + else: + # If no dimension fields, use all fields + dimension_headers = index_headers + + # Create trace indices based on dimension fields only + dimension_headers_with_trace = analyze_non_indexed_headers(dimension_headers) + + # Add the trace field back to the full index_headers array + if dimension_headers_with_trace is not None and 'trace' in dimension_headers_with_trace.dtype.names: + # Extract just the trace values array (not the whole structured array) + trace_values = np.array(dimension_headers_with_trace['trace']) + # Append as a new field to the full headers + index_headers = rfn.append_fields(index_headers, 'trace', trace_values, usemask=False) + + return index_headers def transform_index_names(self, index_names: Sequence[str]) -> Sequence[str]: """Insert dimension "trace" to the sample-1 dimension.""" diff --git a/src/mdio/segy/utilities.py b/src/mdio/segy/utilities.py index f6d76bc6a..719549373 100644 --- a/src/mdio/segy/utilities.py +++ b/src/mdio/segy/utilities.py @@ -72,9 +72,15 @@ def get_grid_plan( # noqa: C901, PLR0913 chunksize=chunksize, grid_overrides=grid_overrides, ) + # Use the spatial dimension names from horizontal_coordinates (which may have been modified by grid overrides) + # Extract only the dimension names (not including non-dimension coordinates) + # After grid overrides, trace might have been added to horizontal_coordinates + transformed_spatial_dims = [name for name in horizontal_coordinates if name in horizontal_dimensions or name == "trace"] dimensions = [] - for dim_name in horizontal_dimensions: + for dim_name in transformed_spatial_dims: + if dim_name not in headers_subset.dtype.names: + continue dim_unique = np.unique(headers_subset[dim_name]) dimensions.append(Dimension(coords=dim_unique, name=dim_name)) diff --git a/tests/integration/test_import_streamer_grid_overrides.py b/tests/integration/test_import_streamer_grid_overrides.py index d05070f5d..1067a7f19 100644 --- a/tests/integration/test_import_streamer_grid_overrides.py +++ b/tests/integration/test_import_streamer_grid_overrides.py @@ -27,13 +27,12 @@ dask.config.set(scheduler="synchronous") os.environ["MDIO__IMPORT__SAVE_SEGY_FILE_HEADER"] = "true" - -# TODO(Altay): Finish implementing these grid overrides. +# TODO(BrianMichell): Add non-binned back # https://github.com/TGSAI/mdio-python/issues/612 -@pytest.mark.skip(reason="NonBinned and HasDuplicates haven't been properly implemented yet.") -@pytest.mark.parametrize("grid_override", [{"NonBinned": True}, {"HasDuplicates": True}]) +# @pytest.mark.parametrize("grid_override", [{"NonBinned": True, "chunksize": 4}, {"HasDuplicates": True}]) +@pytest.mark.parametrize("grid_override", [{"HasDuplicates": True}]) @pytest.mark.parametrize("chan_header_type", [StreamerShotGeometryType.C]) -class TestImport4DNonReg: # pragma: no cover - tests is skipped +class TestImport4DNonReg: """Test for 4D segy import with grid overrides.""" def test_import_4d_segy( # noqa: PLR0913 @@ -67,7 +66,7 @@ def test_import_4d_segy( # noqa: PLR0913 assert ds["segy_file_header"].attrs["binaryHeader"]["samples_per_trace"] == num_samples assert ds.attrs["attributes"]["gridOverrides"] == grid_override - assert npt.assert_array_equal(ds["shot_point"], shots) + xrt.assert_duckarray_equal(ds["shot_point"], shots) xrt.assert_duckarray_equal(ds["cable"], cables) # assert grid.select_dim("trace") == Dimension(range(1, np.amax(receivers_per_cable) + 1), "trace") From b3712281871206ea109d9e754445ecba5a707fb6 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 22 Oct 2025 16:13:20 +0000 Subject: [PATCH 02/21] Remove hardcoded values + linting --- src/mdio/converters/segy.py | 3 +- src/mdio/segy/geometry.py | 44 ++++++++++--------- src/mdio/segy/utilities.py | 5 ++- .../test_import_streamer_grid_overrides.py | 2 +- 4 files changed, 29 insertions(+), 25 deletions(-) diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index fcedeb08a..5e25c49b6 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -163,8 +163,7 @@ def _scan_for_headers( # If they don't match, it means the template wasn't properly updated if full_chunk_shape != chunk_size: logger.warning( - "Chunk shape mismatch: template has %s but grid_plan returned %s. " - "Using grid_plan chunk shape.", + "Chunk shape mismatch: template has %s but grid_plan returned %s. Using grid_plan chunk shape.", full_chunk_shape, chunk_size, ) diff --git a/src/mdio/segy/geometry.py b/src/mdio/segy/geometry.py index d8053cf24..f9f621b2d 100644 --- a/src/mdio/segy/geometry.py +++ b/src/mdio/segy/geometry.py @@ -24,6 +24,8 @@ from numpy.typing import NDArray from segy.arrays import HeaderArray + from mdio.builder.templates.base import AbstractDatasetTemplate + logger = logging.getLogger(__name__) @@ -303,6 +305,7 @@ def transform( self, index_headers: HeaderArray, grid_overrides: dict[str, bool | int], + template: AbstractDatasetTemplate, # noqa: ARG002 ) -> NDArray: """Perform the grid transform.""" @@ -379,42 +382,38 @@ def transform( self, index_headers: HeaderArray, grid_overrides: dict[str, bool | int], + template: AbstractDatasetTemplate, # noqa: ARG002 ) -> NDArray: """Perform the grid transform.""" self.validate(index_headers, grid_overrides) # Filter to only include dimension fields, not coordinate fields - # Coordinate fields typically have _x, _y suffixes or specific names like 'gun' - # We want to keep fields like shot_point, cable, channel but exclude source_coord_x, etc. + # We want to keep fields like shot_point, cable, channel but exclude coordinate fields + # Use the template's coordinate names to determine which fields are coordinates + coordinate_fields = set(template.coordinate_names) dimension_fields = [] - coordinate_field_patterns = ['_x', '_y', '_coord', 'gun', 'source', 'group'] - + for field_name in index_headers.dtype.names: # Skip if it's already trace - if field_name == 'trace': + if field_name == "trace": continue - # Check if it looks like a coordinate field - is_coordinate = any(pattern in field_name for pattern in coordinate_field_patterns) - if not is_coordinate: + # Check if this field is a coordinate field according to the template + if field_name not in coordinate_fields: dimension_fields.append(field_name) - + # Extract only dimension fields for trace indexing - if dimension_fields: - dimension_headers = index_headers[dimension_fields] - else: - # If no dimension fields, use all fields - dimension_headers = index_headers - + dimension_headers = index_headers[dimension_fields] if dimension_fields else index_headers + # Create trace indices based on dimension fields only dimension_headers_with_trace = analyze_non_indexed_headers(dimension_headers) - + # Add the trace field back to the full index_headers array - if dimension_headers_with_trace is not None and 'trace' in dimension_headers_with_trace.dtype.names: + if dimension_headers_with_trace is not None and "trace" in dimension_headers_with_trace.dtype.names: # Extract just the trace values array (not the whole structured array) - trace_values = np.array(dimension_headers_with_trace['trace']) + trace_values = np.array(dimension_headers_with_trace["trace"]) # Append as a new field to the full headers - index_headers = rfn.append_fields(index_headers, 'trace', trace_values, usemask=False) - + index_headers = rfn.append_fields(index_headers, "trace", trace_values, usemask=False) + return index_headers def transform_index_names(self, index_names: Sequence[str]) -> Sequence[str]: @@ -467,6 +466,7 @@ def transform( self, index_headers: HeaderArray, grid_overrides: dict[str, bool | int], + template: AbstractDatasetTemplate, # noqa: ARG002 ) -> NDArray: """Perform the grid transform.""" self.validate(index_headers, grid_overrides) @@ -504,6 +504,7 @@ def transform( self, index_headers: HeaderArray, grid_overrides: dict[str, bool | int], + template: AbstractDatasetTemplate, # noqa: ARG002 ) -> NDArray: """Perform the grid transform.""" self.validate(index_headers, grid_overrides) @@ -565,6 +566,7 @@ def run( index_names: Sequence[str], grid_overrides: dict[str, bool], chunksize: Sequence[int] | None = None, + template: AbstractDatasetTemplate | None = None, ) -> tuple[HeaderArray, tuple[str], tuple[int]]: """Run grid overrides and return result.""" for override in grid_overrides: @@ -575,7 +577,7 @@ def run( raise GridOverrideUnknownError(override) function = self.commands[override].transform - index_headers = function(index_headers, grid_overrides=grid_overrides) + index_headers = function(index_headers, grid_overrides=grid_overrides, template=template) function = self.commands[override].transform_index_names index_names = function(index_names) diff --git a/src/mdio/segy/utilities.py b/src/mdio/segy/utilities.py index 719549373..289e7f940 100644 --- a/src/mdio/segy/utilities.py +++ b/src/mdio/segy/utilities.py @@ -71,11 +71,14 @@ def get_grid_plan( # noqa: C901, PLR0913 horizontal_coordinates, chunksize=chunksize, grid_overrides=grid_overrides, + template=template, ) # Use the spatial dimension names from horizontal_coordinates (which may have been modified by grid overrides) # Extract only the dimension names (not including non-dimension coordinates) # After grid overrides, trace might have been added to horizontal_coordinates - transformed_spatial_dims = [name for name in horizontal_coordinates if name in horizontal_dimensions or name == "trace"] + transformed_spatial_dims = [ + name for name in horizontal_coordinates if name in horizontal_dimensions or name == "trace" + ] dimensions = [] for dim_name in transformed_spatial_dims: diff --git a/tests/integration/test_import_streamer_grid_overrides.py b/tests/integration/test_import_streamer_grid_overrides.py index 1067a7f19..56f592e64 100644 --- a/tests/integration/test_import_streamer_grid_overrides.py +++ b/tests/integration/test_import_streamer_grid_overrides.py @@ -7,7 +7,6 @@ import dask import numpy as np -import numpy.testing as npt import pytest import xarray.testing as xrt from tests.integration.conftest import get_segy_mock_4d_spec @@ -27,6 +26,7 @@ dask.config.set(scheduler="synchronous") os.environ["MDIO__IMPORT__SAVE_SEGY_FILE_HEADER"] = "true" + # TODO(BrianMichell): Add non-binned back # https://github.com/TGSAI/mdio-python/issues/612 # @pytest.mark.parametrize("grid_override", [{"NonBinned": True, "chunksize": 4}, {"HasDuplicates": True}]) From c952a768b26b0e6203527939c118abc9fdda0b6e Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 22 Oct 2025 18:09:34 +0000 Subject: [PATCH 03/21] Streamline code --- src/mdio/segy/geometry.py | 37 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/src/mdio/segy/geometry.py b/src/mdio/segy/geometry.py index f9f621b2d..49a02320f 100644 --- a/src/mdio/segy/geometry.py +++ b/src/mdio/segy/geometry.py @@ -387,31 +387,18 @@ def transform( """Perform the grid transform.""" self.validate(index_headers, grid_overrides) - # Filter to only include dimension fields, not coordinate fields - # We want to keep fields like shot_point, cable, channel but exclude coordinate fields - # Use the template's coordinate names to determine which fields are coordinates - coordinate_fields = set(template.coordinate_names) - dimension_fields = [] - - for field_name in index_headers.dtype.names: - # Skip if it's already trace - if field_name == "trace": - continue - # Check if this field is a coordinate field according to the template - if field_name not in coordinate_fields: - dimension_fields.append(field_name) - - # Extract only dimension fields for trace indexing - dimension_headers = index_headers[dimension_fields] if dimension_fields else index_headers - - # Create trace indices based on dimension fields only - dimension_headers_with_trace = analyze_non_indexed_headers(dimension_headers) - - # Add the trace field back to the full index_headers array - if dimension_headers_with_trace is not None and "trace" in dimension_headers_with_trace.dtype.names: - # Extract just the trace values array (not the whole structured array) - trace_values = np.array(dimension_headers_with_trace["trace"]) - # Append as a new field to the full headers + # Filter out coordinate fields, keep only dimensions for trace indexing + coord_fields = set(template.coordinate_names) if template else set() + dim_fields = [name for name in index_headers.dtype.names + if name != "trace" and name not in coord_fields] + + # Create trace indices on dimension fields only + dim_headers = index_headers[dim_fields] if dim_fields else index_headers + dim_headers_with_trace = analyze_non_indexed_headers(dim_headers) + + # Add trace field back to full headers + if dim_headers_with_trace is not None and "trace" in dim_headers_with_trace.dtype.names: + trace_values = np.array(dim_headers_with_trace["trace"]) index_headers = rfn.append_fields(index_headers, "trace", trace_values, usemask=False) return index_headers From 4aea1fa3b9c25068c10a9fe5ad168c2c207f9fde Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 22 Oct 2025 18:33:28 +0000 Subject: [PATCH 04/21] Relocate template mutation to all happen in the same function -- Update logging level --- src/mdio/converters/segy.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index 5e25c49b6..98842567b 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -170,6 +170,17 @@ def _scan_for_headers( # Update the template's chunk shape to match what grid_plan returned template._var_chunk_shape = chunk_size + # Update template dimensions to match the actual grid dimensions after grid overrides + # The dimensions from grid_plan already reflect grid overrides + actual_spatial_dims = tuple(dim.name for dim in segy_dimensions[:-1]) # All dims except the vertical/time dimension + if template.spatial_dimension_names != actual_spatial_dims: + logger.debug( + "Adjusting template dimensions from %s to %s to match grid after overrides", + template.spatial_dimension_names, + actual_spatial_dims, + ) + template._dim_names = actual_spatial_dims + (template.trace_domain,) + return segy_dimensions, segy_headers @@ -561,17 +572,6 @@ def segy_to_mdio( # noqa PLR0913 ) grid = _build_and_check_grid(segy_dimensions, segy_file_info, segy_headers) - # Update template dimensions to match the actual grid dimensions after grid overrides - # The chunk shape was already updated in _scan_for_headers, we just need to fix dimensions - actual_spatial_dims = tuple(grid.dim_names[:-1]) # All dims except the vertical/time dimension - if mdio_template.spatial_dimension_names != actual_spatial_dims: - logger.info( - "Adjusting template dimensions from %s to %s to match grid after overrides", - mdio_template.spatial_dimension_names, - actual_spatial_dims, - ) - mdio_template._dim_names = actual_spatial_dims + (mdio_template.trace_domain,) - _, non_dim_coords = _get_coordinates(grid, segy_headers, mdio_template) header_dtype = to_structured_type(segy_spec.trace.header.dtype) From 1e3622bd749cf46e4af132ee0ccc39c663d8fffe Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 22 Oct 2025 18:33:33 +0000 Subject: [PATCH 05/21] Formatting --- src/mdio/segy/geometry.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mdio/segy/geometry.py b/src/mdio/segy/geometry.py index 49a02320f..99a4ae9c4 100644 --- a/src/mdio/segy/geometry.py +++ b/src/mdio/segy/geometry.py @@ -389,8 +389,7 @@ def transform( # Filter out coordinate fields, keep only dimensions for trace indexing coord_fields = set(template.coordinate_names) if template else set() - dim_fields = [name for name in index_headers.dtype.names - if name != "trace" and name not in coord_fields] + dim_fields = [name for name in index_headers.dtype.names if name != "trace" and name not in coord_fields] # Create trace indices on dimension fields only dim_headers = index_headers[dim_fields] if dim_fields else index_headers From 98c316b85b5f2935b30280c4c5baa8f21e3e8c76 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 22 Oct 2025 18:52:06 +0000 Subject: [PATCH 06/21] Clean up --- src/mdio/converters/segy.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index 98842567b..cf6a35271 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -159,20 +159,17 @@ def _scan_for_headers( grid_overrides=grid_overrides, ) - # After applying grid overrides to the template, chunk sizes should match - # If they don't match, it means the template wasn't properly updated + # Update template to match grid_plan results after grid overrides if full_chunk_shape != chunk_size: logger.warning( "Chunk shape mismatch: template has %s but grid_plan returned %s. Using grid_plan chunk shape.", full_chunk_shape, chunk_size, ) - # Update the template's chunk shape to match what grid_plan returned template._var_chunk_shape = chunk_size - # Update template dimensions to match the actual grid dimensions after grid overrides - # The dimensions from grid_plan already reflect grid overrides - actual_spatial_dims = tuple(dim.name for dim in segy_dimensions[:-1]) # All dims except the vertical/time dimension + # Update dimensions if they don't match grid_plan results + actual_spatial_dims = tuple(dim.name for dim in segy_dimensions[:-1]) if template.spatial_dimension_names != actual_spatial_dims: logger.debug( "Adjusting template dimensions from %s to %s to match grid after overrides", From c9978872c2902b55b339b9775ad5a3249d54013b Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 22 Oct 2025 18:54:10 +0000 Subject: [PATCH 07/21] Update logging --- src/mdio/converters/segy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index cf6a35271..696d28a8f 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -161,8 +161,8 @@ def _scan_for_headers( # Update template to match grid_plan results after grid overrides if full_chunk_shape != chunk_size: - logger.warning( - "Chunk shape mismatch: template has %s but grid_plan returned %s. Using grid_plan chunk shape.", + logger.debug( + "Adjusting template chunk shape from %s to %s to match grid after overrides", full_chunk_shape, chunk_size, ) From dfebfc8985a100a679be4472431cb8a2fc0cd2b2 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Wed, 22 Oct 2025 18:58:21 +0000 Subject: [PATCH 08/21] Remove incorrect noqa --- src/mdio/segy/geometry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdio/segy/geometry.py b/src/mdio/segy/geometry.py index 99a4ae9c4..39ed6dcaa 100644 --- a/src/mdio/segy/geometry.py +++ b/src/mdio/segy/geometry.py @@ -382,7 +382,7 @@ def transform( self, index_headers: HeaderArray, grid_overrides: dict[str, bool | int], - template: AbstractDatasetTemplate, # noqa: ARG002 + template: AbstractDatasetTemplate, ) -> NDArray: """Perform the grid transform.""" self.validate(index_headers, grid_overrides) From 92184a91ba5452f8217bf86bf18d2f062f000418 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 23 Oct 2025 13:25:20 +0000 Subject: [PATCH 09/21] Checkpoint --- src/mdio/converters/segy.py | 11 +++++ src/mdio/segy/geometry.py | 43 +++++++++++++++++-- src/mdio/segy/utilities.py | 12 +++++- .../test_import_streamer_grid_overrides.py | 10 +++-- 4 files changed, 66 insertions(+), 10 deletions(-) diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index 696d28a8f..9709ad8c5 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -178,6 +178,17 @@ def _scan_for_headers( ) template._dim_names = actual_spatial_dims + (template.trace_domain,) + # Handle NonBinned: move non-binned dimensions to coordinates + if grid_overrides and "NonBinned" in grid_overrides and "non_binned_dims" in grid_overrides: + non_binned_dims = tuple(grid_overrides["non_binned_dims"]) + if non_binned_dims: + logger.debug( + "NonBinned grid override: moving dimensions %s to coordinates", + non_binned_dims, + ) + # Add non-binned dimensions as logical coordinates + template._logical_coord_names = template._logical_coord_names + non_binned_dims + return segy_dimensions, segy_headers diff --git a/src/mdio/segy/geometry.py b/src/mdio/segy/geometry.py index 39ed6dcaa..c61dc98ad 100644 --- a/src/mdio/segy/geometry.py +++ b/src/mdio/segy/geometry.py @@ -421,19 +421,51 @@ def transform_chunksize( class NonBinned(DuplicateIndex): - """Automatically index traces in a single specified axis - trace.""" + """Handle non-binned dimensions by converting them to a trace dimension with coordinates. + + This override takes dimensions that are not regularly sampled (non-binned) and converts + them into a single 'trace' dimension. The original non-binned dimensions become coordinates + indexed by the trace dimension. + + Example: + Template with dimensions [shot_point, cable, channel, azimuth, offset, sample] + and non_binned_dims=['azimuth', 'offset'] becomes: + - dimensions: [shot_point, cable, channel, trace, sample] + - coordinates: azimuth and offset with dimensions [shot_point, cable, channel, trace] + + Attributes: + required_keys: No required keys for this override. + required_parameters: Set containing 'chunksize' and 'non_binned_dims'. + """ required_keys = None - required_parameters = {"chunksize"} + required_parameters = {"chunksize", "non_binned_dims"} + + def validate(self, index_headers: HeaderArray, grid_overrides: dict[str, bool | int]) -> None: + """Validate if this transform should run on the type of data.""" + self.check_required_params(grid_overrides) + + # Validate that non_binned_dims is a list + non_binned_dims = grid_overrides.get("non_binned_dims", []) + if not isinstance(non_binned_dims, list): + msg = f"non_binned_dims must be a list, got {type(non_binned_dims)}" + raise ValueError(msg) + + # Validate that all non-binned dimensions exist in headers + missing_dims = set(non_binned_dims) - set(index_headers.dtype.names) + if missing_dims: + msg = f"Non-binned dimensions {missing_dims} not found in index headers" + raise ValueError(msg) def transform_chunksize( self, chunksize: Sequence[int], grid_overrides: dict[str, bool | int], ) -> Sequence[int]: - """Perform the transform of chunksize.""" + """Insert chunksize for trace dimension at N-1 position.""" new_chunks = list(chunksize) - new_chunks.insert(-1, grid_overrides["chunksize"]) + trace_chunksize = grid_overrides["chunksize"] + new_chunks.insert(-1, trace_chunksize) return tuple(new_chunks) @@ -544,6 +576,9 @@ def get_allowed_parameters(self) -> set: parameters.update(command.required_parameters) + # Add optional parameters that are not strictly required but are valid + parameters.add("non_binned_dims") + return parameters def run( diff --git a/src/mdio/segy/utilities.py b/src/mdio/segy/utilities.py index 289e7f940..5d48c4c7a 100644 --- a/src/mdio/segy/utilities.py +++ b/src/mdio/segy/utilities.py @@ -73,11 +73,19 @@ def get_grid_plan( # noqa: C901, PLR0913 grid_overrides=grid_overrides, template=template, ) + + # Determine which dimensions are non-binned (converted to coordinates) + non_binned_dims = set() + if "NonBinned" in grid_overrides and "non_binned_dims" in grid_overrides: + non_binned_dims = set(grid_overrides["non_binned_dims"]) + # Use the spatial dimension names from horizontal_coordinates (which may have been modified by grid overrides) - # Extract only the dimension names (not including non-dimension coordinates) + # Extract only the dimension names (not including non-dimension coordinates or non-binned dimensions) # After grid overrides, trace might have been added to horizontal_coordinates transformed_spatial_dims = [ - name for name in horizontal_coordinates if name in horizontal_dimensions or name == "trace" + name + for name in horizontal_coordinates + if (name in horizontal_dimensions or name == "trace") and name not in non_binned_dims ] dimensions = [] diff --git a/tests/integration/test_import_streamer_grid_overrides.py b/tests/integration/test_import_streamer_grid_overrides.py index 56f592e64..5fbffb936 100644 --- a/tests/integration/test_import_streamer_grid_overrides.py +++ b/tests/integration/test_import_streamer_grid_overrides.py @@ -27,9 +27,6 @@ os.environ["MDIO__IMPORT__SAVE_SEGY_FILE_HEADER"] = "true" -# TODO(BrianMichell): Add non-binned back -# https://github.com/TGSAI/mdio-python/issues/612 -# @pytest.mark.parametrize("grid_override", [{"NonBinned": True, "chunksize": 4}, {"HasDuplicates": True}]) @pytest.mark.parametrize("grid_override", [{"HasDuplicates": True}]) @pytest.mark.parametrize("chan_header_type", [StreamerShotGeometryType.C]) class TestImport4DNonReg: @@ -69,13 +66,18 @@ def test_import_4d_segy( # noqa: PLR0913 xrt.assert_duckarray_equal(ds["shot_point"], shots) xrt.assert_duckarray_equal(ds["cable"], cables) - # assert grid.select_dim("trace") == Dimension(range(1, np.amax(receivers_per_cable) + 1), "trace") + # HasDuplicates should create a trace dimension expected = list(range(1, np.amax(receivers_per_cable) + 1)) xrt.assert_duckarray_equal(ds["trace"], expected) times_expected = list(range(0, num_samples, 1)) xrt.assert_duckarray_equal(ds["time"], times_expected) + # HasDuplicates uses chunksize of 1 for trace dimension + trace_chunks = ds["amplitude"].chunksizes.get("trace", None) + if trace_chunks is not None: + assert all(chunk == 1 for chunk in trace_chunks) + @pytest.mark.parametrize("grid_override", [{"AutoChannelWrap": True}, None]) @pytest.mark.parametrize("chan_header_type", [StreamerShotGeometryType.A, StreamerShotGeometryType.B]) From e04678ffb2cd9c2c1da876870653dfa237416098 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 23 Oct 2025 17:41:54 +0000 Subject: [PATCH 10/21] Fully functional demo --- src/mdio/builder/dataset_builder.py | 26 +++++++++++++++--- src/mdio/builder/templates/base.py | 15 +++++++++++ src/mdio/converters/segy.py | 11 +++++--- src/mdio/segy/utilities.py | 27 +++++++++++++++---- .../test_import_streamer_grid_overrides.py | 22 ++++++++++----- tests/unit/test_segy_grid_overrides.py | 10 ++++--- 6 files changed, 89 insertions(+), 22 deletions(-) diff --git a/src/mdio/builder/dataset_builder.py b/src/mdio/builder/dataset_builder.py index 1cc515988..b2560e1d6 100644 --- a/src/mdio/builder/dataset_builder.py +++ b/src/mdio/builder/dataset_builder.py @@ -150,12 +150,32 @@ def add_coordinate( # noqa: PLR0913 msg = "Adding coordinate with the same name twice is not allowed" raise ValueError(msg) - # Validate that all referenced dimensions are already defined + # Resolve referenced dimensions strictly, except allow a single substitution with 'trace' if present. named_dimensions = [] + trace_dim = _get_named_dimension(self._dimensions, "trace") + resolved_dim_names: list[str] = [] + trace_used = False + missing_dims: list[str] = [] for dim_name in dimensions: nd = _get_named_dimension(self._dimensions, dim_name) + if nd is not None: + if dim_name not in resolved_dim_names: + resolved_dim_names.append(dim_name) + continue + if trace_dim is not None and not trace_used and "trace" not in resolved_dim_names: + resolved_dim_names.append("trace") + trace_used = True + else: + missing_dims.append(dim_name) + + if missing_dims: + msg = f"Pre-existing dimension named {missing_dims[0]!r} is not found" + raise ValueError(msg) + + for resolved_name in resolved_dim_names: + nd = _get_named_dimension(self._dimensions, resolved_name) if nd is None: - msg = f"Pre-existing dimension named {dim_name!r} is not found" + msg = f"Pre-existing dimension named {resolved_name!r} is not found" raise ValueError(msg) named_dimensions.append(nd) @@ -174,7 +194,7 @@ def add_coordinate( # noqa: PLR0913 self.add_variable( name=coord.name, long_name=coord.long_name, - dimensions=dimensions, # dimension names (list[str]) + dimensions=tuple(resolved_dim_names), # resolved dimension names data_type=coord.data_type, compressor=compressor, coordinates=[name], # Use the coordinate name as a reference diff --git a/src/mdio/builder/templates/base.py b/src/mdio/builder/templates/base.py index 879f1fd6f..0f976b141 100644 --- a/src/mdio/builder/templates/base.py +++ b/src/mdio/builder/templates/base.py @@ -89,6 +89,21 @@ def build_dataset( self._builder = MDIODatasetBuilder(name=name, attributes=attributes) self._add_dimensions() self._add_coordinates() + # Ensure any coordinates declared on the template but not added by subclass overrides + # are materialized with generic defaults. This keeps templates override-agnostic while + # allowing runtime-augmented coordinate lists to be respected. + for coord_name in self.coordinate_names: + try: + self._builder.add_coordinate( + name=coord_name, + dimensions=self.spatial_dimension_names, + data_type=ScalarType.FLOAT64, + compressor=compressors.Blosc(cname=compressors.BloscCname.zstd), + metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(coord_name)), + ) + except ValueError as exc: # coordinate may already exist from subclass override + if "same name twice" not in str(exc): + raise self._add_variables() self._add_trace_mask() diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index 9709ad8c5..337436402 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -178,16 +178,19 @@ def _scan_for_headers( ) template._dim_names = actual_spatial_dims + (template.trace_domain,) - # Handle NonBinned: move non-binned dimensions to coordinates + # If using NonBinned override, expose non-binned dims as logical coordinates on the template instance if grid_overrides and "NonBinned" in grid_overrides and "non_binned_dims" in grid_overrides: non_binned_dims = tuple(grid_overrides["non_binned_dims"]) if non_binned_dims: logger.debug( - "NonBinned grid override: moving dimensions %s to coordinates", + "NonBinned grid override: exposing non-binned dims as coordinates: %s", non_binned_dims, ) - # Add non-binned dimensions as logical coordinates - template._logical_coord_names = template._logical_coord_names + non_binned_dims + # Append any missing names; keep existing order and avoid duplicates + existing = set(template.coordinate_names) + to_add = tuple(n for n in non_binned_dims if n not in existing) + if to_add: + template._logical_coord_names = template._logical_coord_names + to_add return segy_dimensions, segy_headers diff --git a/src/mdio/segy/utilities.py b/src/mdio/segy/utilities.py index 5d48c4c7a..55c940be2 100644 --- a/src/mdio/segy/utilities.py +++ b/src/mdio/segy/utilities.py @@ -82,11 +82,28 @@ def get_grid_plan( # noqa: C901, PLR0913 # Use the spatial dimension names from horizontal_coordinates (which may have been modified by grid overrides) # Extract only the dimension names (not including non-dimension coordinates or non-binned dimensions) # After grid overrides, trace might have been added to horizontal_coordinates - transformed_spatial_dims = [ - name - for name in horizontal_coordinates - if (name in horizontal_dimensions or name == "trace") and name not in non_binned_dims - ] + # Compute transformed spatial dims: drop non-binned dims, insert trace if present in headers + transformed_spatial_dims = [] + for name in horizontal_coordinates: + if name in non_binned_dims: + continue + if name == "trace" or name in horizontal_dimensions: + transformed_spatial_dims.append(name) + + # Recompute chunksize to match transformed dims + original_spatial_dims = list(template.spatial_dimension_names) + original_chunks = list(template.full_chunk_shape) + new_spatial_chunks: list[int] = [] + # Insert trace chunk size at N-1 when present, otherwise map remaining dims + for dim_name in transformed_spatial_dims: + if dim_name == "trace": + chunk_val = int(grid_overrides.get("chunksize", 1)) if "NonBinned" in grid_overrides else 1 + new_spatial_chunks.append(chunk_val) + continue + if dim_name in original_spatial_dims: + idx = original_spatial_dims.index(dim_name) + new_spatial_chunks.append(original_chunks[idx]) + chunksize = tuple(new_spatial_chunks + [original_chunks[-1]]) dimensions = [] for dim_name in transformed_spatial_dims: diff --git a/tests/integration/test_import_streamer_grid_overrides.py b/tests/integration/test_import_streamer_grid_overrides.py index 5fbffb936..65dc8b097 100644 --- a/tests/integration/test_import_streamer_grid_overrides.py +++ b/tests/integration/test_import_streamer_grid_overrides.py @@ -27,7 +27,8 @@ os.environ["MDIO__IMPORT__SAVE_SEGY_FILE_HEADER"] = "true" -@pytest.mark.parametrize("grid_override", [{"HasDuplicates": True}]) +# @pytest.mark.parametrize("grid_override", [{"NonBinned": True, "chunksize": 4, "non_binned_dims": ["channel"]}, {"HasDuplicates": True}]) +@pytest.mark.parametrize("grid_override", [{"NonBinned": True, "chunksize": 4, "non_binned_dims": ["channel"]}]) @pytest.mark.parametrize("chan_header_type", [StreamerShotGeometryType.C]) class TestImport4DNonReg: """Test for 4D segy import with grid overrides.""" @@ -43,11 +44,14 @@ def test_import_4d_segy( # noqa: PLR0913 segy_spec: SegySpec = get_segy_mock_4d_spec() segy_path = segy_mock_4d_shots[chan_header_type] + path = "tmp.mdio" + print(f"Running test with grid override: {grid_override}") + segy_to_mdio( segy_spec=segy_spec, mdio_template=TemplateRegistry().get("PreStackShotGathers3DTime"), input_path=segy_path, - output_path=zarr_tmp, + output_path=path, overwrite=True, grid_overrides=grid_override, ) @@ -58,7 +62,7 @@ def test_import_4d_segy( # noqa: PLR0913 cables = [0, 101, 201, 301] receivers_per_cable = [1, 5, 7, 5] - ds = open_mdio(zarr_tmp) + ds = open_mdio(path) assert ds["segy_file_header"].attrs["binaryHeader"]["samples_per_trace"] == num_samples assert ds.attrs["attributes"]["gridOverrides"] == grid_override @@ -66,17 +70,23 @@ def test_import_4d_segy( # noqa: PLR0913 xrt.assert_duckarray_equal(ds["shot_point"], shots) xrt.assert_duckarray_equal(ds["cable"], cables) - # HasDuplicates should create a trace dimension + # Both HasDuplicates and NonBinned should create a trace dimension expected = list(range(1, np.amax(receivers_per_cable) + 1)) xrt.assert_duckarray_equal(ds["trace"], expected) times_expected = list(range(0, num_samples, 1)) xrt.assert_duckarray_equal(ds["time"], times_expected) - # HasDuplicates uses chunksize of 1 for trace dimension + # Check trace chunk size based on grid override trace_chunks = ds["amplitude"].chunksizes.get("trace", None) if trace_chunks is not None: - assert all(chunk == 1 for chunk in trace_chunks) + if "NonBinned" in grid_override: + # NonBinned uses specified chunksize for trace dimension + expected_chunksize = grid_override.get("chunksize", 1) + assert all(chunk == expected_chunksize for chunk in trace_chunks) + else: + # HasDuplicates uses chunksize of 1 for trace dimension + assert all(chunk == 1 for chunk in trace_chunks) @pytest.mark.parametrize("grid_override", [{"AutoChannelWrap": True}, None]) diff --git a/tests/unit/test_segy_grid_overrides.py b/tests/unit/test_segy_grid_overrides.py index bebf6be80..250d6a494 100644 --- a/tests/unit/test_segy_grid_overrides.py +++ b/tests/unit/test_segy_grid_overrides.py @@ -103,10 +103,10 @@ def test_duplicates(self, mock_streamer_headers: dict[str, npt.NDArray]) -> None def test_non_binned(self, mock_streamer_headers: dict[str, npt.NDArray]) -> None: """Test the NonBinned Grid Override command.""" index_names = ("shot_point", "cable") - grid_overrides = {"NonBinned": True, "chunksize": 4} + grid_overrides = {"NonBinned": True, "chunksize": 4, "non_binned_dims": ["channel"]} - # Remove channel header - streamer_headers = mock_streamer_headers[list(index_names)] + # Keep channel header for non-binned processing + streamer_headers = mock_streamer_headers chunksize = (4, 4, 8) new_headers, new_names, new_chunks = run_override( @@ -123,7 +123,9 @@ def test_non_binned(self, mock_streamer_headers: dict[str, npt.NDArray]) -> None assert_array_equal(dims[0].coords, SHOTS) assert_array_equal(dims[1].coords, CABLES) - assert_array_equal(dims[2].coords, RECEIVERS) + # Trace coords are the unique channel values (1-20) + expected_trace_coords = np.arange(1, 21, dtype="int32") + assert_array_equal(dims[2].coords, expected_trace_coords) class TestStreamerGridOverrides: From 021297fca2bff23cce61f4b164ecd70b0b3ac08d Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 23 Oct 2025 17:43:51 +0000 Subject: [PATCH 11/21] pre-commit --- src/mdio/builder/templates/base.py | 3 +++ tests/integration/test_import_streamer_grid_overrides.py | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/mdio/builder/templates/base.py b/src/mdio/builder/templates/base.py index 0f976b141..708ab09be 100644 --- a/src/mdio/builder/templates/base.py +++ b/src/mdio/builder/templates/base.py @@ -81,6 +81,9 @@ def build_dataset( Returns: Dataset: The constructed dataset + + Raises: + ValueError: If coordinate already exists from subclass override. """ self._dim_sizes = sizes diff --git a/tests/integration/test_import_streamer_grid_overrides.py b/tests/integration/test_import_streamer_grid_overrides.py index 65dc8b097..c243b8f53 100644 --- a/tests/integration/test_import_streamer_grid_overrides.py +++ b/tests/integration/test_import_streamer_grid_overrides.py @@ -27,7 +27,8 @@ os.environ["MDIO__IMPORT__SAVE_SEGY_FILE_HEADER"] = "true" -# @pytest.mark.parametrize("grid_override", [{"NonBinned": True, "chunksize": 4, "non_binned_dims": ["channel"]}, {"HasDuplicates": True}]) +# @pytest.mark.parametrize("grid_override", [{"NonBinned": True, "chunksize": 4, "non_binned_dims": ["channel"]}, +# {"HasDuplicates": True}]) @pytest.mark.parametrize("grid_override", [{"NonBinned": True, "chunksize": 4, "non_binned_dims": ["channel"]}]) @pytest.mark.parametrize("chan_header_type", [StreamerShotGeometryType.C]) class TestImport4DNonReg: @@ -44,14 +45,13 @@ def test_import_4d_segy( # noqa: PLR0913 segy_spec: SegySpec = get_segy_mock_4d_spec() segy_path = segy_mock_4d_shots[chan_header_type] - path = "tmp.mdio" print(f"Running test with grid override: {grid_override}") segy_to_mdio( segy_spec=segy_spec, mdio_template=TemplateRegistry().get("PreStackShotGathers3DTime"), input_path=segy_path, - output_path=path, + output_path=zarr_tmp, overwrite=True, grid_overrides=grid_override, ) @@ -62,7 +62,7 @@ def test_import_4d_segy( # noqa: PLR0913 cables = [0, 101, 201, 301] receivers_per_cable = [1, 5, 7, 5] - ds = open_mdio(path) + ds = open_mdio(zarr_tmp) assert ds["segy_file_header"].attrs["binaryHeader"]["samples_per_trace"] == num_samples assert ds.attrs["attributes"]["gridOverrides"] == grid_override From 4f0af8db75431c77a8fec8270e6fca2678167645 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 23 Oct 2025 17:47:08 +0000 Subject: [PATCH 12/21] Revert debugging changes --- tests/integration/test_import_streamer_grid_overrides.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tests/integration/test_import_streamer_grid_overrides.py b/tests/integration/test_import_streamer_grid_overrides.py index c243b8f53..6825e6ac0 100644 --- a/tests/integration/test_import_streamer_grid_overrides.py +++ b/tests/integration/test_import_streamer_grid_overrides.py @@ -27,9 +27,8 @@ os.environ["MDIO__IMPORT__SAVE_SEGY_FILE_HEADER"] = "true" -# @pytest.mark.parametrize("grid_override", [{"NonBinned": True, "chunksize": 4, "non_binned_dims": ["channel"]}, -# {"HasDuplicates": True}]) -@pytest.mark.parametrize("grid_override", [{"NonBinned": True, "chunksize": 4, "non_binned_dims": ["channel"]}]) +@pytest.mark.parametrize("grid_override", [{"NonBinned": True, "chunksize": 4, "non_binned_dims": ["channel"]}, + {"HasDuplicates": True}]) @pytest.mark.parametrize("chan_header_type", [StreamerShotGeometryType.C]) class TestImport4DNonReg: """Test for 4D segy import with grid overrides.""" @@ -45,8 +44,6 @@ def test_import_4d_segy( # noqa: PLR0913 segy_spec: SegySpec = get_segy_mock_4d_spec() segy_path = segy_mock_4d_shots[chan_header_type] - print(f"Running test with grid override: {grid_override}") - segy_to_mdio( segy_spec=segy_spec, mdio_template=TemplateRegistry().get("PreStackShotGathers3DTime"), From 16a205204ffc7254a88434fd2a4d0f15d1ae9cb4 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 23 Oct 2025 18:00:48 +0000 Subject: [PATCH 13/21] pre-commit --- tests/integration/test_import_streamer_grid_overrides.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_import_streamer_grid_overrides.py b/tests/integration/test_import_streamer_grid_overrides.py index 6825e6ac0..4a6337555 100644 --- a/tests/integration/test_import_streamer_grid_overrides.py +++ b/tests/integration/test_import_streamer_grid_overrides.py @@ -27,8 +27,9 @@ os.environ["MDIO__IMPORT__SAVE_SEGY_FILE_HEADER"] = "true" -@pytest.mark.parametrize("grid_override", [{"NonBinned": True, "chunksize": 4, "non_binned_dims": ["channel"]}, - {"HasDuplicates": True}]) +@pytest.mark.parametrize( + "grid_override", [{"NonBinned": True, "chunksize": 4, "non_binned_dims": ["channel"]}, {"HasDuplicates": True}] +) @pytest.mark.parametrize("chan_header_type", [StreamerShotGeometryType.C]) class TestImport4DNonReg: """Test for 4D segy import with grid overrides.""" From 6a17b16d766b6ded86fb65dd40f479348203b570 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 23 Oct 2025 18:19:20 +0000 Subject: [PATCH 14/21] Simplifications to DuplicateIndex and GridOverrides --- src/mdio/segy/utilities.py | 45 ++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/src/mdio/segy/utilities.py b/src/mdio/segy/utilities.py index 55c940be2..be585bb6b 100644 --- a/src/mdio/segy/utilities.py +++ b/src/mdio/segy/utilities.py @@ -74,39 +74,36 @@ def get_grid_plan( # noqa: C901, PLR0913 template=template, ) - # Determine which dimensions are non-binned (converted to coordinates) + # After grid overrides, determine final spatial dimensions and their chunk sizes non_binned_dims = set() if "NonBinned" in grid_overrides and "non_binned_dims" in grid_overrides: non_binned_dims = set(grid_overrides["non_binned_dims"]) - # Use the spatial dimension names from horizontal_coordinates (which may have been modified by grid overrides) - # Extract only the dimension names (not including non-dimension coordinates or non-binned dimensions) - # After grid overrides, trace might have been added to horizontal_coordinates - # Compute transformed spatial dims: drop non-binned dims, insert trace if present in headers - transformed_spatial_dims = [] + # Create mapping from dimension name to original chunk size for easy lookup + original_spatial_dims = list(template.spatial_dimension_names) + original_chunks = list(template.full_chunk_shape[:-1]) # Exclude vertical (sample/time) dimension + dim_to_chunk = dict(zip(original_spatial_dims, original_chunks, strict=True)) + + # Final spatial dimensions: keep trace and original dims, exclude non-binned dims + final_spatial_dims = [] + final_spatial_chunks = [] for name in horizontal_coordinates: if name in non_binned_dims: - continue - if name == "trace" or name in horizontal_dimensions: - transformed_spatial_dims.append(name) - - # Recompute chunksize to match transformed dims - original_spatial_dims = list(template.spatial_dimension_names) - original_chunks = list(template.full_chunk_shape) - new_spatial_chunks: list[int] = [] - # Insert trace chunk size at N-1 when present, otherwise map remaining dims - for dim_name in transformed_spatial_dims: - if dim_name == "trace": + continue # Skip dimensions that became coordinates + if name == "trace": + # Special handling for trace dimension chunk_val = int(grid_overrides.get("chunksize", 1)) if "NonBinned" in grid_overrides else 1 - new_spatial_chunks.append(chunk_val) - continue - if dim_name in original_spatial_dims: - idx = original_spatial_dims.index(dim_name) - new_spatial_chunks.append(original_chunks[idx]) - chunksize = tuple(new_spatial_chunks + [original_chunks[-1]]) + final_spatial_dims.append(name) + final_spatial_chunks.append(chunk_val) + elif name in dim_to_chunk: + # Use original chunk size for known dimensions + final_spatial_dims.append(name) + final_spatial_chunks.append(dim_to_chunk[name]) + + chunksize = tuple(final_spatial_chunks + [template.full_chunk_shape[-1]]) dimensions = [] - for dim_name in transformed_spatial_dims: + for dim_name in final_spatial_dims: if dim_name not in headers_subset.dtype.names: continue dim_unique = np.unique(headers_subset[dim_name]) From e71d379176be8155a60343404623bdfa9dca8980 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 23 Oct 2025 18:27:15 +0000 Subject: [PATCH 15/21] Add base safety for grid override template mutation --- src/mdio/builder/templates/base.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/mdio/builder/templates/base.py b/src/mdio/builder/templates/base.py index 708ab09be..92d8da1c9 100644 --- a/src/mdio/builder/templates/base.py +++ b/src/mdio/builder/templates/base.py @@ -92,9 +92,8 @@ def build_dataset( self._builder = MDIODatasetBuilder(name=name, attributes=attributes) self._add_dimensions() self._add_coordinates() - # Ensure any coordinates declared on the template but not added by subclass overrides - # are materialized with generic defaults. This keeps templates override-agnostic while - # allowing runtime-augmented coordinate lists to be respected. + # Ensure any coordinates declared on the template but not added by _add_coordinates + # are materialized with generic defaults. This handles coordinates added by grid overrides. for coord_name in self.coordinate_names: try: self._builder.add_coordinate( @@ -104,7 +103,7 @@ def build_dataset( compressor=compressors.Blosc(cname=compressors.BloscCname.zstd), metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(coord_name)), ) - except ValueError as exc: # coordinate may already exist from subclass override + except ValueError as exc: # coordinate may already exist if "same name twice" not in str(exc): raise self._add_variables() @@ -253,14 +252,21 @@ def _add_coordinates(self) -> None: ) # Add non-dimension coordinates + # Note: coordinate_names may be modified at runtime by grid overrides, + # so we need to handle dynamic additions gracefully for name in self.coordinate_names: - self._builder.add_coordinate( - name=name, - dimensions=self.spatial_dimension_names, - data_type=ScalarType.FLOAT64, - compressor=compressors.Blosc(cname=compressors.BloscCname.zstd), - metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(name)), - ) + try: + self._builder.add_coordinate( + name=name, + dimensions=self.spatial_dimension_names, + data_type=ScalarType.FLOAT64, + compressor=compressors.Blosc(cname=compressors.BloscCname.zstd), + metadata=CoordinateMetadata(units_v1=self.get_unit_by_key(name)), + ) + except ValueError as exc: + # Coordinate may already exist from subclass override + if "same name twice" not in str(exc): + raise def _add_trace_mask(self) -> None: """Add trace mask variables.""" From 076cf60d3b61377d53ccaa56aa6126ae469b36c7 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Thu, 23 Oct 2025 18:41:19 +0000 Subject: [PATCH 16/21] Extract template update logic --- src/mdio/converters/segy.py | 72 ++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 21 deletions(-) diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index 337436402..71b888514 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -134,31 +134,27 @@ def grid_density_qc(grid: Grid, num_traces: int) -> None: raise GridTraceSparsityError(grid.shape, num_traces, msg) -def _scan_for_headers( - segy_file_kwargs: SegyFileArguments, - segy_file_info: SegyFileInfo, +def _update_template_from_grid_overrides( template: AbstractDatasetTemplate, - grid_overrides: dict[str, Any] | None = None, -) -> tuple[list[Dimension], SegyHeaderArray]: - """Extract trace dimensions and index headers from the SEG-Y file. + grid_overrides: dict[str, Any] | None, + segy_dimensions: list[Dimension], + full_chunk_shape: tuple[int, ...], + chunk_size: tuple[int, ...], +) -> None: + """Update template attributes to match grid plan results after grid overrides. - This is an expensive operation. - It scans the SEG-Y file in chunks by using ProcessPoolExecutor. + This function modifies the template in-place to reflect changes from grid overrides: + - Updates chunk shape if it changed due to overrides + - Updates dimension names if they changed due to overrides + - Adds non-binned dimensions as coordinates for NonBinned override - Note: - If grid_overrides are applied to the template before calling this function, - the chunk_size returned from get_grid_plan should match the template's chunk shape. + Args: + template: The template to update + grid_overrides: Grid override configuration + segy_dimensions: Dimensions returned from grid planning + full_chunk_shape: Original template chunk shape + chunk_size: Chunk size returned from grid planning """ - full_chunk_shape = template.full_chunk_shape - segy_dimensions, chunk_size, segy_headers = get_grid_plan( - segy_file_kwargs=segy_file_kwargs, - segy_file_info=segy_file_info, - return_headers=True, - template=template, - chunksize=full_chunk_shape, - grid_overrides=grid_overrides, - ) - # Update template to match grid_plan results after grid overrides if full_chunk_shape != chunk_size: logger.debug( @@ -192,6 +188,40 @@ def _scan_for_headers( if to_add: template._logical_coord_names = template._logical_coord_names + to_add + +def _scan_for_headers( + segy_file_kwargs: SegyFileArguments, + segy_file_info: SegyFileInfo, + template: AbstractDatasetTemplate, + grid_overrides: dict[str, Any] | None = None, +) -> tuple[list[Dimension], SegyHeaderArray]: + """Extract trace dimensions and index headers from the SEG-Y file. + + This is an expensive operation. + It scans the SEG-Y file in chunks by using ProcessPoolExecutor. + + Note: + If grid_overrides are applied to the template before calling this function, + the chunk_size returned from get_grid_plan should match the template's chunk shape. + """ + full_chunk_shape = template.full_chunk_shape + segy_dimensions, chunk_size, segy_headers = get_grid_plan( + segy_file_kwargs=segy_file_kwargs, + segy_file_info=segy_file_info, + return_headers=True, + template=template, + chunksize=full_chunk_shape, + grid_overrides=grid_overrides, + ) + + _update_template_from_grid_overrides( + template=template, + grid_overrides=grid_overrides, + segy_dimensions=segy_dimensions, + full_chunk_shape=full_chunk_shape, + chunk_size=chunk_size, + ) + return segy_dimensions, segy_headers From 32850bf8109c7b5eaefac006672cbd207d2d71ea Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 11 Nov 2025 16:38:49 +0000 Subject: [PATCH 17/21] Resolve nondeterminstic ordering causing incorrect template mutations --- src/mdio/segy/utilities.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mdio/segy/utilities.py b/src/mdio/segy/utilities.py index df40cfdd7..3e9283a95 100644 --- a/src/mdio/segy/utilities.py +++ b/src/mdio/segy/utilities.py @@ -64,9 +64,9 @@ def get_grid_plan( # noqa: C901, PLR0913 # Exclude calculated dimensions - they don't exist in SEG-Y headers calculated_dims = set(template.calculated_dimension_names) - # Remove any to be computed fields + # Remove any to be computed fields - preserve order by using list comprehension instead of set operations computed_fields = set(template.calculated_dimension_names) - horizontal_coordinates = tuple(set(horizontal_coordinates) - computed_fields) + horizontal_coordinates = tuple(c for c in horizontal_coordinates if c not in computed_fields) headers_subset = parse_headers( segy_file_kwargs=segy_file_kwargs, From de4d8b4200740d5231a49e5dbc7fc3f28714c20e Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 11 Nov 2025 17:02:59 +0000 Subject: [PATCH 18/21] Bandaid fix to join non-binned and autoshotwrap overrides -- Should be refactored for a uniform experience --- src/mdio/converters/segy.py | 11 ++++++++++- src/mdio/segy/utilities.py | 32 +++++++++++++++++++++++++++++--- 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index 57e9e5859..68aaa13eb 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -156,6 +156,16 @@ def _update_template_from_grid_overrides( chunk_size: Chunk size returned from grid planning """ # Update template to match grid_plan results after grid overrides + # Extract actual spatial dimensions from segy_dimensions (excluding vertical dimension) + actual_spatial_dims = tuple(dim.name for dim in segy_dimensions[:-1]) + + # Align chunk_size with actual dimensions - truncate if dimensions were filtered out + num_actual_spatial = len(actual_spatial_dims) + num_chunk_spatial = len(chunk_size) - 1 # Exclude vertical dimension chunk + if num_actual_spatial != num_chunk_spatial: + # Truncate chunk_size to match actual dimensions + chunk_size = chunk_size[:num_actual_spatial] + (chunk_size[-1],) + if full_chunk_shape != chunk_size: logger.debug( "Adjusting template chunk shape from %s to %s to match grid after overrides", @@ -165,7 +175,6 @@ def _update_template_from_grid_overrides( template._var_chunk_shape = chunk_size # Update dimensions if they don't match grid_plan results - actual_spatial_dims = tuple(dim.name for dim in segy_dimensions[:-1]) if template.spatial_dimension_names != actual_spatial_dims: logger.debug( "Adjusting template dimensions from %s to %s to match grid after overrides", diff --git a/src/mdio/segy/utilities.py b/src/mdio/segy/utilities.py index 3e9283a95..461bc96c4 100644 --- a/src/mdio/segy/utilities.py +++ b/src/mdio/segy/utilities.py @@ -110,8 +110,6 @@ def get_grid_plan( # noqa: C901, PLR0913 final_spatial_dims.append(name) final_spatial_chunks.append(dim_to_chunk[name]) - chunksize = tuple(final_spatial_chunks + [template.full_chunk_shape[-1]]) - if len(computed_fields) > 0 and not computed_fields.issubset(headers_subset.dtype.names): err = ( f"Required computed fields {sorted(computed_fields)} for template {template.name} " @@ -119,8 +117,36 @@ def get_grid_plan( # noqa: C901, PLR0913 ) raise ValueError(err) + # Create dimensions from final_spatial_dims plus any computed fields that were added by grid overrides + all_dimension_names = list(final_spatial_dims) + added_computed_fields = [] + for computed_field in computed_fields: + if computed_field in headers_subset.dtype.names and computed_field not in all_dimension_names: + # Insert in template order + if computed_field in template.spatial_dimension_names: + insert_idx = template.spatial_dimension_names.index(computed_field) + # Find position in all_dimension_names that corresponds to this template position + actual_idx = min(insert_idx, len(all_dimension_names)) + all_dimension_names.insert(actual_idx, computed_field) + # Track where we inserted and what chunk size it should have + template_chunk_idx = template.spatial_dimension_names.index(computed_field) + chunk_val = template.full_chunk_shape[template_chunk_idx] + added_computed_fields.append((actual_idx, chunk_val)) + else: + all_dimension_names.append(computed_field) + added_computed_fields.append((len(all_dimension_names) - 1, 1)) + + # Build chunksize including chunks for computed fields + if added_computed_fields: + chunk_list = list(final_spatial_chunks) + for insert_idx, chunk_val in sorted(added_computed_fields, reverse=True): + chunk_list.insert(insert_idx, chunk_val) + chunksize = tuple(chunk_list + [template.full_chunk_shape[-1]]) + else: + chunksize = tuple(final_spatial_chunks + [template.full_chunk_shape[-1]]) + dimensions = [] - for dim_name in final_spatial_dims: + for dim_name in all_dimension_names: if dim_name not in headers_subset.dtype.names: continue dim_unique = np.unique(headers_subset[dim_name]) From 0ee887796e6eb75f0720631baf5062f5f3f90e5c Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Tue, 11 Nov 2025 17:11:06 +0000 Subject: [PATCH 19/21] pre-commit --- src/mdio/converters/segy.py | 4 ++-- src/mdio/segy/utilities.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index 68aaa13eb..ec8d01b7c 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -158,14 +158,14 @@ def _update_template_from_grid_overrides( # Update template to match grid_plan results after grid overrides # Extract actual spatial dimensions from segy_dimensions (excluding vertical dimension) actual_spatial_dims = tuple(dim.name for dim in segy_dimensions[:-1]) - + # Align chunk_size with actual dimensions - truncate if dimensions were filtered out num_actual_spatial = len(actual_spatial_dims) num_chunk_spatial = len(chunk_size) - 1 # Exclude vertical dimension chunk if num_actual_spatial != num_chunk_spatial: # Truncate chunk_size to match actual dimensions chunk_size = chunk_size[:num_actual_spatial] + (chunk_size[-1],) - + if full_chunk_shape != chunk_size: logger.debug( "Adjusting template chunk shape from %s to %s to match grid after overrides", diff --git a/src/mdio/segy/utilities.py b/src/mdio/segy/utilities.py index 461bc96c4..48e2e50f7 100644 --- a/src/mdio/segy/utilities.py +++ b/src/mdio/segy/utilities.py @@ -25,7 +25,7 @@ logger = logging.getLogger(__name__) -def get_grid_plan( # noqa: C901, PLR0913 +def get_grid_plan( # noqa: C901, PLR0912, PLR0913, PLR0915 segy_file_kwargs: SegyFileArguments, segy_file_info: SegyFileInfo, chunksize: tuple[int, ...] | None, From c099fe01bab09a3c11e744c10bddf5610fe754e8 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 8 Dec 2025 21:22:14 +0000 Subject: [PATCH 20/21] Fix memory consumption on grid calculation and not appropriately calculating trace dimension --- src/mdio/segy/geometry.py | 12 +++++++++++- src/mdio/segy/utilities.py | 6 ++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/mdio/segy/geometry.py b/src/mdio/segy/geometry.py index 0fab49e97..f0e8a1d35 100644 --- a/src/mdio/segy/geometry.py +++ b/src/mdio/segy/geometry.py @@ -389,7 +389,17 @@ def transform( # Filter out coordinate fields, keep only dimensions for trace indexing coord_fields = set(template.coordinate_names) if template else set() - dim_fields = [name for name in index_headers.dtype.names if name != "trace" and name not in coord_fields] + + # For NonBinned: non_binned_dims should be excluded from trace indexing grouping + # because they become coordinates indexed by the trace dimension, not grouping keys. + # The trace index should count all traces per remaining dimension combination. + non_binned_dims = set(grid_overrides.get("non_binned_dims", [])) if grid_overrides else set() + + dim_fields = [ + name + for name in index_headers.dtype.names + if name != "trace" and name not in coord_fields and name not in non_binned_dims + ] # Create trace indices on dimension fields only dim_headers = index_headers[dim_fields] if dim_fields else index_headers diff --git a/src/mdio/segy/utilities.py b/src/mdio/segy/utilities.py index 48e2e50f7..d1e42416a 100644 --- a/src/mdio/segy/utilities.py +++ b/src/mdio/segy/utilities.py @@ -68,6 +68,12 @@ def get_grid_plan( # noqa: C901, PLR0912, PLR0913, PLR0915 computed_fields = set(template.calculated_dimension_names) horizontal_coordinates = tuple(c for c in horizontal_coordinates if c not in computed_fields) + # Ensure non_binned_dims are included in the headers to parse, even if not in template + if grid_overrides and "non_binned_dims" in grid_overrides: + for dim in grid_overrides["non_binned_dims"]: + if dim not in horizontal_coordinates: + horizontal_coordinates = horizontal_coordinates + (dim,) + headers_subset = parse_headers( segy_file_kwargs=segy_file_kwargs, num_traces=segy_file_info.num_traces, From 51600e08e5016309bc895ff3392ac902bb56e4e4 Mon Sep 17 00:00:00 2001 From: BrianMichell Date: Mon, 15 Dec 2025 19:23:59 +0000 Subject: [PATCH 21/21] Apply proper dimensions to non-binned coordiante arrays -- Fix large memory spike between header scan and trace ingestion --- src/mdio/converters/segy.py | 122 ++++++++++++++++++++++++++++++++++-- 1 file changed, 116 insertions(+), 6 deletions(-) diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index ec8d01b7c..5937ce264 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -134,6 +134,75 @@ def grid_density_qc(grid: Grid, num_traces: int) -> None: raise GridTraceSparsityError(grid.shape, num_traces, msg) +def _patch_add_coordinates_for_non_binned( + template: AbstractDatasetTemplate, + non_binned_dims: set[str], +) -> None: + """Patch template's _add_coordinates to skip adding non-binned dims as dimension coordinates. + + When NonBinned override is used, dimensions like 'offset' or 'azimuth' become coordinates + instead of dimensions. However, template subclasses may still try to add them as 1D + dimension coordinates (e.g., with dimensions=("offset",)). Since 'offset' is no longer + a dimension, the builder substitutes 'trace', resulting in wrong coordinate dimensions. + + This function patches the template's _add_coordinates method to intercept calls to + builder.add_coordinate and skip adding coordinates that are non-binned dims with + single-element dimension tuples. These coordinates will be added later by build_dataset + with the correct spatial_dimension_names (e.g., (inline, crossline, trace)). + + Args: + template: The template to patch + non_binned_dims: Set of dimension names that became coordinates due to NonBinned override + """ + # Check if already patched to avoid duplicate patching + if hasattr(template, "_mdio_non_binned_patched"): + return + + # Store the original _add_coordinates method + original_add_coordinates = template._add_coordinates + + def patched_add_coordinates() -> None: + """Wrapper that intercepts builder.add_coordinate calls for non-binned dims.""" + # Store the original add_coordinate method from the builder + original_builder_add_coordinate = template._builder.add_coordinate + + def filtered_add_coordinate( # noqa: ANN202 + name: str, + *, + dimensions: tuple[str, ...], + **kwargs, # noqa: ANN003 + ): + """Skip adding non-binned dims as 1D dimension coordinates.""" + # Check if this is a non-binned dim being added as a 1D dimension coordinate + # (i.e., the coordinate name matches a non-binned dim and has only 1 dimension) + if name in non_binned_dims and len(dimensions) == 1: + logger.debug( + "Skipping 1D coordinate '%s' with dims %s - will be added with full spatial dims", + name, + dimensions, + ) + return template._builder # Return builder for chaining, but don't add + + # Otherwise, call the original method + return original_builder_add_coordinate(name, dimensions=dimensions, **kwargs) + + # Temporarily replace builder's add_coordinate + template._builder.add_coordinate = filtered_add_coordinate + + try: + # Call the original _add_coordinates + original_add_coordinates() + finally: + # Restore the original add_coordinate method + template._builder.add_coordinate = original_builder_add_coordinate + + # Replace the template's _add_coordinates method + template._add_coordinates = patched_add_coordinates + + # Mark as patched to prevent duplicate patching + template._mdio_non_binned_patched = True + + def _update_template_from_grid_overrides( template: AbstractDatasetTemplate, grid_overrides: dict[str, Any] | None, @@ -147,6 +216,7 @@ def _update_template_from_grid_overrides( - Updates chunk shape if it changed due to overrides - Updates dimension names if they changed due to overrides - Adds non-binned dimensions as coordinates for NonBinned override + - Patches _add_coordinates to skip adding non-binned dims as dimension coordinates Args: template: The template to update @@ -184,6 +254,7 @@ def _update_template_from_grid_overrides( template._dim_names = actual_spatial_dims + (template.trace_domain,) # If using NonBinned override, expose non-binned dims as logical coordinates on the template instance + # and patch _add_coordinates to skip adding them as 1D dimension coordinates if grid_overrides and "NonBinned" in grid_overrides and "non_binned_dims" in grid_overrides: non_binned_dims = tuple(grid_overrides["non_binned_dims"]) if non_binned_dims: @@ -197,6 +268,11 @@ def _update_template_from_grid_overrides( if to_add: template._logical_coord_names = template._logical_coord_names + to_add + # Patch _add_coordinates to skip adding non-binned dims as 1D dimension coordinates + # This prevents them from being added with wrong dimensions (e.g., just "trace") + # They will be added later by build_dataset with full spatial_dimension_names + _patch_add_coordinates_for_non_binned(template, set(non_binned_dims)) + def _scan_for_headers( segy_file_kwargs: SegyFileArguments, @@ -303,7 +379,8 @@ def _get_coordinates( if coord_name not in segy_headers.dtype.names: err = f"Coordinate '{coord_name}' not found in SEG-Y dimensions." raise ValueError(err) - non_dim_coords[coord_name] = segy_headers[coord_name] + # Copy the data to allow segy_headers to be garbage collected + non_dim_coords[coord_name] = np.array(segy_headers[coord_name]) return dimensions_coords, non_dim_coords @@ -325,25 +402,54 @@ def populate_non_dim_coordinates( drop_vars_delayed: list[str], spatial_coordinate_scalar: int, ) -> tuple[xr_Dataset, list[str]]: - """Populate the xarray dataset with coordinate variables.""" + """Populate the xarray dataset with coordinate variables. + + Memory optimization: Processes coordinates one at a time and explicitly + releases intermediate arrays to reduce peak memory usage. + """ non_data_domain_dims = grid.dim_names[:-1] # minus the data domain dimension - for coord_name, coord_values in coordinates.items(): + + # Process coordinates one at a time to minimize peak memory + coord_names = list(coordinates.keys()) + for coord_name in coord_names: + coord_values = coordinates.pop(coord_name) # Remove from dict to free memory da_coord = dataset[coord_name] - tmp_coord_values = dataset[coord_name].values + # Get coordinate shape from dataset (uses dask shape, no memory allocation) + coord_shape = da_coord.shape + + # Create output array with fill value + fill_value = da_coord.encoding.get("_FillValue") or da_coord.encoding.get("fill_value") + if fill_value is None: + fill_value = np.nan + tmp_coord_values = np.full(coord_shape, fill_value, dtype=da_coord.dtype) + + # Compute slices for this coordinate's dimensions coord_axes = tuple(non_data_domain_dims.index(coord_dim) for coord_dim in da_coord.dims) coord_slices = tuple(slice(None) if idx in coord_axes else 0 for idx in range(len(non_data_domain_dims))) - coord_trace_indices = grid.map[coord_slices] + # Read only the required slice from grid map + coord_trace_indices = np.asarray(grid.map[coord_slices]) + + # Find valid (non-null) indices not_null = coord_trace_indices != grid.map.fill_value - tmp_coord_values[not_null] = coord_values[coord_trace_indices[not_null]] + # Populate values efficiently + if not_null.any(): + valid_indices = coord_trace_indices[not_null] + tmp_coord_values[not_null] = coord_values[valid_indices] + + # Apply scalar if needed if coord_name in SCALE_COORDINATE_KEYS: tmp_coord_values = _apply_coordinate_scalar(tmp_coord_values, spatial_coordinate_scalar) + # Assign to dataset dataset[coord_name][:] = tmp_coord_values drop_vars_delayed.append(coord_name) + # Explicitly release intermediate arrays + del tmp_coord_values, coord_trace_indices, not_null, coord_values + # TODO(Altay): Add verification of reduced coordinates being the same as the first # https://github.com/TGSAI/mdio-python/issues/645 @@ -624,6 +730,10 @@ def segy_to_mdio( # noqa PLR0913 grid = _build_and_check_grid(segy_dimensions, segy_file_info, segy_headers) _, non_dim_coords = _get_coordinates(grid, segy_headers, mdio_template) + + # Explicitly delete segy_headers to free memory - coordinate values have been copied + del segy_headers + header_dtype = to_structured_type(segy_spec.trace.header.dtype) if settings.raw_headers: