From 40ed5e0e08e3c47c4fffc1cbe9e6474ec83b6af5 Mon Sep 17 00:00:00 2001
From: Mark Roberts <26778185+markspec@users.noreply.github.com>
Date: Sun, 10 Mar 2024 13:49:22 -0500
Subject: [PATCH] AutoShotWrap grid override (#338)

* Added AutoShopWrap grid override.  This allows ingestion with gun indexed by unwrapping shot_point for shot_lines.

* Add documentation to cover segy_to_mdio AutoShotWrap grid override.

* Resolve merge conflicts.

* Update Dockerfile to align with nox pipeline checks.

* Merge poetry.lock.

* Fix ingestion tests.

* Fix issues with tests.

* Switch to poetry.lock from main to resolve conflict.

* Add some more explainination on ShotGumGeometryType..

* Linting updates.

* Remove unused code.

* Update .devcontainer/Dockerfile

* Update tests/integration/conftest.py

* update long text syntax

* Fix linting.

---------

Co-authored-by: Altay Sansal <tasansal@users.noreply.github.com>
---
 .devcontainer/Dockerfile                     |  10 +-
 src/mdio/converters/segy.py                  |  29 ++++-
 src/mdio/segy/geometry.py                    | 126 ++++++++++++++++++-
 tests/integration/conftest.py                |  17 ++-
 tests/integration/test_segy_import_export.py |  90 ++++++++++++-
 5 files changed, 261 insertions(+), 11 deletions(-)

diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index 3dd31294..5deb8385 100755
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -1,16 +1,16 @@
-ARG PYTHON_VERSION=3.11
+ARG PYTHON_VERSION=3.12
 ARG LINUX_DISTRO=bookworm
 
 FROM mcr.microsoft.com/devcontainers/python:1-${PYTHON_VERSION}-${LINUX_DISTRO}
 
 # Install git for nox pre-commit
 RUN apt-get update \
-    && apt-get install -y --no-install-recommends  \
-    git \
-    && rm -rf /var/lib/apt/lists/*
+  && apt-get install -y --no-install-recommends  \
+  git \
+  && rm -rf /var/lib/apt/lists/*
 
 # Poetry
-ARG POETRY_VERSION="1.6.1"
+ARG POETRY_VERSION="1.8.2"
 RUN if [ "${POETRY_VERSION}" != "none" ]; then bash -c "umask 0002 && pip3 install poetry==${POETRY_VERSION}"; fi
 
 # Nox
diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py
index 7dc17edc..6f121b49 100644
--- a/src/mdio/converters/segy.py
+++ b/src/mdio/converters/segy.py
@@ -90,6 +90,8 @@ def grid_density_qc(grid: Grid, num_traces: int) -> None:
 
     logger.debug(f"Dimensions: {dims}")
     logger.debug(f"num_traces = {num_traces}")
+    logger.debug(f"grid_traces = {grid_traces}")
+    logger.debug(f"sparsity = {grid_traces / num_traces}")
 
     grid_sparsity_ratio_limit = os.getenv("MDIO__GRID__SPARSITY_RATIO_LIMIT", 10)
     try:
@@ -270,7 +272,7 @@ def segy_to_mdio(
         ...     mdio_path_or_buffer="s3://bucket/shot_file.mdio",
         ...     index_bytes=(17, 137, 13),
         ...     index_lengths=(4, 2, 4),
-        ...     index_names=("shot", "cable", "channel"),
+        ...     index_names=("shot_point", "cable", "channel"),
         ...     chunksize=(8, 2, 128, 1024),
         ...     grid_overrides={"ChannelWrap": True, "ChannelsPerCable": 800},
         ... )
@@ -283,6 +285,31 @@ def segy_to_mdio(
         >>>    grid_overrides={"AutoChannelWrap": True,
                                "AutoChannelTraceQC":  1000000}
 
+        For ingestion of pre-stack streamer data where the user needs to
+        access/index *common-channel gathers* (single gun) then the following
+        strategy can be used to densely ingest while indexing on gun number:
+
+        >>> segy_to_mdio(
+        ...     segy_path="prefix/shot_file.segy",
+        ...     mdio_path_or_buffer="s3://bucket/shot_file.mdio",
+        ...     index_bytes=(133, 171, 17, 137, 13),
+        ...     index_lengths=(2, 2, 4, 2, 4),
+        ...     index_names=("shot_line", "gun", "shot_point", "cable", "channel"),
+        ...     chunksize=(1, 1, 8, 1, 128, 1024),
+        ...     grid_overrides={
+        ...         "AutoShotWrap": True,
+        ...         "AutoChannelWrap": True,
+        ...         "AutoChannelTraceQC":  1000000
+        ...     },
+        ... )
+
+        For AutoShotWrap and AutoChannelWrap to work, the user must provide
+        "shot_line", "gun", "shot_point", "cable", "channel". For improved
+        common-channel performance consider modifying the chunksize to be
+        (1, 1, 32, 1, 32, 2048) for good common-shot and common-channel
+        performance or (1, 1, 128, 1, 1, 2048) for common-channel
+        performance.
+
         For cases with no well-defined trace header for indexing a NonBinned
         grid override is provided.This creates the index and attributes an
         incrementing integer to the trace for the index based on first in first
diff --git a/src/mdio/segy/geometry.py b/src/mdio/segy/geometry.py
index 5eb5de55..a18a5a00 100644
--- a/src/mdio/segy/geometry.py
+++ b/src/mdio/segy/geometry.py
@@ -60,6 +60,28 @@ class StreamerShotGeometryType(Enum):
     C = auto()
 
 
+class ShotGunGeometryType(Enum):
+    r"""Shot geometry template types for multi-gun acquisition.
+
+    For shot lines with multiple guns, we can have two configurations for
+    numbering shot_point. The desired index is to have the shot point index
+    for a given gun to be dense and unique (configuration A). Typically the
+    shot_point is unique for the line and therefore is not dense for each
+    gun (configuration B).
+
+    Configuration A:
+        Gun 1 ->         1------------------20
+        Gun 2 ->         1------------------20
+
+    Configuration B:
+        Gun 1 ->         1------------------39
+        Gun 2 ->         2------------------40
+
+    """
+    A = auto()
+    B = auto()
+
+
 def analyze_streamer_headers(
     index_headers: dict[str, npt.NDArray],
 ) -> tuple[npt.NDArray, npt.NDArray, npt.NDArray, StreamerShotGeometryType]:
@@ -91,6 +113,7 @@ def analyze_streamer_headers(
 
     # Check channel numbers do not overlap for case B
     geom_type = StreamerShotGeometryType.B
+
     for idx1, cable1 in enumerate(unique_cables):
         min_val1 = cable_chan_min[idx1]
         max_val1 = cable_chan_max[idx1]
@@ -124,13 +147,65 @@ def analyze_streamer_headers(
     return unique_cables, cable_chan_min, cable_chan_max, geom_type
 
 
+def analyze_shotlines_for_guns(
+    index_headers: dict[str, npt.NDArray],
+) -> tuple[npt.NDArray, npt.NDArray, ShotGunGeometryType]:
+    """Check input headers for SEG-Y input to help determine geometry of shots and guns.
+
+    This function reads in trace_qc_count headers and finds the unique gun values.
+    The function then checks to ensure shot numbers are dense.
+
+    Args:
+        index_headers: numpy array with index headers
+
+    Returns:
+        tuple of unique_shot_lines, unique_guns_in_shot_line, geom_type
+    """
+    # Find unique cable ids
+    unique_shot_lines = np.sort(np.unique(index_headers["shot_line"]))
+    unique_guns = np.sort(np.unique(index_headers["gun"]))
+    logger.info(f"unique_shot_lines: {unique_shot_lines}")
+    logger.info(f"unique_guns: {unique_guns}")
+
+    # Find channel min and max values for each cable
+    # unique_guns_in_shot_line = np.empty(unique_shot_lines.shape)
+    unique_guns_in_shot_line = dict()
+
+    geom_type = ShotGunGeometryType.B
+    # Check shot numbers are still unique if div/num_guns
+    for shot_line in unique_shot_lines:
+        shot_line_mask = index_headers["shot_line"] == shot_line
+        shot_current_sl = index_headers["shot_point"][shot_line_mask]
+        gun_current_sl = index_headers["gun"][shot_line_mask]
+
+        unique_guns_sl = np.sort(np.unique(gun_current_sl))
+        num_guns_sl = unique_guns_sl.shape[0]
+        # unique_guns_in_shot_line[idx] = list(unique_guns_sl)
+        unique_guns_in_shot_line[str(shot_line)] = list(unique_guns_sl)
+
+        for gun in unique_guns_sl:
+            gun_mask = gun_current_sl == gun
+            shots_current_sl_gun = shot_current_sl[gun_mask]
+            num_shots_sl = np.unique(shots_current_sl_gun).shape[0]
+            mod_shots = np.floor(shots_current_sl_gun / num_guns_sl)
+            if len(np.unique(mod_shots)) != num_shots_sl:
+                msg = (
+                    f"Shot line {shot_line} has {num_shots_sl} when using div by "
+                    f"{num_guns_sl} (num_guns) has  {np.unique(mod_shots)} unique mod shots."
+                )
+                logger.info(msg)
+                geom_type = ShotGunGeometryType.A
+                return unique_shot_lines, unique_guns_in_shot_line, geom_type
+    return unique_shot_lines, unique_guns_in_shot_line, geom_type
+
+
 def create_counter(
     depth: int,
     total_depth: int,
     unique_headers: dict[str, npt.NDArray],
     header_names: list[str],
 ):
-    """Helper funtion to create dictionary tree for counting trace key for auto index."""
+    """Helper function to create dictionary tree for counting trace key for auto index."""
     if depth == total_depth:
         return 0
 
@@ -490,6 +565,54 @@ def transform(
         return index_headers
 
 
+class AutoShotWrap(GridOverrideCommand):
+    """Automatically determine ShotGun acquisition type."""
+
+    required_keys = {"shot_line", "gun", "shot_point", "cable", "channel"}
+    required_parameters = None
+
+    def validate(
+        self,
+        index_headers: dict[str, npt.NDArray],
+        grid_overrides: dict[str, bool | int],
+    ) -> None:
+        """Validate if this transform should run on the type of data."""
+        self.check_required_keys(index_headers)
+        self.check_required_params(grid_overrides)
+
+    def transform(
+        self,
+        index_headers: dict[str, npt.NDArray],
+        grid_overrides: dict[str, bool | int],
+    ) -> dict[str, npt.NDArray]:
+        """Perform the grid transform."""
+        self.validate(index_headers, grid_overrides)
+
+        result = analyze_shotlines_for_guns(index_headers)
+        unique_shot_lines, unique_guns_in_shot_line, geom_type = result
+        logger.info(f"Ingesting dataset as shot type: {geom_type.name}")
+
+        # TODO: Add strict=True and remove noqa when min Python is 3.10
+        max_num_guns = 1
+        for shot_line in unique_shot_lines:
+            logger.info(
+                f"shot_line: {shot_line} has guns: {unique_guns_in_shot_line[str(shot_line)]}"
+            )
+            num_guns = len(unique_guns_in_shot_line[str(shot_line)])
+            if num_guns > max_num_guns:
+                max_num_guns = num_guns
+
+        # This might be slow and potentially could be improved with a rewrite
+        # to prevent so many lookups
+        if geom_type == ShotGunGeometryType.B:
+            for shot_line in unique_shot_lines:
+                shot_line_idxs = np.where(index_headers["shot_line"][:] == shot_line)
+                index_headers["shot_point"][shot_line_idxs] = np.floor(
+                    index_headers["shot_point"][shot_line_idxs] / max_num_guns
+                )
+        return index_headers
+
+
 class GridOverrider:
     """Executor for grid overrides.
 
@@ -503,6 +626,7 @@ def __init__(self):
         """Define allowed overrides and parameters here."""
         self.commands = {
             "AutoChannelWrap": AutoChannelWrap(),
+            "AutoShotWrap": AutoShotWrap(),
             "CalculateCable": CalculateCable(),
             "ChannelWrap": ChannelWrap(),
             "NonBinned": NonBinned(),
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index 9dce0130..1d943cfa 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -18,6 +18,7 @@ def create_segy_mock_4d(
     shots: list,
     cables: list,
     receivers_per_cable: list,
+    guns: list | None = None,
     chan_header_type: StreamerShotGeometryType = StreamerShotGeometryType.A,
     index_receivers: bool = True,
 ) -> str:
@@ -55,20 +56,30 @@ def create_segy_mock_4d(
         index_receivers = False
 
     shot_headers = np.hstack([np.repeat(shot, total_chan) for shot in shots])
+
+    gun_per_shot = []
+    for shot in shots:
+        gun_per_shot.append(guns[(shot % len(guns))])
+    gun_headers = np.hstack([np.repeat(gun, total_chan) for gun in gun_per_shot])
+
     cable_headers = np.tile(cable_headers, shot_count)
     channel_headers = np.tile(channel_headers, shot_count)
 
     with segyio.create(segy_file, spec) as f:
         for trc_idx in range(trace_count):
             shot = shot_headers[trc_idx]
+            gun = gun_headers[trc_idx]
             cable = cable_headers[trc_idx]
             channel = channel_headers[trc_idx]
+            source_line = 1
 
             # offset is byte location 37 - offset 4 bytes
             # fldr is byte location 9 - shot 4 byte
             # ep is byte location 17 - shot 4 byte
             # stae is byte location 137 - cable 2 byte
             # tracf is byte location 13 - channel 4 byte
+            # grnors is byte location 171 - gun 2 bytes
+            # styp is byte location 133 - source_line 2 bytes
 
             if index_receivers:
                 f.header[trc_idx].update(
@@ -77,6 +88,8 @@ def create_segy_mock_4d(
                     ep=shot,
                     stae=cable,
                     tracf=channel,
+                    grnors=gun,
+                    styp=source_line,
                 )
             else:
                 f.header[trc_idx].update(
@@ -98,7 +111,8 @@ def create_segy_mock_4d(
 def segy_mock_4d_shots(fake_segy_tmp: str) -> dict[StreamerShotGeometryType, str]:
     """Generate mock 4D shot SEG-Y files."""
     num_samples = 25
-    shots = [2, 3, 5]
+    shots = [2, 3, 5, 6, 7, 8, 9]
+    guns = [1, 2]
     cables = [0, 101, 201, 301]
     receivers_per_cable = [1, 5, 7, 5]
 
@@ -112,6 +126,7 @@ def segy_mock_4d_shots(fake_segy_tmp: str) -> dict[StreamerShotGeometryType, str
             cables=cables,
             receivers_per_cable=receivers_per_cable,
             chan_header_type=chan_header_type,
+            guns=guns,
         )
 
     return segy_paths
diff --git a/tests/integration/test_segy_import_export.py b/tests/integration/test_segy_import_export.py
index 40a3bc65..0eb0cf23 100644
--- a/tests/integration/test_segy_import_export.py
+++ b/tests/integration/test_segy_import_export.py
@@ -57,7 +57,7 @@ def test_import_4d_segy(
 
         # Expected values
         num_samples = 25
-        shots = [2, 3, 5]
+        shots = [2, 3, 5, 6, 7, 8, 9]
         cables = [0, 101, 201, 301]
         receivers_per_cable = [1, 5, 7, 5]
 
@@ -114,7 +114,7 @@ def test_import_4d_segy(
 
         # Expected values
         num_samples = 25
-        shots = [2, 3, 5]
+        shots = [2, 3, 5, 6, 7, 8, 9]
         cables = [0, 101, 201, 301]
         receivers_per_cable = [1, 5, 7, 5]
 
@@ -177,7 +177,7 @@ def test_import_4d_segy(
         segy_path = segy_mock_4d_shots[chan_header_type]
         os.environ["MDIO__GRID__SPARSITY_RATIO_LIMIT"] = "1.1"
 
-        with pytest.raises(GridTraceSparsityError):
+        with pytest.raises(GridTraceSparsityError) as execinfo:
             segy_to_mdio(
                 segy_path=segy_path,
                 mdio_path_or_buffer=zarr_tmp.__str__(),
@@ -190,6 +190,90 @@ def test_import_4d_segy(
                 grid_overrides=grid_overrides,
             )
 
+        os.environ["MDIO__GRID__SPARSITY_RATIO_LIMIT"] = "10"
+        assert (
+            "This grid is very sparse and most likely user error with indexing."
+            in str(execinfo.value)
+        )
+
+
+@pytest.mark.parametrize("header_locations", [(133, 171, 17, 137, 13)])
+@pytest.mark.parametrize(
+    "header_names", [("shot_line", "gun", "shot_point", "cable", "channel")]
+)
+@pytest.mark.parametrize(
+    "header_types", [("int16", "int16", "int32", "int16", "int32")]
+)
+@pytest.mark.parametrize("endian", ["big"])
+@pytest.mark.parametrize(
+    "grid_overrides", [{"AutoChannelWrap": True, "AutoShotWrap": True}, None]
+)
+@pytest.mark.parametrize(
+    "chan_header_type", [StreamerShotGeometryType.A, StreamerShotGeometryType.B]
+)
+class TestImport6D:
+    """Test for 6D segy import with grid overrides."""
+
+    def test_import_6d_segy(
+        self,
+        segy_mock_4d_shots,
+        zarr_tmp,
+        header_locations,
+        header_names,
+        header_types,
+        endian,
+        grid_overrides,
+        chan_header_type,
+    ):
+        """Test importing a SEG-Y file to MDIO."""
+        segy_path = segy_mock_4d_shots[chan_header_type]
+
+        segy_to_mdio(
+            segy_path=segy_path,
+            mdio_path_or_buffer=zarr_tmp.__str__(),
+            index_bytes=header_locations,
+            index_names=header_names,
+            index_types=header_types,
+            chunksize=(1, 1, 8, 1, 12, 36),
+            overwrite=True,
+            endian=endian,
+            grid_overrides=grid_overrides,
+        )
+
+        # Expected values
+        num_samples = 25
+        shots = [2, 3, 5, 6, 7, 8, 9]  # original shot list
+        if grid_overrides is not None and "AutoShotWrap" in grid_overrides:
+            shots_new = [
+                int(shot / 2) for shot in shots
+            ]  # Updated shot index when ingesting with 2 guns
+            shots_set = set(shots_new)  # remove duplicates
+            shots = list(shots_set)  # Unique shot points for 6D indexed with gun
+        cables = [0, 101, 201, 301]
+        guns = [1, 2]
+        receivers_per_cable = [1, 5, 7, 5]
+
+        # QC mdio output
+        mdio = MDIOReader(zarr_tmp.__str__(), access_pattern="012345")
+        assert mdio.binary_header["Samples"] == num_samples
+        grid = mdio.grid
+
+        assert grid.select_dim(header_names[1]) == Dimension(guns, header_names[1])
+        assert grid.select_dim(header_names[2]) == Dimension(shots, header_names[2])
+        assert grid.select_dim(header_names[3]) == Dimension(cables, header_names[3])
+
+        if chan_header_type == StreamerShotGeometryType.B and grid_overrides is None:
+            assert grid.select_dim(header_names[4]) == Dimension(
+                range(1, np.sum(receivers_per_cable) + 1), header_names[4]
+            )
+        else:
+            assert grid.select_dim(header_names[4]) == Dimension(
+                range(1, np.amax(receivers_per_cable) + 1), header_names[4]
+            )
+
+        samples_exp = Dimension(range(0, num_samples, 1), "sample")
+        assert grid.select_dim("sample") == samples_exp
+
 
 @pytest.mark.parametrize("header_locations", [(17, 13)])
 @pytest.mark.parametrize("header_names", [("inline", "crossline")])