From 40ed5e0e08e3c47c4fffc1cbe9e6474ec83b6af5 Mon Sep 17 00:00:00 2001 From: Mark Roberts <26778185+markspec@users.noreply.github.com> Date: Sun, 10 Mar 2024 13:49:22 -0500 Subject: [PATCH] AutoShotWrap grid override (#338) * Added AutoShopWrap grid override. This allows ingestion with gun indexed by unwrapping shot_point for shot_lines. * Add documentation to cover segy_to_mdio AutoShotWrap grid override. * Resolve merge conflicts. * Update Dockerfile to align with nox pipeline checks. * Merge poetry.lock. * Fix ingestion tests. * Fix issues with tests. * Switch to poetry.lock from main to resolve conflict. * Add some more explainination on ShotGumGeometryType.. * Linting updates. * Remove unused code. * Update .devcontainer/Dockerfile * Update tests/integration/conftest.py * update long text syntax * Fix linting. --------- Co-authored-by: Altay Sansal --- .devcontainer/Dockerfile | 10 +- src/mdio/converters/segy.py | 29 ++++- src/mdio/segy/geometry.py | 126 ++++++++++++++++++- tests/integration/conftest.py | 17 ++- tests/integration/test_segy_import_export.py | 90 ++++++++++++- 5 files changed, 261 insertions(+), 11 deletions(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 3dd31294..5deb8385 100755 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -1,16 +1,16 @@ -ARG PYTHON_VERSION=3.11 +ARG PYTHON_VERSION=3.12 ARG LINUX_DISTRO=bookworm FROM mcr.microsoft.com/devcontainers/python:1-${PYTHON_VERSION}-${LINUX_DISTRO} # Install git for nox pre-commit RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - git \ - && rm -rf /var/lib/apt/lists/* + && apt-get install -y --no-install-recommends \ + git \ + && rm -rf /var/lib/apt/lists/* # Poetry -ARG POETRY_VERSION="1.6.1" +ARG POETRY_VERSION="1.8.2" RUN if [ "${POETRY_VERSION}" != "none" ]; then bash -c "umask 0002 && pip3 install poetry==${POETRY_VERSION}"; fi # Nox diff --git a/src/mdio/converters/segy.py b/src/mdio/converters/segy.py index 7dc17edc..6f121b49 100644 --- a/src/mdio/converters/segy.py +++ b/src/mdio/converters/segy.py @@ -90,6 +90,8 @@ def grid_density_qc(grid: Grid, num_traces: int) -> None: logger.debug(f"Dimensions: {dims}") logger.debug(f"num_traces = {num_traces}") + logger.debug(f"grid_traces = {grid_traces}") + logger.debug(f"sparsity = {grid_traces / num_traces}") grid_sparsity_ratio_limit = os.getenv("MDIO__GRID__SPARSITY_RATIO_LIMIT", 10) try: @@ -270,7 +272,7 @@ def segy_to_mdio( ... mdio_path_or_buffer="s3://bucket/shot_file.mdio", ... index_bytes=(17, 137, 13), ... index_lengths=(4, 2, 4), - ... index_names=("shot", "cable", "channel"), + ... index_names=("shot_point", "cable", "channel"), ... chunksize=(8, 2, 128, 1024), ... grid_overrides={"ChannelWrap": True, "ChannelsPerCable": 800}, ... ) @@ -283,6 +285,31 @@ def segy_to_mdio( >>> grid_overrides={"AutoChannelWrap": True, "AutoChannelTraceQC": 1000000} + For ingestion of pre-stack streamer data where the user needs to + access/index *common-channel gathers* (single gun) then the following + strategy can be used to densely ingest while indexing on gun number: + + >>> segy_to_mdio( + ... segy_path="prefix/shot_file.segy", + ... mdio_path_or_buffer="s3://bucket/shot_file.mdio", + ... index_bytes=(133, 171, 17, 137, 13), + ... index_lengths=(2, 2, 4, 2, 4), + ... index_names=("shot_line", "gun", "shot_point", "cable", "channel"), + ... chunksize=(1, 1, 8, 1, 128, 1024), + ... grid_overrides={ + ... "AutoShotWrap": True, + ... "AutoChannelWrap": True, + ... "AutoChannelTraceQC": 1000000 + ... }, + ... ) + + For AutoShotWrap and AutoChannelWrap to work, the user must provide + "shot_line", "gun", "shot_point", "cable", "channel". For improved + common-channel performance consider modifying the chunksize to be + (1, 1, 32, 1, 32, 2048) for good common-shot and common-channel + performance or (1, 1, 128, 1, 1, 2048) for common-channel + performance. + For cases with no well-defined trace header for indexing a NonBinned grid override is provided.This creates the index and attributes an incrementing integer to the trace for the index based on first in first diff --git a/src/mdio/segy/geometry.py b/src/mdio/segy/geometry.py index 5eb5de55..a18a5a00 100644 --- a/src/mdio/segy/geometry.py +++ b/src/mdio/segy/geometry.py @@ -60,6 +60,28 @@ class StreamerShotGeometryType(Enum): C = auto() +class ShotGunGeometryType(Enum): + r"""Shot geometry template types for multi-gun acquisition. + + For shot lines with multiple guns, we can have two configurations for + numbering shot_point. The desired index is to have the shot point index + for a given gun to be dense and unique (configuration A). Typically the + shot_point is unique for the line and therefore is not dense for each + gun (configuration B). + + Configuration A: + Gun 1 -> 1------------------20 + Gun 2 -> 1------------------20 + + Configuration B: + Gun 1 -> 1------------------39 + Gun 2 -> 2------------------40 + + """ + A = auto() + B = auto() + + def analyze_streamer_headers( index_headers: dict[str, npt.NDArray], ) -> tuple[npt.NDArray, npt.NDArray, npt.NDArray, StreamerShotGeometryType]: @@ -91,6 +113,7 @@ def analyze_streamer_headers( # Check channel numbers do not overlap for case B geom_type = StreamerShotGeometryType.B + for idx1, cable1 in enumerate(unique_cables): min_val1 = cable_chan_min[idx1] max_val1 = cable_chan_max[idx1] @@ -124,13 +147,65 @@ def analyze_streamer_headers( return unique_cables, cable_chan_min, cable_chan_max, geom_type +def analyze_shotlines_for_guns( + index_headers: dict[str, npt.NDArray], +) -> tuple[npt.NDArray, npt.NDArray, ShotGunGeometryType]: + """Check input headers for SEG-Y input to help determine geometry of shots and guns. + + This function reads in trace_qc_count headers and finds the unique gun values. + The function then checks to ensure shot numbers are dense. + + Args: + index_headers: numpy array with index headers + + Returns: + tuple of unique_shot_lines, unique_guns_in_shot_line, geom_type + """ + # Find unique cable ids + unique_shot_lines = np.sort(np.unique(index_headers["shot_line"])) + unique_guns = np.sort(np.unique(index_headers["gun"])) + logger.info(f"unique_shot_lines: {unique_shot_lines}") + logger.info(f"unique_guns: {unique_guns}") + + # Find channel min and max values for each cable + # unique_guns_in_shot_line = np.empty(unique_shot_lines.shape) + unique_guns_in_shot_line = dict() + + geom_type = ShotGunGeometryType.B + # Check shot numbers are still unique if div/num_guns + for shot_line in unique_shot_lines: + shot_line_mask = index_headers["shot_line"] == shot_line + shot_current_sl = index_headers["shot_point"][shot_line_mask] + gun_current_sl = index_headers["gun"][shot_line_mask] + + unique_guns_sl = np.sort(np.unique(gun_current_sl)) + num_guns_sl = unique_guns_sl.shape[0] + # unique_guns_in_shot_line[idx] = list(unique_guns_sl) + unique_guns_in_shot_line[str(shot_line)] = list(unique_guns_sl) + + for gun in unique_guns_sl: + gun_mask = gun_current_sl == gun + shots_current_sl_gun = shot_current_sl[gun_mask] + num_shots_sl = np.unique(shots_current_sl_gun).shape[0] + mod_shots = np.floor(shots_current_sl_gun / num_guns_sl) + if len(np.unique(mod_shots)) != num_shots_sl: + msg = ( + f"Shot line {shot_line} has {num_shots_sl} when using div by " + f"{num_guns_sl} (num_guns) has {np.unique(mod_shots)} unique mod shots." + ) + logger.info(msg) + geom_type = ShotGunGeometryType.A + return unique_shot_lines, unique_guns_in_shot_line, geom_type + return unique_shot_lines, unique_guns_in_shot_line, geom_type + + def create_counter( depth: int, total_depth: int, unique_headers: dict[str, npt.NDArray], header_names: list[str], ): - """Helper funtion to create dictionary tree for counting trace key for auto index.""" + """Helper function to create dictionary tree for counting trace key for auto index.""" if depth == total_depth: return 0 @@ -490,6 +565,54 @@ def transform( return index_headers +class AutoShotWrap(GridOverrideCommand): + """Automatically determine ShotGun acquisition type.""" + + required_keys = {"shot_line", "gun", "shot_point", "cable", "channel"} + required_parameters = None + + def validate( + self, + index_headers: dict[str, npt.NDArray], + grid_overrides: dict[str, bool | int], + ) -> None: + """Validate if this transform should run on the type of data.""" + self.check_required_keys(index_headers) + self.check_required_params(grid_overrides) + + def transform( + self, + index_headers: dict[str, npt.NDArray], + grid_overrides: dict[str, bool | int], + ) -> dict[str, npt.NDArray]: + """Perform the grid transform.""" + self.validate(index_headers, grid_overrides) + + result = analyze_shotlines_for_guns(index_headers) + unique_shot_lines, unique_guns_in_shot_line, geom_type = result + logger.info(f"Ingesting dataset as shot type: {geom_type.name}") + + # TODO: Add strict=True and remove noqa when min Python is 3.10 + max_num_guns = 1 + for shot_line in unique_shot_lines: + logger.info( + f"shot_line: {shot_line} has guns: {unique_guns_in_shot_line[str(shot_line)]}" + ) + num_guns = len(unique_guns_in_shot_line[str(shot_line)]) + if num_guns > max_num_guns: + max_num_guns = num_guns + + # This might be slow and potentially could be improved with a rewrite + # to prevent so many lookups + if geom_type == ShotGunGeometryType.B: + for shot_line in unique_shot_lines: + shot_line_idxs = np.where(index_headers["shot_line"][:] == shot_line) + index_headers["shot_point"][shot_line_idxs] = np.floor( + index_headers["shot_point"][shot_line_idxs] / max_num_guns + ) + return index_headers + + class GridOverrider: """Executor for grid overrides. @@ -503,6 +626,7 @@ def __init__(self): """Define allowed overrides and parameters here.""" self.commands = { "AutoChannelWrap": AutoChannelWrap(), + "AutoShotWrap": AutoShotWrap(), "CalculateCable": CalculateCable(), "ChannelWrap": ChannelWrap(), "NonBinned": NonBinned(), diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 9dce0130..1d943cfa 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -18,6 +18,7 @@ def create_segy_mock_4d( shots: list, cables: list, receivers_per_cable: list, + guns: list | None = None, chan_header_type: StreamerShotGeometryType = StreamerShotGeometryType.A, index_receivers: bool = True, ) -> str: @@ -55,20 +56,30 @@ def create_segy_mock_4d( index_receivers = False shot_headers = np.hstack([np.repeat(shot, total_chan) for shot in shots]) + + gun_per_shot = [] + for shot in shots: + gun_per_shot.append(guns[(shot % len(guns))]) + gun_headers = np.hstack([np.repeat(gun, total_chan) for gun in gun_per_shot]) + cable_headers = np.tile(cable_headers, shot_count) channel_headers = np.tile(channel_headers, shot_count) with segyio.create(segy_file, spec) as f: for trc_idx in range(trace_count): shot = shot_headers[trc_idx] + gun = gun_headers[trc_idx] cable = cable_headers[trc_idx] channel = channel_headers[trc_idx] + source_line = 1 # offset is byte location 37 - offset 4 bytes # fldr is byte location 9 - shot 4 byte # ep is byte location 17 - shot 4 byte # stae is byte location 137 - cable 2 byte # tracf is byte location 13 - channel 4 byte + # grnors is byte location 171 - gun 2 bytes + # styp is byte location 133 - source_line 2 bytes if index_receivers: f.header[trc_idx].update( @@ -77,6 +88,8 @@ def create_segy_mock_4d( ep=shot, stae=cable, tracf=channel, + grnors=gun, + styp=source_line, ) else: f.header[trc_idx].update( @@ -98,7 +111,8 @@ def create_segy_mock_4d( def segy_mock_4d_shots(fake_segy_tmp: str) -> dict[StreamerShotGeometryType, str]: """Generate mock 4D shot SEG-Y files.""" num_samples = 25 - shots = [2, 3, 5] + shots = [2, 3, 5, 6, 7, 8, 9] + guns = [1, 2] cables = [0, 101, 201, 301] receivers_per_cable = [1, 5, 7, 5] @@ -112,6 +126,7 @@ def segy_mock_4d_shots(fake_segy_tmp: str) -> dict[StreamerShotGeometryType, str cables=cables, receivers_per_cable=receivers_per_cable, chan_header_type=chan_header_type, + guns=guns, ) return segy_paths diff --git a/tests/integration/test_segy_import_export.py b/tests/integration/test_segy_import_export.py index 40a3bc65..0eb0cf23 100644 --- a/tests/integration/test_segy_import_export.py +++ b/tests/integration/test_segy_import_export.py @@ -57,7 +57,7 @@ def test_import_4d_segy( # Expected values num_samples = 25 - shots = [2, 3, 5] + shots = [2, 3, 5, 6, 7, 8, 9] cables = [0, 101, 201, 301] receivers_per_cable = [1, 5, 7, 5] @@ -114,7 +114,7 @@ def test_import_4d_segy( # Expected values num_samples = 25 - shots = [2, 3, 5] + shots = [2, 3, 5, 6, 7, 8, 9] cables = [0, 101, 201, 301] receivers_per_cable = [1, 5, 7, 5] @@ -177,7 +177,7 @@ def test_import_4d_segy( segy_path = segy_mock_4d_shots[chan_header_type] os.environ["MDIO__GRID__SPARSITY_RATIO_LIMIT"] = "1.1" - with pytest.raises(GridTraceSparsityError): + with pytest.raises(GridTraceSparsityError) as execinfo: segy_to_mdio( segy_path=segy_path, mdio_path_or_buffer=zarr_tmp.__str__(), @@ -190,6 +190,90 @@ def test_import_4d_segy( grid_overrides=grid_overrides, ) + os.environ["MDIO__GRID__SPARSITY_RATIO_LIMIT"] = "10" + assert ( + "This grid is very sparse and most likely user error with indexing." + in str(execinfo.value) + ) + + +@pytest.mark.parametrize("header_locations", [(133, 171, 17, 137, 13)]) +@pytest.mark.parametrize( + "header_names", [("shot_line", "gun", "shot_point", "cable", "channel")] +) +@pytest.mark.parametrize( + "header_types", [("int16", "int16", "int32", "int16", "int32")] +) +@pytest.mark.parametrize("endian", ["big"]) +@pytest.mark.parametrize( + "grid_overrides", [{"AutoChannelWrap": True, "AutoShotWrap": True}, None] +) +@pytest.mark.parametrize( + "chan_header_type", [StreamerShotGeometryType.A, StreamerShotGeometryType.B] +) +class TestImport6D: + """Test for 6D segy import with grid overrides.""" + + def test_import_6d_segy( + self, + segy_mock_4d_shots, + zarr_tmp, + header_locations, + header_names, + header_types, + endian, + grid_overrides, + chan_header_type, + ): + """Test importing a SEG-Y file to MDIO.""" + segy_path = segy_mock_4d_shots[chan_header_type] + + segy_to_mdio( + segy_path=segy_path, + mdio_path_or_buffer=zarr_tmp.__str__(), + index_bytes=header_locations, + index_names=header_names, + index_types=header_types, + chunksize=(1, 1, 8, 1, 12, 36), + overwrite=True, + endian=endian, + grid_overrides=grid_overrides, + ) + + # Expected values + num_samples = 25 + shots = [2, 3, 5, 6, 7, 8, 9] # original shot list + if grid_overrides is not None and "AutoShotWrap" in grid_overrides: + shots_new = [ + int(shot / 2) for shot in shots + ] # Updated shot index when ingesting with 2 guns + shots_set = set(shots_new) # remove duplicates + shots = list(shots_set) # Unique shot points for 6D indexed with gun + cables = [0, 101, 201, 301] + guns = [1, 2] + receivers_per_cable = [1, 5, 7, 5] + + # QC mdio output + mdio = MDIOReader(zarr_tmp.__str__(), access_pattern="012345") + assert mdio.binary_header["Samples"] == num_samples + grid = mdio.grid + + assert grid.select_dim(header_names[1]) == Dimension(guns, header_names[1]) + assert grid.select_dim(header_names[2]) == Dimension(shots, header_names[2]) + assert grid.select_dim(header_names[3]) == Dimension(cables, header_names[3]) + + if chan_header_type == StreamerShotGeometryType.B and grid_overrides is None: + assert grid.select_dim(header_names[4]) == Dimension( + range(1, np.sum(receivers_per_cable) + 1), header_names[4] + ) + else: + assert grid.select_dim(header_names[4]) == Dimension( + range(1, np.amax(receivers_per_cable) + 1), header_names[4] + ) + + samples_exp = Dimension(range(0, num_samples, 1), "sample") + assert grid.select_dim("sample") == samples_exp + @pytest.mark.parametrize("header_locations", [(17, 13)]) @pytest.mark.parametrize("header_names", [("inline", "crossline")])