MTgeophysics · kujaku11 · Oct 1, 2024 · Sep 28, 2024 · Sep 28, 2024 · Sep 30, 2024
diff --git a/.github/workflows/testing_in_conda.yml b/.github/workflows/testing_in_conda.yml
@@ -37,7 +37,7 @@ jobs:
         conda install pytest-subtests
         conda install pytest-cov
         pip install git+https://github.com/kujaku11/mt_metadata.git@main
-        pip install git+https://github.com/kujaku11/mth5.git@tf_optimize
+        pip install git+https://github.com/kujaku11/mth5.git@master
         pip install git+https://github.com/simpeg/aurora@main
         git clone https://github.com/MTgeophysics/mtpy_data.git
         cd mtpy_data

diff --git a/mtpy/core/mt_collection.py b/mtpy/core/mt_collection.py
@@ -483,19 +483,18 @@ def check_for_duplicates(self, locate="location", sig_figs=6):
             return self.dataframe[self.dataframe.duplicated(query)]
         return None
 
-    def apply_bbox(self, lon_min, lon_max, lat_min, lat_max):
-        """Return :class:`pandas.DataFrame` of station within bounding box.
-
-        :param lon_min: Minimum longitude.
-        :type lon_min: float
-        :param lon_max: Maximum longitude.
-        :type lon_max: float
-        :param lat_min: Minimum latitude.
-        :type lat_min: float
-        :param lat_max: Maximum longitude.
-        :type lat_max: float
-        :return: Only stations within the given bounding box.
-        :rtype: :class:`pandas.DataFrame`
+    def apply_bbox(self, lon_min: float, lon_max: float, lat_min: float, lat_max: float) -> None:
+        """
+            Sets self.working_dataframe to only stations within bounding box.
+
+            :param lon_min: Minimum longitude.
+            :type lon_min: float
+            :param lon_max: Maximum longitude.
+            :type lon_max: float
+            :param lat_min: Minimum latitude.
+            :type lat_min: float
+            :param lat_max: Maximum latitude.
+            :type lat_max: float
         """
 
         if self.has_data():

diff --git a/mtpy/processing/kernel_dataset.py b/mtpy/processing/kernel_dataset.py
@@ -3,16 +3,12 @@
 This module contains a class for representing a dataset that can be processed.
 
 Development Notes:
-The KernelDataset could potentially be moved into mth5 or mtpy and used
-as the dataset description for other processing flows.
 
 Players on the stage:  One or more mth5s.
 
-Each mth5 has a mth5_obj.channel_summary dataframe which tells what data are available.
-Use a compressed view of this df with one line per acquisition run -- a "run_summary".
-
-Run_summary provides options for the local and possibly remote reference stations.
-Candidates for local station are the unique values in the station column.
+Each mth5 has a "run_summary" dataframe available. Run_summary provides options for
+the local and possibly remote reference stations. Candidates for local station are
+the unique values in the station column.
 
 For any candidate station, there are some integer n runs available.
 This yields 2^n - 1 possible combinations that can be processed, neglecting any
@@ -29,12 +25,11 @@
 
 The intended usage process is as follows:
  0. Start with a list of mth5s
- 1. Extract channel_summaries from each mth5 and join them vertically
- 2. Compress to a run_summary
- 3. Stare at the run_summary_df & Select a station "S" to process
- 4. Select a non-empty set of runs for station "S"
- 5. Select a remote reference "RR", (this is allowed to be None)
- 6. Extract the sub-dataframe corresponding to acquisition_runs from "S" and "RR"
+ 1. Extract a run_summary
+ 2. Stare at the run_summary_df, and select a station "S" to process
+ 3. Select a non-empty set of runs for station "S"
+ 4. Select a remote reference "RR", (this is allowed to be None)
+ 5. Extract the sub-dataframe corresponding to acquisition_runs from "S" and "RR"
  7. If the remote is not None:
   - Drop the runs (rows) associated with RR that do not intersect with S
   - Restrict start/end times of RR runs that intersect with S so overlap is complete.
@@ -49,10 +44,6 @@
 
 TODO: Consider supporting a default value for 'channel_scale_factors' that is None,
 
-TODO: As of March 2023 a RunSummary is available at the station level in mth5, but
- the aurora version is still being used.  This should be merged if possible so that
- aurora uses the built-in mth5 method. -- Run Summary exists atstation level in mth5
-
 TODO: Might need to groupby survey & station, for now consider station_id  unique.
 
 """
@@ -234,23 +225,42 @@ def clone_dataframe(self) -> pd.DataFrame:
         """Return a deep copy of dataframe."""
         return copy.deepcopy(self.df)
 
-    def _add_columns(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Add columns with appropriate dtypes."""
-
+    def _add_columns(
+            self,
+            df: pd.DataFrame,
+            null_columns: Optional[Union[list, tuple]] = ("fc",)
+    ) -> pd.DataFrame:
+        """
+        Add columns with appropriate dtypes.
+
+        :param df: A kernel dataset dataframe, possibly not all columns present.
+        :type df: pd.DataFrame
+        :param null_columns: Columns that will not init to their expected dtype, but rather init to null.
+        :type null_columns: Optional[Union[list, tuple]]
+        :return: Kernel dataset dataframe, with all columns present.
+        :rtype:  pd.DataFrame
+        """
         for col, dtype in KERNEL_DATASET_DTYPE:
             if not col in df.columns:
                 if col in ["survey", "station", "run", "start", "end"]:
                     raise ValueError(
                         f"{col} must be a filled column in the dataframe"
                     )
+                set_null = False
                 try:
                     df[col] = dtype(0)
+                    assigned_dtype = dtype
                 except TypeError:
-                    df[col] = None
-                logger.info(
-                    f"KernelDataset DataFrame needs column {col}, adding "
-                    f"and setting dtype to {dtype}."
-                )
+                    df[col] = None  # TODO: update to pd.NA
+                    assigned_dtype = type(None)
+                if col in null_columns:
+                    df[col] = pd.NA
+                    assigned_dtype = type(pd.NA)
+
+                msg = f"KernelDataset DataFrame needs column {col}, adding and " \
+                      f"setting dtype to {assigned_dtype}."
+                logger.info(msg)
+
         return df
 
     @property