prevent redundant query for loc_df

I noticed that in trip_segmentation, we query the user timeseries for background/filtered_location. Then we call segment_into_trips which performs the same query on the same timeseries with the same timequery. Passing the loc_df as an argument eliminates the need for this query
e-mission · Jan 30, 2025 · d552b76 · d552b76
1 parent 47ab8be
commit d552b76
Show file tree

Hide file tree

Showing 3 changed files with 7 additions and 8 deletions.
diff --git a/emission/analysis/intake/segmentation/trip_segmentation.py b/emission/analysis/intake/segmentation/trip_segmentation.py
@@ -100,7 +100,7 @@ def segment_current_trips(user_id):
     if len(filters_in_df) == 1:
         # Common case - let's make it easy
         with ect.Timer() as t_segment_trips:
-            segmentation_points = filter_methods[filters_in_df[0]].segment_into_trips(ts, time_query)
+            segmentation_points = filter_methods[filters_in_df[0]].segment_into_trips(ts, time_query, loc_df)
         esds.store_pipeline_time(user_id, ecwp.PipelineStages.TRIP_SEGMENTATION.name + "/segment_into_trips", time.time(), t_segment_trips.elapsed)
     else:
         with ect.Timer() as t_get_combined_segmentation:

diff --git a/.../analysis/intake/segmentation/trip_segmentation_methods/dwell_segmentation_dist_filter.py b/.../analysis/intake/segmentation/trip_segmentation_methods/dwell_segmentation_dist_filter.py
@@ -42,7 +42,7 @@ def __init__(self, time_threshold, point_threshold, distance_threshold):
         self.point_threshold = point_threshold
         self.distance_threshold = distance_threshold
 
-    def segment_into_trips(self, timeseries, time_query):
+    def segment_into_trips(self, timeseries, time_query, filtered_points_df):
         """
         Examines the timeseries database for a specific range and returns the
         segmentation points. Note that the input is the entire timeseries and
@@ -51,7 +51,7 @@ def segment_into_trips(self, timeseries, time_query):
         segmentation points.
         """
         with ect.Timer() as t_get_filtered_points:
-            self.filtered_points_df = timeseries.get_data_df("background/filtered_location", time_query)
+            self.filtered_points_df = filtered_points_df
             user_id = self.filtered_points_df["user_id"].iloc[0]
         esds.store_pipeline_time(
             user_id,

diff --git a/.../analysis/intake/segmentation/trip_segmentation_methods/dwell_segmentation_time_filter.py b/.../analysis/intake/segmentation/trip_segmentation_methods/dwell_segmentation_time_filter.py
@@ -57,22 +57,21 @@ def __init__(self, time_threshold, point_threshold, distance_threshold):
         self.point_threshold = point_threshold
         self.distance_threshold = distance_threshold
 
-    def segment_into_trips(self, timeseries, time_query):
+    def segment_into_trips(self, timeseries, time_query, filtered_points_df):
         """
         Examines the timeseries database for a specific range and returns the
         segmentation points. Note that the input is the entire timeseries and
         the time range. This allows algorithms to use whatever combination of
         data that they want from the sensor streams in order to determine the
         segmentation points.
         """
-        filtered_points_pre_ts_diff_df = timeseries.get_data_df("background/filtered_location", time_query)
-        user_id = filtered_points_pre_ts_diff_df["user_id"].iloc[0]
+        user_id = filtered_points_df["user_id"].iloc[0]
         # Sometimes, we can get bogus points because data.ts and
         # metadata.write_ts are off by a lot. If we don't do this, we end up
         # appearing to travel back in time
         # https://github.com/e-mission/e-mission-server/issues/457
-        filtered_points_df = filtered_points_pre_ts_diff_df[
-            (filtered_points_pre_ts_diff_df.metadata_write_ts - filtered_points_pre_ts_diff_df.ts) < 1000
+        filtered_points_df = filtered_points_df[
+            (filtered_points_df.metadata_write_ts - filtered_points_df.ts) < 1000
         ]
         filtered_points_df.reset_index(inplace=True)
         transition_df = timeseries.get_data_df("statemachine/transition", time_query)