Skip to content

Commit

Permalink
prevent redundant query for loc_df
Browse files Browse the repository at this point in the history
I noticed that in trip_segmentation, we query the user timeseries for background/filtered_location. Then we call segment_into_trips which performs the same query on the same timeseries with the same timequery.
Passing the loc_df as an argument eliminates the need for this query
  • Loading branch information
JGreenlee committed Jan 30, 2025
1 parent 47ab8be commit d552b76
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 8 deletions.
2 changes: 1 addition & 1 deletion emission/analysis/intake/segmentation/trip_segmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def segment_current_trips(user_id):
if len(filters_in_df) == 1:
# Common case - let's make it easy
with ect.Timer() as t_segment_trips:
segmentation_points = filter_methods[filters_in_df[0]].segment_into_trips(ts, time_query)
segmentation_points = filter_methods[filters_in_df[0]].segment_into_trips(ts, time_query, loc_df)
esds.store_pipeline_time(user_id, ecwp.PipelineStages.TRIP_SEGMENTATION.name + "/segment_into_trips", time.time(), t_segment_trips.elapsed)
else:
with ect.Timer() as t_get_combined_segmentation:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def __init__(self, time_threshold, point_threshold, distance_threshold):
self.point_threshold = point_threshold
self.distance_threshold = distance_threshold

def segment_into_trips(self, timeseries, time_query):
def segment_into_trips(self, timeseries, time_query, filtered_points_df):
"""
Examines the timeseries database for a specific range and returns the
segmentation points. Note that the input is the entire timeseries and
Expand All @@ -51,7 +51,7 @@ def segment_into_trips(self, timeseries, time_query):
segmentation points.
"""
with ect.Timer() as t_get_filtered_points:
self.filtered_points_df = timeseries.get_data_df("background/filtered_location", time_query)
self.filtered_points_df = filtered_points_df
user_id = self.filtered_points_df["user_id"].iloc[0]
esds.store_pipeline_time(
user_id,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,22 +57,21 @@ def __init__(self, time_threshold, point_threshold, distance_threshold):
self.point_threshold = point_threshold
self.distance_threshold = distance_threshold

def segment_into_trips(self, timeseries, time_query):
def segment_into_trips(self, timeseries, time_query, filtered_points_df):
"""
Examines the timeseries database for a specific range and returns the
segmentation points. Note that the input is the entire timeseries and
the time range. This allows algorithms to use whatever combination of
data that they want from the sensor streams in order to determine the
segmentation points.
"""
filtered_points_pre_ts_diff_df = timeseries.get_data_df("background/filtered_location", time_query)
user_id = filtered_points_pre_ts_diff_df["user_id"].iloc[0]
user_id = filtered_points_df["user_id"].iloc[0]
# Sometimes, we can get bogus points because data.ts and
# metadata.write_ts are off by a lot. If we don't do this, we end up
# appearing to travel back in time
# https://github.com/e-mission/e-mission-server/issues/457
filtered_points_df = filtered_points_pre_ts_diff_df[
(filtered_points_pre_ts_diff_df.metadata_write_ts - filtered_points_pre_ts_diff_df.ts) < 1000
filtered_points_df = filtered_points_df[
(filtered_points_df.metadata_write_ts - filtered_points_df.ts) < 1000
]
filtered_points_df.reset_index(inplace=True)
transition_df = timeseries.get_data_df("statemachine/transition", time_query)
Expand Down

0 comments on commit d552b76

Please sign in to comment.