From 683727c46c78df419452765c147dd6f224a3726e Mon Sep 17 00:00:00 2001
From: Craig de Stigter
Date: Wed, 8 Jan 2025 13:29:53 +1300
Subject: [PATCH] Reduce TTFB for diff generation

Lazily populate diff items. This mostly eliminates TTFB for diffs and also avoids linear-scaled memory consumption during large diffs

Output seems to be ~30% faster overall
---
 kart/base_diff_writer.py |  2 +-
 kart/diff_structs.py     | 91 ++++++++++++++++++++++++++++++++++++++--
 2 files changed, 89 insertions(+), 4 deletions(-)

diff --git a/kart/base_diff_writer.py b/kart/base_diff_writer.py
index 32be5c6b..c1970e27 100644
--- a/kart/base_diff_writer.py
+++ b/kart/base_diff_writer.py
@@ -456,7 +456,7 @@ def get_file_diff(self):
     def iter_deltadiff_items(self, deltas):
         if self.sort_keys:
             return deltas.sorted_items()
-        return deltas.items()
+        return deltas.iter_items()
 
     def filtered_dataset_deltas(self, ds_path, ds_diff):
         """
diff --git a/kart/diff_structs.py b/kart/diff_structs.py
index 5ff7b89a..9cfc7c4a 100644
--- a/kart/diff_structs.py
+++ b/kart/diff_structs.py
@@ -1,6 +1,6 @@
 from collections import UserDict
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, Iterator
 
 from .exceptions import InvalidOperation
 
@@ -335,8 +335,7 @@ def prune(self, recurse=True):
         Deletes any empty RichDicts that are children of self.
         If recurse is True, also deletes non-empty RichDicts, as long as they only contain empty RichDicts in the end.
         """
-        items = list(self.items())
-        for key, value in items:
+        for key, value in list(self.items()):
             if key == "data_changes" and value == False and len(self) == 1:
                 del self[key]
             if not isinstance(value, RichDict):
@@ -430,6 +429,20 @@ def __json__(self):
         return {k: v for k, v in self.items()}
 
 
+class InvalidatedDeltaDiff:
+    """
+    Placeholder that replaces DeltaDiff.data once iter_items() has consumed the diff.
+    Looking up any dict method on it raises an AttributeError carrying the given message.
+    """
+
+    def __init__(self, message="DeltaDiff has been invalidated"):
+        self.message = message
+
+    def __getattr__(self, name):
+        # Only called for attributes that don't exist, ie any dict method.
+        raise AttributeError(f"{self.__dict__.get('message')} (accessing {name!r})")
+
+
 class DeltaDiff(Diff):
     """
     A DeltaDiff is the inner-most type of Diff, the one that actually contains Deltas.
@@ -439,16 +452,88 @@ class DeltaDiff(Diff):
     child_type = Delta
 
     def __init__(self, initial_contents=()):
+        self._lazy_initial_contents = None
         if isinstance(initial_contents, (dict, UserDict)):
             super().__init__(initial_contents)
         else:
+            if isinstance(initial_contents, Iterator):
+                self._lazy_initial_contents = (
+                    (delta.key, delta) for delta in initial_contents
+                )
+                initial_contents = ()
             super().__init__((delta.key, delta) for delta in initial_contents)
 
+    def __getitem__(self, key):
+        if key in self.data:
+            return self.data[key]
+        self._evaluate_lazy_initial_contents()
+        return self.data[key]
+
     def __setitem__(self, key, delta):
         if key != delta.key:
             raise ValueError("Delta must be added at the appropriate key")
         super().__setitem__(key, delta)
 
+    def _evaluate_lazy_initial_contents(self):
+        if self._lazy_initial_contents is None:
+            return
+        for k, v in self._lazy_initial_contents:
+            if k not in self:
+                self[k] = v
+        self._lazy_initial_contents = None
+
+    def __bool__(self):
+        result = bool(self.data)
+        if (not result) and self._lazy_initial_contents:
+            # If the DeltaDiff is empty, but has lazy initial contents, evaluate the first item to check booleanness.
+            try:
+                k, v = next(self._lazy_initial_contents)
+            except StopIteration:
+                return False
+            else:
+                # remember this result
+                self.data[k] = v
+                return True
+        return result
+
+    def __len__(self):
+        self._evaluate_lazy_initial_contents()
+        return super().__len__()
+
+    def prune(self, recurse=True):
+        # DeltaDiff shouldn't need pruning. I think?
+        # It's complex to prune the lazy initial contents, and it's not clear that it's necessary.
+        pass
+
+    def items(self):
+        self._evaluate_lazy_initial_contents()
+        return super().items()
+
+    def iter_items(self):
+        """
+        Iterates over the items in the DeltaDiff, including any lazy initial contents.
+
+        This method consumes the iterator without storing its contents. It's not safe to call this method and then call items() on the same object.
+        """
+        yield from self.data.items()
+        if self._lazy_initial_contents:
+            for k, v in self._lazy_initial_contents:
+                if k not in self:
+                    yield (k, v)
+
+        # Invalidate this DeltaDiff; it's not safe to consume it again after this.
+        self.data = InvalidatedDeltaDiff(
+            "DeltaDiff can't be used after iter_items() has been called"
+        )
+
+    def keys(self):
+        self._evaluate_lazy_initial_contents()
+        return super().keys()
+
+    def values(self):
+        self._evaluate_lazy_initial_contents()
+        return super().values()
+
     def add_delta(self, delta):
         """Add the given delta at the appropriate key."""
         super().__setitem__(delta.key, delta)