Skip to content

Commit

Permalink
Reduce TTFB for diff generation
Browse files Browse the repository at this point in the history
Lazily populate diff items.

This mostly eliminates TTFB for diffs and also avoids linear-scaled
memory consumption during large diffs

Output seems to be ~30% faster overall
  • Loading branch information
craigds committed Jan 8, 2025
1 parent 1b9a7fe commit 683727c
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 4 deletions.
2 changes: 1 addition & 1 deletion kart/base_diff_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,7 @@ def get_file_diff(self):
def iter_deltadiff_items(self, deltas):
if self.sort_keys:
return deltas.sorted_items()
return deltas.items()
return deltas.iter_items()

def filtered_dataset_deltas(self, ds_path, ds_diff):
"""
Expand Down
81 changes: 78 additions & 3 deletions kart/diff_structs.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from collections import UserDict
from dataclasses import dataclass
from typing import Any
from typing import Any, Iterator

from .exceptions import InvalidOperation

Expand Down Expand Up @@ -335,8 +335,7 @@ def prune(self, recurse=True):
Deletes any empty RichDicts that are children of self.
If recurse is True, also deletes non-empty RichDicts, as long as they only contain empty RichDicts in the end.
"""
items = list(self.items())
for key, value in items:
for key, value in list(self.items()):
if key == "data_changes" and value == False and len(self) == 1:
del self[key]
if not isinstance(value, RichDict):
Expand Down Expand Up @@ -430,6 +429,10 @@ def __json__(self):
return {k: v for k, v in self.items()}


class InvalidatedDeltaDiff:
pass


class DeltaDiff(Diff):
"""
A DeltaDiff is the inner-most type of Diff, the one that actually contains Deltas.
Expand All @@ -439,16 +442,88 @@ class DeltaDiff(Diff):
child_type = Delta

def __init__(self, initial_contents=()):
self._lazy_initial_contents = None
if isinstance(initial_contents, (dict, UserDict)):
super().__init__(initial_contents)
else:
if isinstance(initial_contents, Iterator):
self._lazy_initial_contents = (
(delta.key, delta) for delta in initial_contents
)
initial_contents = ()
super().__init__((delta.key, delta) for delta in initial_contents)

def __getitem__(self, key):
if key in self.data:
return self.data[key]
self._evaluate_lazy_initial_contents()
return self.data[key]

def __setitem__(self, key, delta):
if key != delta.key:
raise ValueError("Delta must be added at the appropriate key")
super().__setitem__(key, delta)

def _evaluate_lazy_initial_contents(self):
if self._lazy_initial_contents is None:
return
for k, v in self._lazy_initial_contents:
if k not in self:
self[k] = v
self._lazy_initial_contents = None

def __bool__(self):
result = bool(self.data)
if (not result) and self._lazy_initial_contents:
# If the DeltaDiff is empty, but has lazy initial contents, evaluate the first item to check booleanness.
try:
k, v = next(self._lazy_initial_contents)
except StopIteration:
return False
else:
# remember this result
self.data[k] = v
return True
return result

def __len__(self):
self._evaluate_lazy_initial_contents()
return super().__len__()

def prune(self, recurse=True):
# DeltaDiff shouldn't need pruning. I think?
# It's complex to prune the lazy initial contents, and it's not clear that it's necessary.
pass

def items(self):
self._evaluate_lazy_initial_contents()
return super().items()

def iter_items(self):
"""
Iterates over the items in the DeltaDiff, including any lazy initial contents.
This method consumes the iterator without storing its contents. It's not safe to call this method and then call items() on the same object.
"""
yield from self.data.items()
if self._lazy_initial_contents:
for k, v in self._lazy_initial_contents:
if k not in self:
yield (k, v)

# Invalidate this DeltaDiff; it's not safe to consume it again after this.
self.data = InvalidatedDeltaDiff(
"DeltaDiff can't be used after iter_items() has been called"
)

def keys(self):
self._evaluate_lazy_initial_contents()
return super().keys()

def values(self):
self._evaluate_lazy_initial_contents()
return super().values()

def add_delta(self, delta):
"""Add the given delta at the appropriate key."""
super().__setitem__(delta.key, delta)
Expand Down

0 comments on commit 683727c

Please sign in to comment.