Skip to content

Commit

Permalink
[content-sensitive-splitter] Add max size to chunking code.
Browse files Browse the repository at this point in the history
To avoid very large chunks, which are less likely to deduplicate well,
cap the chunk size at 8 times the window_size.
  • Loading branch information
bmarzins committed Nov 20, 2024
1 parent 1d8d987 commit 90e338b
Showing 1 changed file with 7 additions and 2 deletions.
9 changes: 7 additions & 2 deletions src/content_sensitive_splitter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ impl ContentSensitiveSplitter {
let mut offset = 0;
let mut remainder = self.unconsumed_len as usize;
let min_size = self.window_size as usize / 4;
let max_size = self.window_size as usize * 8;
let ws = self.window_size as usize;

if remainder < min_size {
Expand All @@ -160,12 +161,16 @@ impl ContentSensitiveSplitter {
break;
}
}
if let Some(boundary) = self.hasher.next_match(&data[offset..], self.mask_l) {
let len_l = max_size - remainder;
let end = std::cmp::min(data.len(), offset + len_l);
if let Some(boundary) = self.hasher.next_match(&data[offset..end], self.mask_l) {
consumes.push(remainder + boundary);
offset += boundary + min_size;
remainder = min_size;
} else {
break;
consumes.push(end - offset + remainder);
offset = end + min_size;
remainder = min_size;
}
}

Expand Down

0 comments on commit 90e338b

Please sign in to comment.