From 1ce729454bf16d627903364173d9773f181e1fad Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski
Date: Fri, 8 Nov 2024 17:51:25 -0500
Subject: [PATCH 1/7] [chunkers] Use transpose() for Thin and Delta Chunker
 iterators

The match statement does the same thing as the transpose() method.
---
 src/chunkers.rs | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/src/chunkers.rs b/src/chunkers.rs
index 240393a..032e86c 100644
--- a/src/chunkers.rs
+++ b/src/chunkers.rs
@@ -141,12 +141,7 @@ impl Iterator for ThinChunker {
     type Item = Result<Chunk>;
 
     fn next(&mut self) -> Option<Self::Item> {
-        let mc = self.next_chunk();
-        match mc {
-            Err(e) => Some(Err(e)),
-            Ok(Some(c)) => Some(Ok(c)),
-            Ok(None) => None,
-        }
+        self.next_chunk().transpose()
     }
 }
 
@@ -223,12 +218,7 @@ impl Iterator for DeltaChunker {
     type Item = Result<Chunk>;
 
     fn next(&mut self) -> Option<Self::Item> {
-        let mc = self.next_chunk();
-        match mc {
-            Err(e) => Some(Err(e)),
-            Ok(Some(c)) => Some(Ok(c)),
-            Ok(None) => None,
-        }
+        self.next_chunk().transpose()
     }
 }

From a672a9d6603ab0a8abd049dd195685a867fdbf6d Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski
Date: Fri, 8 Nov 2024 17:37:29 -0500
Subject: [PATCH 2/7] [chunkers] Skip zeroing the Vec in
 ThickChunker.next_chunk()

Use take() and set_limit() to limit the amount of reading from the
input, and use read_to_end() to fill the vector, so the buffer no
longer has to be zeroed before each read.
---
 src/chunkers.rs | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/src/chunkers.rs b/src/chunkers.rs
index 032e86c..5286e03 100644
--- a/src/chunkers.rs
+++ b/src/chunkers.rs
@@ -1,8 +1,9 @@
-use anyhow::{anyhow, Context, Result};
+use anyhow::{anyhow, ensure, Context, Result};
 use io::prelude::*;
 use std::fs::File;
 use std::fs::OpenOptions;
 use std::io;
+use std::io::Take;
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
 use std::path::Path;
@@ -21,20 +22,21 @@ pub enum Chunk {
 }
 
 pub struct ThickChunker {
-    input: File,
+    input: Take<File>,
     input_size: u64,
     total_read: u64,
-    block_size: usize,
+    block_size: u64,
 }
 
 impl ThickChunker {
-    pub fn new(input_path: &Path, block_size: usize) -> Result<Self> {
+    pub fn new(input_path: &Path, block_size: u64) -> Result<Self> {
         let input_size = thinp::file_utils::file_size(input_path)?;
         let input = OpenOptions::new()
             .read(true)
             .write(false)
             .open(input_path)
-            .context("couldn't open input file/dev")?;
+            .context("couldn't open input file/dev")?
+            .take(0);
 
         Ok(Self {
             input,
@@ -44,9 +46,11 @@ impl ThickChunker {
         })
     }
 
-    // FIXME: stop reallocating and zeroing these buffers
-    fn do_read(&mut self, mut buffer: Vec<u8>) -> Result<Option<Chunk>> {
-        self.input.read_exact(&mut buffer)?;
+    fn do_read(&mut self, size: u64) -> Result<Option<Chunk>> {
+        let mut buffer = Vec::with_capacity(size as usize);
+        self.input.set_limit(size);
+        let read_size = self.input.read_to_end(&mut buffer)?;
+        ensure!(read_size == size as usize, "short read");
         self.total_read += buffer.len() as u64;
         Ok(Some(Chunk::Mapped(buffer)))
     }
@@ -56,12 +60,10 @@ impl ThickChunker {
 
         if remaining == 0 {
             Ok(None)
-        } else if remaining >= self.block_size as u64 {
-            let buf = vec![0; self.block_size];
-            self.do_read(buf)
+        } else if remaining >= self.block_size {
+            self.do_read(self.block_size)
         } else {
-            let buf = vec![0; remaining as usize];
-            self.do_read(buf)
+            self.do_read(remaining)
         }
     }
 }

From ba7d0935278f8635fee175e063a9f3d03859e1e7 Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski
Date: Wed, 20 Nov 2024 01:58:59 -0500
Subject: [PATCH 3/7] [slab] Skip duplicate assert in slab::writer_

---
 src/slab.rs | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/slab.rs b/src/slab.rs
index 249467c..d8349b0 100644
--- a/src/slab.rs
+++ b/src/slab.rs
@@ -119,7 +119,7 @@ impl SlabOffsets {
 // derived data, and can be rebuilt with the repair fn.
 //
 // file := *
-// header :=
+// header :=
 // slab :=
 
 const FILE_MAGIC: u64 = 0xb927f96a6b611180;
@@ -196,7 +196,6 @@ fn writer_(shared: Arc<Mutex<SlabShared>>, rx: Receiver<SlabData>) -> Result<()>
         let buf = rx.recv();
         if buf.is_err() {
             // all send ends have been closed, so we're done.
-            assert!(queued.is_empty());
             break;
         }
 

From 7e377c440d927e920463521be90d96b31beb89c4 Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski
Date: Wed, 20 Nov 2024 02:05:30 -0500
Subject: [PATCH 4/7] [content-sensitive-splitter] Don't compute min skip_size

An offset larger than data.len() works the same as an offset equal to
data.len(), so there is no need to clamp the skip size to the end of
the data.
---
 src/content_sensitive_splitter.rs | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/src/content_sensitive_splitter.rs b/src/content_sensitive_splitter.rs
index c07d448..a59486c 100644
--- a/src/content_sensitive_splitter.rs
+++ b/src/content_sensitive_splitter.rs
@@ -142,11 +142,8 @@ impl ContentSensitiveSplitter {
             let end = data.len();
             if let Some(boundary) = self.hasher.next_match(&data[offset..end], self.mask_s) {
                 consumes.push(remainder + boundary);
-                offset += boundary;
-
-                let skip_size = std::cmp::min(data.len() - offset, min_size);
-                offset += skip_size;
-                remainder = skip_size;
+                offset += boundary + min_size;
+                remainder = min_size;
                 continue;
             } else {
                 offset += ws;
@@ -159,11 +156,8 @@ impl ContentSensitiveSplitter {
 
             if let Some(boundary) = self.hasher.next_match(&data[offset..], self.mask_l) {
                 consumes.push(remainder + boundary);
-                offset += boundary;
-
-                let skip_size = std::cmp::min(data.len() - offset, min_size);
-                offset += skip_size;
-                remainder = skip_size;
+                offset += boundary + min_size;
+                remainder = min_size;
             } else {
                 break;
             }

From 528229fedc8dc94b5671724695e6cfe5225cd82e Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski
Date: Wed, 20 Nov 2024 02:14:35 -0500
Subject: [PATCH 5/7] [content-sensitive-splitter] Skip min_size bytes on
 first chunk

---
 src/content_sensitive_splitter.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/content_sensitive_splitter.rs b/src/content_sensitive_splitter.rs
index a59486c..7899a3d 100644
--- a/src/content_sensitive_splitter.rs
+++ b/src/content_sensitive_splitter.rs
@@ -138,6 +138,10 @@ impl ContentSensitiveSplitter {
         let min_size = self.window_size as usize / 4;
         let ws = self.window_size as usize;
 
+        if remainder < min_size {
+            offset += min_size - remainder;
+            remainder = min_size;
+        }
         while offset < data.len() {
             let end = data.len();
             if let Some(boundary) = self.hasher.next_match(&data[offset..end], self.mask_s) {

From 1d8d987ff6a6700db3af22129c75318ac071cd31 Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski
Date: Wed, 20 Nov 2024 03:08:27 -0500
Subject: [PATCH 6/7] [content-sensitive-splitter] Fix normalized chunking
 code

The code first searched for a chunk boundary using the more restrictive
mask over the whole input, starting from min_size. If it didn't find
one, it then searched using the less restrictive mask over the input
starting at window_size + min_size.

To normalize chunking around an average chunk size of window_size, the
code now first searches with the more restrictive mask from min_size to
window_size. If it doesn't find a boundary there, it searches with the
less restrictive mask starting at window_size. This keeps the code from
searching parts of the input twice, and makes it less likely to find
very large chunks.
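
For example, with a window_size of 8192 bytes (so min_size = 8192 / 4 =
2048), each chunk is now searched with mask_s only over chunk offsets
[2048, 8192), and with mask_l only from offset 8192 on. Previously,
mask_s scanned everything past offset 2048 and, on failure, mask_l
rescanned everything past offset 10240.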
---
 src/content_sensitive_splitter.rs | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/src/content_sensitive_splitter.rs b/src/content_sensitive_splitter.rs
index 7899a3d..4082986 100644
--- a/src/content_sensitive_splitter.rs
+++ b/src/content_sensitive_splitter.rs
@@ -143,21 +143,23 @@ impl ContentSensitiveSplitter {
             remainder = min_size;
         }
         while offset < data.len() {
-            let end = data.len();
-            if let Some(boundary) = self.hasher.next_match(&data[offset..end], self.mask_s) {
-                consumes.push(remainder + boundary);
-                offset += boundary + min_size;
-                remainder = min_size;
-                continue;
-            } else {
-                offset += ws;
-                remainder += ws;
-            }
+            let len_s = ws - remainder;
+            if len_s > 0 {
+                let end = std::cmp::min(data.len(), offset + len_s);
+                if let Some(boundary) = self.hasher.next_match(&data[offset..end], self.mask_s) {
+                    consumes.push(remainder + boundary);
+                    offset += boundary + min_size;
+                    remainder = min_size;
+                    continue;
+                } else {
+                    offset += len_s;
+                    remainder += len_s;
+                }
 
-            if offset >= data.len() {
-                break;
+                if offset >= data.len() {
+                    break;
+                }
             }
-
             if let Some(boundary) = self.hasher.next_match(&data[offset..], self.mask_l) {
                 consumes.push(remainder + boundary);
                 offset += boundary + min_size;

From 90e338bca86ce5ac80ecbd95279be53d92867ea6 Mon Sep 17 00:00:00 2001
From: Benjamin Marzinski
Date: Wed, 20 Nov 2024 03:52:58 -0500
Subject: [PATCH 7/7] [content-sensitive-splitter] Add max size to chunking
 code

In order to avoid very large chunks, which are less likely to
deduplicate well, cap the chunk size at 8 times the window_size.
---
 src/content_sensitive_splitter.rs | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/content_sensitive_splitter.rs b/src/content_sensitive_splitter.rs
index 4082986..a2e6d1e 100644
--- a/src/content_sensitive_splitter.rs
+++ b/src/content_sensitive_splitter.rs
@@ -136,6 +136,7 @@ impl ContentSensitiveSplitter {
         let mut offset = 0;
         let mut remainder = self.unconsumed_len as usize;
         let min_size = self.window_size as usize / 4;
+        let max_size = self.window_size as usize * 8;
         let ws = self.window_size as usize;
 
         if remainder < min_size {
@@ -160,12 +161,16 @@ impl ContentSensitiveSplitter {
                     break;
                 }
             }
-            if let Some(boundary) = self.hasher.next_match(&data[offset..], self.mask_l) {
+            let len_l = max_size - remainder;
+            let end = std::cmp::min(data.len(), offset + len_l);
+            if let Some(boundary) = self.hasher.next_match(&data[offset..end], self.mask_l) {
                 consumes.push(remainder + boundary);
                 offset += boundary + min_size;
                 remainder = min_size;
             } else {
-                break;
+                consumes.push(end - offset + remainder);
+                offset = end + min_size;
+                remainder = min_size;
             }
         }
 
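
A note on the combined effect of patches 5-7: every chunk the splitter
emits is now at least min_size (window_size / 4) bytes and at most
max_size (window_size * 8) bytes, except for a possibly short final
chunk. A small self-contained sketch of that invariant in Rust
(check_chunk_bounds is a hypothetical helper, not code from this
series):

    /// Assert the size bounds the splitter is meant to guarantee for
    /// every chunk except a possibly short final one.
    fn check_chunk_bounds(chunk_lens: &[usize], window_size: usize) {
        let min_size = window_size / 4;
        let max_size = window_size * 8;
        // The final chunk may be a short leftover, so skip it.
        let all_but_last = &chunk_lens[..chunk_lens.len().saturating_sub(1)];
        for &len in all_but_last {
            assert!(len >= min_size, "chunk shorter than min_size");
            assert!(len <= max_size, "chunk longer than max_size");
        }
    }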