diff --git a/RELEASE.md b/RELEASE.md index 90b8b1b95..cedeaa9c3 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,15 +1,10 @@ # Releasing firewood -Releasing firewood is straightforward and can be done entirely in CI. +Releasing firewood is straightforward and can be done entirely in CI. Firewood is made up of several sub-projects in a workspace. Each project is in its own crate and has an independent version. -* firewood -* storage -* fwdctl -* grpc-testtool - The first step in drafting a release is ensuring all crates within the firewood project are using the version of the new release. There is a utility to ensure all versions are updated simultaneously in `cargo-workspace-version`. To use it diff --git a/benchmark/src/main.rs b/benchmark/src/main.rs index 30d4aff6a..a4f2a9c45 100644 --- a/benchmark/src/main.rs +++ b/benchmark/src/main.rs @@ -36,9 +36,19 @@ struct Args { cache_size: NonZeroUsize, #[arg(short, long, default_value_t = 128)] revisions: usize, - #[arg(short = 'p', long, default_value_t = 3000)] + #[arg( + short = 'p', + long, + default_value_t = 3000, + help = "Port to listen for prometheus" + )] prometheus_port: u16, - #[arg(short = 's', long, default_value_t = false)] + #[arg( + short = 's', + long, + default_value_t = false, + help = "Dump prometheus stats on exit" + )] stats_dump: bool, #[clap(flatten)] diff --git a/firewood/benches/hashops.rs b/firewood/benches/hashops.rs index d875db632..b6992b563 100644 --- a/firewood/benches/hashops.rs +++ b/firewood/benches/hashops.rs @@ -14,7 +14,7 @@ use std::{fs::File, iter::repeat_with, os::raw::c_int, path::Path}; use storage::{MemStore, NodeStore}; // To enable flamegraph output -// cargo bench --bench shale-bench -- --profile-time=N +// cargo bench --bench hashops -- --profile-time=N enum FlamegraphProfiler { Init(c_int), Active(ProfilerGuard<'static>), @@ -137,8 +137,7 @@ fn bench_db(criterion: &mut Criterion) { criterion_group! 
{ name = benches; config = Criterion::default().with_profiler(FlamegraphProfiler::Init(100)); - // targets = bench_trie_hash, bench_merkle::<3, 32>, bench_db::<100> - targets = bench_merkle::<3, 4>, bench_merkle<3, 32>, bench_db<100> + targets = bench_merkle::<3, 4>, bench_merkle::<3, 32>, bench_db::<100> } criterion_main!(benches); diff --git a/storage/Cargo.toml b/storage/Cargo.toml index 95c16808b..adb8e161d 100644 --- a/storage/Cargo.toml +++ b/storage/Cargo.toml @@ -20,11 +20,14 @@ metrics = "0.24.0" log = { version = "0.4.20", optional = true } bytemuck = "1.7.0" bytemuck_derive = "1.7.0" +bitfield = "0.17.0" [dev-dependencies] rand = "0.8.5" test-case = "3.3.1" criterion = { version = "0.5.1", features = ["async_tokio", "html_reports"] } +pprof = { version = "0.14.0", features = ["flamegraph"] } +tempfile = "3.12.0" [features] logger = ["log"] diff --git a/storage/benches/serializer.rs b/storage/benches/serializer.rs index 75b0030e0..99f8e143c 100644 --- a/storage/benches/serializer.rs +++ b/storage/benches/serializer.rs @@ -1,31 +1,84 @@ // Copyright (C) 2023, Ava Labs, Inc. All rights reserved. // See the file LICENSE.md for licensing terms.
-use std::{array::from_fn, num::NonZeroU64}; +use std::{array::from_fn, fs::File, num::NonZeroU64, os::raw::c_int}; use bincode::Options; -use criterion::{criterion_group, criterion_main, Criterion}; +use criterion::{criterion_group, criterion_main, profiler::Profiler, Criterion}; +use pprof::ProfilerGuard; use smallvec::SmallVec; use storage::{LeafNode, Node, Path}; +use std::path::Path as FsPath; + +// For flamegraphs: +// cargo bench --bench serializer -- --profile-time=5 + +enum FlamegraphProfiler { + Init(c_int), + Active(ProfilerGuard<'static>), +} + +fn file_error_panic(path: &FsPath) -> impl FnOnce(T) -> U + '_ { + |_| panic!("Error on file `{}`", path.display()) +} + +impl Profiler for FlamegraphProfiler { + #[allow(clippy::unwrap_used)] + fn start_profiling(&mut self, _benchmark_id: &str, _benchmark_dir: &FsPath) { + if let Self::Init(frequency) = self { + let guard = ProfilerGuard::new(*frequency).unwrap(); + *self = Self::Active(guard); + } + } + + #[allow(clippy::unwrap_used)] + fn stop_profiling(&mut self, _benchmark_id: &str, benchmark_dir: &FsPath) { + std::fs::create_dir_all(benchmark_dir).unwrap(); + let filename = "firewood-flamegraph.svg"; + let flamegraph_path = benchmark_dir.join(filename); + #[allow(clippy::unwrap_used)] + let flamegraph_file = + File::create(&flamegraph_path).unwrap_or_else(file_error_panic(&flamegraph_path)); + + #[allow(clippy::unwrap_used)] + if let Self::Active(profiler) = self { + profiler + .report() + .build() + .unwrap() + .flamegraph(flamegraph_file) + .unwrap_or_else(file_error_panic(&flamegraph_path)); + } + } +} + fn leaf(c: &mut Criterion) { let mut group = c.benchmark_group("leaf"); let input = Node::Leaf(LeafNode { - partial_path: Path(SmallVec::from_slice(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])), + partial_path: Path(SmallVec::from_slice(&[0, 1])), value: SmallVec::from_slice(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), }); let serializer = bincode::DefaultOptions::new().with_varint_encoding(); - group.bench_with_input("leaf", 
&input, |b, input| { + group.bench_with_input("serde", &input, |b, input| { b.iter(|| { serializer.serialize(input).unwrap(); }) }); + + group.bench_with_input("manual", &input, |b, input| { + b.iter(|| { + let mut bytes = Vec::<u8>::new(); + input.as_bytes(0, &mut bytes); + }) + }); + group.finish(); } fn branch(c: &mut Criterion) { - let mut group = c.benchmark_group("branch"); + let mut group = c.benchmark_group("has_value"); let mut input = Node::Branch(Box::new(storage::BranchNode { - partial_path: Path(SmallVec::from_slice(&[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])), + partial_path: Path(SmallVec::from_slice(&[0, 1])), value: Some(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9].into_boxed_slice()), children: from_fn(|i| { if i == 0 { @@ -39,24 +92,46 @@ fn branch(c: &mut Criterion) { }), })); let serializer = bincode::DefaultOptions::new().with_varint_encoding(); - let benchfn = |b: &mut criterion::Bencher, input: &storage::Node| { + let serde_serializer = |b: &mut criterion::Bencher, input: &storage::Node| { b.iter(|| { serializer.serialize(input).unwrap(); }) }; - group.bench_with_input("1_child+has_value", &input, benchfn); + let manual_serializer = |b: &mut criterion::Bencher, input: &storage::Node| { + b.iter(|| { + let mut bytes = Vec::new(); + input.as_bytes(0, &mut bytes); + }) + }; + + group.bench_with_input("serde", &input, serde_serializer); + group.bench_with_input("manual", &input, manual_serializer); + group.finish(); + let mut group = c.benchmark_group("1_child"); input.as_branch_mut().unwrap().value = None; - group.bench_with_input("1_child", &input, benchfn); + group.bench_with_input("serde", &input, serde_serializer); + group.bench_with_input("manual", &input, manual_serializer); let child = input.as_branch().unwrap().children[0].clone(); + group.finish(); + let mut group = c.benchmark_group("2_child"); input.as_branch_mut().unwrap().children[1] = child.clone(); - group.bench_with_input("2_child", &input, benchfn); + group.bench_with_input("serde", &input,
serde_serializer); + group.bench_with_input("manual", &input, manual_serializer); + group.finish(); + let mut group = c.benchmark_group("16_child"); input.as_branch_mut().unwrap().children = std::array::from_fn(|_| child.clone()); - group.bench_with_input("16_child", &input, benchfn); + group.bench_with_input("serde", &input, serde_serializer); + group.bench_with_input("manual", &input, manual_serializer); + group.finish(); } -criterion_group!(serializers, leaf, branch); +criterion_group!( + name = serializers; + config = Criterion::default().with_profiler(FlamegraphProfiler::Init(100)); + targets = leaf, branch +); criterion_main!(serializers); diff --git a/storage/src/linear/filebacked.rs b/storage/src/linear/filebacked.rs index 529a56f5d..4e98919fb 100644 --- a/storage/src/linear/filebacked.rs +++ b/storage/src/linear/filebacked.rs @@ -53,9 +53,7 @@ impl FileBacked { impl ReadableStorage for FileBacked { fn stream_from(&self, addr: u64) -> Result, Error> { - let mut fd = self.fd.lock().expect("p"); - fd.seek(std::io::SeekFrom::Start(addr))?; - Ok(Box::new(fd.try_clone().expect("poisoned lock"))) + Ok(Box::new(PredictiveReader::new(self, addr))) } fn size(&self) -> Result { @@ -112,3 +110,119 @@ impl WritableStorage for FileBacked { guard.put(addr, next); } } + +/// A reader that can predictively read from a file, avoiding reading past boundaries, but reading in 1k chunks +struct PredictiveReader { + fd: File, + buffer: [u8; Self::PREDICTIVE_READ_BUFFER_SIZE], + offset: u64, + len: usize, + pos: usize, +} + +impl PredictiveReader { + const PREDICTIVE_READ_BUFFER_SIZE: usize = 1024; + + fn new(fb: &FileBacked, start: u64) -> Self { + let fd = fb + .fd + .lock() + .expect("poisoned lock") + .try_clone() + .expect("resource exhaustion"); + + Self { + fd, + buffer: [0u8; Self::PREDICTIVE_READ_BUFFER_SIZE], + offset: start, + len: 0, + pos: 0, + } + } +} + +impl Read for PredictiveReader { + fn read(&mut self, buf: &mut [u8]) -> Result { + if self.len == self.pos { + 
let bytes_left_in_page = Self::PREDICTIVE_READ_BUFFER_SIZE + - (self.offset % Self::PREDICTIVE_READ_BUFFER_SIZE as u64) as usize; + self.fd.seek(std::io::SeekFrom::Start(self.offset))?; + let read = self.fd.read(&mut self.buffer[..bytes_left_in_page])?; + self.offset += read as u64; + self.len = read; + self.pos = 0; + } + let max_to_return = std::cmp::min(buf.len(), self.len - self.pos); + buf[..max_to_return].copy_from_slice(&self.buffer[self.pos..self.pos + max_to_return]); + self.pos += max_to_return; + Ok(max_to_return) + } +} + +#[cfg(test)] +mod test { + use super::*; + use std::io::Write; + use tempfile::NamedTempFile; + + #[test] + fn basic_reader_test() { + let mut tf = NamedTempFile::new().unwrap(); + let path = tf.path().to_path_buf(); + let output = tf.as_file_mut(); + write!(output, "hello world").unwrap(); + + // whole thing at once, this is always less than 1K so it should + // read the whole thing in + let fb = FileBacked::new( + path, + NonZero::new(10).unwrap(), + NonZero::new(10).unwrap(), + false, + ) + .unwrap(); + let mut reader = fb.stream_from(0).unwrap(); + let mut buf: String = String::new(); + assert_eq!(reader.read_to_string(&mut buf).unwrap(), 11); + assert_eq!(buf, "hello world".to_string()); + assert_eq!(0, reader.read(&mut [0u8; 1]).unwrap()); + + // byte at a time + let mut reader = fb.stream_from(0).unwrap(); + for ch in b"hello world" { + let mut buf = [0u8; 1]; + let read = reader.read(&mut buf).unwrap(); + assert_eq!(read, 1); + assert_eq!(buf[0], *ch); + } + assert_eq!(0, reader.read(&mut [0u8; 1]).unwrap()); + + // with offset + let mut reader = fb.stream_from(6).unwrap(); + buf = String::new(); + assert_eq!(reader.read_to_string(&mut buf).unwrap(), 5); + assert_eq!(buf, "world".to_string()); + } + + #[test] + fn big_file() { + let mut tf = NamedTempFile::new().unwrap(); + let path = tf.path().to_path_buf(); + let output = tf.as_file_mut(); + for _ in 0..1000 { + write!(output, "hello world").unwrap(); + } + + let fb = 
FileBacked::new( + path, + NonZero::new(10).unwrap(), + NonZero::new(10).unwrap(), + false, + ) + .unwrap(); + let mut reader = fb.stream_from(0).unwrap(); + let mut buf: String = String::new(); + assert_eq!(reader.read_to_string(&mut buf).unwrap(), 11000); + assert_eq!(buf.len(), 11000); + } +} diff --git a/storage/src/node/branch.rs b/storage/src/node/branch.rs index 2248178bc..6aaf8bb82 100644 --- a/storage/src/node/branch.rs +++ b/storage/src/node/branch.rs @@ -8,6 +8,7 @@ use crate::{LeafNode, LinearAddress, Node, Path, TrieHash}; use std::fmt::{Debug, Error as FmtError, Formatter}; #[derive(PartialEq, Eq, Clone, Debug)] +#[repr(C)] /// A child of a branch node. pub enum Child { /// There is a child at this index, but we haven't hashed it @@ -42,7 +43,7 @@ impl Serialize for BranchNode { state.serialize_field("partial_path", &self.partial_path)?; state.serialize_field("value", &self.value)?; - let children: SmallVec<[(u8, LinearAddress, TrieHash); Self::MAX_CHILDREN]> = self + let children: SmallVec<[(u8, LinearAddress, &TrieHash); Self::MAX_CHILDREN]> = self .children .iter() .enumerate() @@ -51,9 +52,7 @@ impl Serialize for BranchNode { Some(Child::Node(_)) => { panic!("serializing in-memory node for disk storage") } - Some(Child::AddressWithHash(addr, hash)) => { - Some((offset as u8, *addr, (*hash).clone())) - } + Some(Child::AddressWithHash(addr, hash)) => Some((offset as u8, *addr, hash)), }) .collect(); @@ -92,18 +91,16 @@ impl<'de> Deserialize<'de> for BranchNode { impl Debug for BranchNode { fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), FmtError> { - write!(f, "[Branch")?; + write!(f, "[BranchNode")?; write!(f, r#" path="{:?}""#, self.partial_path)?; for (i, c) in self.children.iter().enumerate() { match c { None => {} Some(Child::Node(_)) => {} //TODO - Some(Child::AddressWithHash(addr, hash)) => write!( - f, - "(index: {i:?}), address={addr:?}, hash={:?})", - hex::encode(hash), - )?, + Some(Child::AddressWithHash(addr, hash)) => { + write!(f, 
"({i:?}: address={addr:?} hash={})", hex::encode(hash),)? + } } } diff --git a/storage/src/node/mod.rs b/storage/src/node/mod.rs index b6d068176..554b3f352 100644 --- a/storage/src/node/mod.rs +++ b/storage/src/node/mod.rs @@ -1,9 +1,14 @@ // Copyright (C) 2023, Ava Labs, Inc. All rights reserved. // See the file LICENSE.md for licensing terms. +use bitfield::bitfield; use enum_as_inner::EnumAsInner; +use integer_encoding::{VarIntReader as _, VarIntWriter as _}; use serde::{Deserialize, Serialize}; use smallvec::SmallVec; +use std::io::{Error, ErrorKind, Read, Write}; +use std::num::NonZero; +use std::vec; use std::{fmt::Debug, sync::Arc}; mod branch; @@ -20,6 +25,7 @@ use crate::Path; // TODO: explain why Branch is boxed but Leaf is not #[derive(PartialEq, Eq, Clone, Debug, EnumAsInner, Serialize, Deserialize)] +#[repr(C)] pub enum Node { /// This node is a [BranchNode] Branch(Box), @@ -48,6 +54,105 @@ impl From for Node { } } +#[cfg(not(feature = "branch_factor_256"))] +bitfield! { + struct BranchFirstByte(u8); + impl Debug; + impl new; + u8; + has_value, set_has_value: 1, 1; + number_children, set_number_children: 5, 2; + partial_path_length, set_partial_path_length: 7, 6; +} +#[cfg(not(feature = "branch_factor_256"))] +const MAX_ENCODED_PARTIAL_PATH_LEN: usize = 2; + +#[cfg(feature = "branch_factor_256")] +bitfield! { + struct BranchFirstByte(u8); + impl Debug; + impl new; + u8; + has_value, set_has_value: 1, 1; + partial_path_length, set_partial_path_length: 7, 2; +} +#[cfg(feature = "branch_factor_256")] +const MAX_ENCODED_PARTIAL_PATH_LEN: usize = 63; + +bitfield! 
{ + struct LeafFirstByte(u8); + impl Debug; + impl new; + u8; + is_leaf, set_is_leaf: 0, 0; + partial_path_length, set_partial_path_length: 7, 1; +} + +impl Default for LeafFirstByte { + fn default() -> Self { + LeafFirstByte(1) + } +} + +// TODO: Unstable extend_reserve re-implemented here +// Extend::extend_reserve is unstable so we implement it here +// see https://github.com/rust-lang/rust/issues/72631 +pub trait ExtendableBytes: Write { + fn extend>(&mut self, other: T); + fn reserve(&mut self, reserve: usize) { + let _ = reserve; + } + fn push(&mut self, value: u8); + + fn extend_from_slice(&mut self, other: &[u8]) { + self.extend(other.iter().copied()); + } +} + +impl ExtendableBytes for Vec { + fn extend>(&mut self, other: T) { + std::iter::Extend::extend(self, other); + } + fn reserve(&mut self, reserve: usize) { + self.reserve(reserve); + } + fn push(&mut self, value: u8) { + Vec::push(self, value); + } +} + +pub struct ByteCounter(u64); + +impl ByteCounter { + pub fn new() -> Self { + ByteCounter(0) + } + + pub fn count(&self) -> u64 { + self.0 + } +} + +impl Write for ByteCounter { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + self.0 += buf.len() as u64; + Ok(buf.len()) + } + + fn flush(&mut self) -> std::io::Result<()> { + Ok(()) + } +} + +impl ExtendableBytes for ByteCounter { + fn extend>(&mut self, other: T) { + self.0 += other.into_iter().count() as u64; + } + fn push(&mut self, _value: u8) { + self.0 += 1; + } +} + impl Node { /// Returns the partial path of the node. 
pub fn partial_path(&self) -> &Path { @@ -96,6 +201,250 @@ impl Node { Node::Leaf(l) => Some(&l.value), } } + + /// Given a [Node], returns a set of bytes to write to storage + /// The format is as follows: + /// + /// For a branch: + /// - Byte 0: + /// - Bit 0: always 0 + /// - Bit 1: indicates if the branch has a value + /// - Bits 2-5: the number of children (unless branch_factor_256, which stores it in the next byte) + /// - Bits 6-7: 0: empty partial_path, 1: 1 nibble, 2: 2 nibbles, 3: length is encoded in the next byte + /// (for branch_factor_256, bits 2-7 are used for partial_path length, up to 63 nibbles) + /// + /// The remaining bytes are in the following order: + /// - The partial path, possibly preceeded by the length if it is longer than 3 nibbles (varint encoded) + /// - The number of children, if the branch factor is 256 + /// - The children. If the number of children == [BranchNode::MAX_CHILDREN], then the children are just + /// addresses with hashes. Otherwise, they are offset, address, hash tuples. + /// + /// For a leaf: + /// - Byte 0: + /// - Bit 0: always 1 + /// - Bits 1-7: the length of the partial path. If the partial path is longer than 126 nibbles, this is set to + /// 126 and the length is encoded in the next byte. + /// + /// The remaining bytes are in the following order: + /// - The partial path, possibly preceeded by the length if it is longer than 126 nibbles (varint encoded) + /// - The value, always preceeded by the length, varint encoded + /// + /// Note that this means the first byte cannot be 255, which would be a leaf with 127 nibbles. We save this extra + /// value to mark this as a freed area. + /// + /// Note that there is a "prefix" byte which is the size of the area when serializing this object. Since + /// we always have one of those, we include it as a parameter for serialization. 
+ /// + /// TODO: We could pack two bytes of the partial path into one and handle the odd byte length + pub fn as_bytes(&self, prefix: u8, encoded: &mut T) { + match self { + Node::Branch(b) => { + let child_iter = b + .children + .iter() + .enumerate() + .filter_map(|(offset, child)| child.as_ref().map(|c| (offset, c))); + let childcount = child_iter.clone().count(); + + // encode the first byte + let pp_len = if b.partial_path.0.len() <= MAX_ENCODED_PARTIAL_PATH_LEN { + b.partial_path.0.len() as u8 + } else { + MAX_ENCODED_PARTIAL_PATH_LEN as u8 + 1 + }; + #[cfg(not(feature = "branch_factor_256"))] + let first_byte: BranchFirstByte = BranchFirstByte::new( + b.value.is_some() as u8, + (childcount % BranchNode::MAX_CHILDREN) as u8, + pp_len, + ); + #[cfg(feature = "branch_factor_256")] + let first_byte: BranchFirstByte = + BranchFirstByte::new(b.value.is_some() as u8, pp_len); + + // create an output stack item, which can overflow to memory for very large branch nodes + const OPTIMIZE_BRANCHES_FOR_SIZE: usize = 1024; + encoded.reserve(OPTIMIZE_BRANCHES_FOR_SIZE); + encoded.push(prefix); + encoded.push(first_byte.0); + #[cfg(feature = "branch_factor_256")] + encoded.extend_one((childcount % BranchNode::MAX_CHILDREN) as u8); + + // encode the partial path, including the length if it didn't fit above + if b.partial_path.0.len() > MAX_ENCODED_PARTIAL_PATH_LEN { + encoded + .write_varint(b.partial_path.len()) + .expect("writing to vec should succeed"); + } + encoded.extend_from_slice(&b.partial_path); + + // encode the value. 
For tries that have the same length keys, this is always empty + if let Some(v) = &b.value { + encoded + .write_varint(v.len()) + .expect("writing to vec should succeed"); + encoded.extend_from_slice(v); + } + + // encode the children + if childcount == BranchNode::MAX_CHILDREN { + for (_, child) in child_iter { + if let Child::AddressWithHash(address, hash) = child { + encoded.extend_from_slice(&address.get().to_ne_bytes()); + encoded.extend_from_slice(hash); + } else { + panic!("attempt to serialize to persist a branch with a child that is not an AddressWithHash"); + } + } + } else { + for (position, child) in child_iter { + encoded + .write_varint(position) + .expect("writing to vec should succeed"); + if let Child::AddressWithHash(address, hash) = child { + encoded.extend_from_slice(&address.get().to_ne_bytes()); + encoded.extend_from_slice(hash); + } else { + panic!("attempt to serialize to persist a branch with a child that is not an AddressWithHash"); + } + } + } + } + Node::Leaf(l) => { + let first_byte: LeafFirstByte = LeafFirstByte::new(1, l.partial_path.0.len() as u8); + + const OPTIMIZE_LEAVES_FOR_SIZE: usize = 128; + encoded.reserve(OPTIMIZE_LEAVES_FOR_SIZE); + encoded.push(prefix); + encoded.push(first_byte.0); + + // encode the partial path, including the length if it didn't fit above + if l.partial_path.0.len() >= 127 { + encoded + .write_varint(l.partial_path.len()) + .expect("write to array should succeed"); + } + encoded.extend_from_slice(&l.partial_path); + + // encode the value + encoded + .write_varint(l.value.len()) + .expect("write to array should succeed"); + encoded.extend_from_slice(&l.value); + } + } + } + + /// Given a reader, return a [Node] from those bytes + pub fn from_reader(mut serialized: impl Read) -> Result { + let mut first_byte: [u8; 1] = [0]; + serialized.read_exact(&mut first_byte)?; + match first_byte[0] { + 255 => { + // this is a freed area + Err(Error::new(ErrorKind::Other, "attempt to read freed area")) + } + 
leaf_first_byte if leaf_first_byte & 1 == 1 => { + let partial_path_len = if leaf_first_byte < 255 { + // less than 126 nibbles + LeafFirstByte(leaf_first_byte).partial_path_length() as usize + } else { + serialized.read_varint()? + }; + + let mut partial_path = vec![0u8; partial_path_len]; + serialized.read_exact(&mut partial_path)?; + + let mut value_len_buf = [0u8; 1]; + serialized.read_exact(&mut value_len_buf)?; + let value_len = value_len_buf[0] as usize; + + let mut value = vec![0u8; value_len]; + serialized.read_exact(&mut value)?; + + Ok(Node::Leaf(LeafNode { + partial_path: Path::from(partial_path), + value: value.into(), + })) + } + branch_first_byte => { + let branch_first_byte = BranchFirstByte(branch_first_byte); + + let has_value = branch_first_byte.has_value() == 1; + #[cfg(not(feature = "branch_factor_256"))] + let childcount = branch_first_byte.number_children() as usize; + #[cfg(feature = "branch_factor_256")] + let childcount = { + let mut childcount_buf = [0u8; 1]; + serialized.read_exact(&mut childcount_buf)?; + childcount_buf[0] as usize + }; + + let mut partial_path_len = branch_first_byte.partial_path_length() as usize; + if partial_path_len > MAX_ENCODED_PARTIAL_PATH_LEN { + partial_path_len = serialized.read_varint()?; + } + + let mut partial_path = vec![0u8; partial_path_len]; + serialized.read_exact(&mut partial_path)?; + + let value = if has_value { + let mut value_len_buf = [0u8; 1]; + serialized.read_exact(&mut value_len_buf)?; + let value_len = value_len_buf[0] as usize; + + let mut value = vec![0u8; value_len]; + serialized.read_exact(&mut value)?; + Some(value.into()) + } else { + None + }; + + let mut children = [const { None }; BranchNode::MAX_CHILDREN]; + if childcount == 0 { + // branch is full of all children + for child in children.iter_mut() { + // TODO: we can read them all at once + let mut address_buf = [0u8; 8]; + serialized.read_exact(&mut address_buf)?; + let address = u64::from_ne_bytes(address_buf); + + let mut hash 
= [0u8; 32]; + serialized.read_exact(&mut hash)?; + + *child = Some(Child::AddressWithHash( + NonZero::new(address).ok_or(Error::other("zero address in child"))?, + hash.into(), + )); + } + } else { + for _ in 0..childcount { + let mut position_buf = [0u8; 1]; + serialized.read_exact(&mut position_buf)?; + let position = position_buf[0] as usize; + + let mut address_buf = [0u8; 8]; + serialized.read_exact(&mut address_buf)?; + let address = u64::from_ne_bytes(address_buf); + + let mut hash = [0u8; 32]; + serialized.read_exact(&mut hash)?; + + children[position] = Some(Child::AddressWithHash( + NonZero::new(address).ok_or(Error::other("zero address in child"))?, + hash.into(), + )); + } + } + + Ok(Node::Branch(Box::new(BranchNode { + partial_path: partial_path.into(), + value, + children, + }))) + } + } + } } /// A path iterator item, which has the key nibbles up to this point, @@ -113,3 +462,52 @@ pub struct PathIterItem { /// None if `node` is the last node in the path. pub next_nibble: Option, } + +#[cfg(test)] + +mod test { + use crate::{ + node::{BranchNode, LeafNode, Node}, + Child, LinearAddress, Path, + }; + use test_case::test_case; + + #[test_case( + Node::Leaf(LeafNode { + partial_path: Path::from(vec![0, 1, 2, 3]), + value: vec![4, 5, 6, 7].into() + }), 11; "leaf node with value")] + #[test_case(Node::Branch(Box::new(BranchNode { + partial_path: Path::from(vec![0, 1]), + value: None, + children: std::array::from_fn(|i| { + if i == 15 { + Some(Child::AddressWithHash(LinearAddress::new(1).unwrap(), std::array::from_fn::(|i| i as u8).into())) + } else { + None + } + })})), 45; "one child branch node with short partial path and no value" + )] + #[test_case(Node::Branch(Box::new(BranchNode { + partial_path: Path::from(vec![0, 1, 2, 3]), + value: Some(vec![4, 5, 6, 7].into()), + children: std::array::from_fn(|_| + Some(Child::AddressWithHash(LinearAddress::new(1).unwrap(), std::array::from_fn::(|i| i as u8).into())) + )})), 652; "full branch node with long 
partial path and value" + )] + #[allow(unused_variables)] + fn test_serialize_deserialize(node: Node, expected_length: usize) { + use crate::node::Node; + use std::io::Cursor; + + let mut serialized = Vec::new(); + node.as_bytes(0, &mut serialized); + #[cfg(not(feature = "branch_factor_256"))] // TODO: enable this test for branch_factor_256 + assert_eq!(serialized.len(), expected_length); + let mut cursor = Cursor::new(&serialized); + cursor.set_position(1); + let deserialized = Node::from_reader(cursor).unwrap(); + + assert_eq!(node, deserialized); + } +} diff --git a/storage/src/nodestore.rs b/storage/src/nodestore.rs index e30e4d82f..ebe3641e6 100644 --- a/storage/src/nodestore.rs +++ b/storage/src/nodestore.rs @@ -59,7 +59,7 @@ use std::ops::Deref; use std::sync::Arc; use crate::hashednode::hash_node; -use crate::node::Node; +use crate::node::{ByteCounter, Node}; use crate::{Child, FileBacked, Path, ReadableStorage, TrieHash}; use super::linear::WritableStorage; @@ -167,14 +167,21 @@ fn area_size_to_index(n: u64) -> Result { pub type LinearAddress = NonZeroU64; /// Each [StoredArea] contains an [Area] which is either a [Node] or a [FreeArea]. + +#[repr(u8)] #[derive(PartialEq, Eq, Clone, Debug, Deserialize, Serialize)] enum Area { Node(T), - Free(U), + Free(U) = 255, // this is magic: no node starts with a byte of 255 } /// Every item stored in the [NodeStore]'s ReadableStorage after the /// [NodeStoreHeader] is a [StoredArea]. 
+/// +/// As an overview of what this looks like stored, we get something like this: +/// - Byte 0: The index of the area size +/// - Byte 1: 0x255 if free, otherwise the low-order bit indicates Branch or Leaf +/// - Bytes 2..n: The actual data #[derive(PartialEq, Eq, Clone, Debug, Deserialize, Serialize)] struct StoredArea { /// Index in [AREA_SIZES] of this area's size @@ -210,20 +217,11 @@ impl NodeStore { debug_assert!(addr.get() % 8 == 0); - let addr = addr.get() + 1; // Skip the index byte + let addr = addr.get() + 1; // skip the length byte let area_stream = self.storage.stream_from(addr)?; - let area: Area = serializer() - .deserialize_from(area_stream) - .map_err(|e| Error::new(ErrorKind::InvalidData, e))?; - - match area { - Area::Node(node) => Ok(node.into()), - Area::Free(_) => Err(Error::new( - ErrorKind::InvalidData, - "Attempted to read a freed area", - )), - } + let node = Node::from_reader(area_stream)?; + Ok(node.into()) } } @@ -482,9 +480,9 @@ impl NodeStore, S> { /// Returns the length of the serialized area for a node. fn stored_len(node: &Node) -> u64 { - let area: Area<&Node, FreeArea> = Area::Node(node); - - serializer().serialized_size(&area).expect("fixme") + 1 + let mut bytecounter = ByteCounter::new(); + node.as_bytes(0, &mut bytecounter); + bytecounter.count() } /// Returns an address that can be used to store the given `node` and updates @@ -927,53 +925,6 @@ impl NodeStore { } } -impl NodeStore { - /// Persist the freelist from this proposal to storage. - pub fn flush_freelist(&self) -> Result<(), Error> { - // Write the free lists to storage - let free_list_bytes = bytemuck::bytes_of(&self.header.free_lists); - let free_list_offset = offset_of!(NodeStoreHeader, free_lists) as u64; - self.storage.write(free_list_offset, free_list_bytes)?; - Ok(()) - } - - /// Persist all the nodes of a proposal to storage. 
- pub fn flush_nodes(&self) -> Result<(), Error> { - for (addr, (area_size_index, node)) in self.kind.new.iter() { - let stored_area = StoredArea { - area_size_index: *area_size_index, - area: Area::<_, FreeArea>::Node(node.as_ref()), - }; - - let stored_area_bytes = serializer() - .serialize(&stored_area) - .map_err(|e| Error::new(ErrorKind::InvalidData, e))?; - - self.storage - .write(addr.get(), stored_area_bytes.as_slice())?; - } - self.storage - .write_cached_nodes(self.kind.new.iter().map(|(addr, (_, node))| (addr, node)))?; - - Ok(()) - } -} - -impl NodeStore { - /// Return a Committed version of this proposal, which doesn't have any modified nodes. - /// This function is used during commit. - pub fn as_committed(&self) -> NodeStore { - NodeStore { - header: self.header, - kind: Committed { - deleted: self.kind.deleted.clone(), - root_hash: self.kind.root_hash.clone(), - }, - storage: self.storage.clone(), - } - } -} - impl NodeStore, S> { /// Persist the freelist from this proposal to storage. pub fn flush_freelist(&self) -> Result<(), Error> { @@ -987,18 +938,12 @@ impl NodeStore, S> { /// Persist all the nodes of a proposal to storage. 
pub fn flush_nodes(&self) -> Result<(), Error> { for (addr, (area_size_index, node)) in self.kind.new.iter() { - let stored_area = StoredArea { - area_size_index: *area_size_index, - area: Area::<_, FreeArea>::Node(node.as_ref()), - }; - - let stored_area_bytes = serializer() - .serialize(&stored_area) - .map_err(|e| Error::new(ErrorKind::InvalidData, e))?; - + let mut stored_area_bytes = Vec::new(); + node.as_bytes(*area_size_index, &mut stored_area_bytes); self.storage .write(addr.get(), stored_area_bytes.as_slice())?; } + self.storage .write_cached_nodes(self.kind.new.iter().map(|(addr, (_, node))| (addr, node)))?; @@ -1242,6 +1187,13 @@ mod tests { } }), }; "branch node with 1 child")] + #[test_case(BranchNode { + partial_path: Path::from([6, 7, 8]), + value: Some(vec![9, 10, 11].into_boxed_slice()), + children: from_fn(|_| + Some(Child::AddressWithHash(LinearAddress::new(1).unwrap(), std::array::from_fn::(|i| i as u8).into())) + ), + }; "branch node with all child")] #[test_case( Node::Leaf(LeafNode { partial_path: Path::from([0, 1, 2]), @@ -1251,14 +1203,15 @@ mod tests { fn test_serialized_len>(node: N) { let node = node.into(); - let area_size = NodeStore::, MemStore>::stored_len(&node); + let computed_length = + NodeStore::, MemStore>::stored_len(&node); - let area: Area<&Node, FreeArea> = Area::Node(&node); - let actually_serialized = serializer().serialize(&area).unwrap().len() as u64; - assert_eq!(area_size, actually_serialized + 1); + let mut serialized = Vec::new(); + node.as_bytes(0, &mut serialized); + assert_eq!(serialized.len() as u64, computed_length); } #[test] - #[should_panic(expected = "Node size 16777228 is too large")] + #[should_panic(expected = "Node size 16777225 is too large")] fn giant_node() { let memstore = MemStore::new(vec![]); let mut node_store = NodeStore::new_empty_proposal(memstore.into());