diff --git a/Cargo.lock b/Cargo.lock index 4d9db0ae..a10df173 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -346,6 +346,30 @@ version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "02b4ff8b16e6076c3e14220b39fbc1fabb6737522281a388998046859400895f" +[[package]] +name = "bitbuffer" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "525586993a118417512a49bada2d143319310891f48b0b116c8f64fbb6486c87" +dependencies = [ + "bitbuffer_derive", + "err-derive", + "memchr", + "num-traits", +] + +[[package]] +name = "bitbuffer_derive" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "052a5a614540ae9bb7de25c2c86a94b6de7374cb7e3230f3128955bdaea62c3f" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", + "syn_util", +] + [[package]] name = "bitflags" version = "0.4.0" @@ -364,6 +388,12 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" +[[package]] +name = "bitter" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e5958a5c88910651ad16ab92586bce672368e90e4f49ad0dd32518d13b0a73d" + [[package]] name = "blake2" version = "0.10.6" @@ -726,6 +756,20 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +[[package]] +name = "err-derive" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c34a887c8df3ed90498c1c437ce21f211c8e27672921a8ffa293cb8d6d4caa9e" +dependencies = [ + "proc-macro-error", + "proc-macro2", + "quote", + "rustversion", + "syn 1.0.109", + "synstructure", +] + [[package]] name = "errno" version = "0.3.8" @@ -1316,6 +1360,8 @@ dependencies = [ name = "locustdb-compression-utils" version = "0.1.0" dependencies = [ + "bitbuffer", + "bitter", "log", "serde", "serde_json", @@ -2101,6 +2147,12 @@ dependencies = [ "untrusted", ] +[[package]] +name = "rustversion" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" + [[package]] name = "rustyline" version = "1.0.0" @@ -2446,6 +2498,17 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "syn_util" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6754c4559b79657554e9d8a0d56e65e490c76d382b9c23108364ec4125dea23c" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "sync_wrapper" version = "0.1.2" diff --git a/locustdb-compression-utils/Cargo.lock b/locustdb-compression-utils/Cargo.lock index d229c42a..0bd6647f 100644 --- a/locustdb-compression-utils/Cargo.lock +++ b/locustdb-compression-utils/Cargo.lock @@ -50,6 +50,42 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "autocfg" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80" + +[[package]] +name = "bitbuffer" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "525586993a118417512a49bada2d143319310891f48b0b116c8f64fbb6486c87" +dependencies = [ + "bitbuffer_derive", + "err-derive", + "memchr", + "num-traits", +] + +[[package]] +name = "bitbuffer_derive" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "052a5a614540ae9bb7de25c2c86a94b6de7374cb7e3230f3128955bdaea62c3f" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", + "syn_util", +] + +[[package]] +name = "bitter" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e5958a5c88910651ad16ab92586bce672368e90e4f49ad0dd32518d13b0a73d" + [[package]] name = "cfg-if" version = "1.0.0" @@ -87,7 +123,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn", + "syn 2.0.53", ] [[package]] @@ -102,6 +138,20 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +[[package]] +name = "err-derive" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c34a887c8df3ed90498c1c437ce21f211c8e27672921a8ffa293cb8d6d4caa9e" +dependencies = [ + "proc-macro-error", + "proc-macro2", + "quote", + "rustversion", + "syn 1.0.109", + "synstructure", +] + [[package]] name = "getrandom" version = "0.2.12" @@ -135,6 +185,8 @@ checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" name = "locustdb-compression-utils" version = "0.1.0" dependencies = [ + "bitbuffer", + "bitter", "clap", "log", "rand", @@ -148,12 +200,51 @@ version = "0.4.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" +[[package]] +name = "memchr" +version = "2.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" + +[[package]] +name = "num-traits" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" +dependencies = [ + "autocfg", +] + [[package]] name = "ppv-lite86" version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn 1.0.109", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + [[package]] name = "proc-macro2" version = "1.0.79" @@ -202,6 +293,12 @@ dependencies = [ "getrandom", ] +[[package]] +name = "rustversion" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" + [[package]] name = "ryu" version = "1.0.17" @@ -225,7 +322,7 @@ checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" dependencies = [ "proc-macro2", "quote", - "syn", + "syn 2.0.53", ] [[package]] @@ -245,6 +342,17 @@ version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ee073c9e4cd00e28217186dbe12796d692868f432bf2e97ee73bed0c56dfa01" +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + [[package]] name = "syn" version = "2.0.53" @@ -256,18 +364,53 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "syn_util" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6754c4559b79657554e9d8a0d56e65e490c76d382b9c23108364ec4125dea23c" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "synstructure" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", + "unicode-xid", +] + [[package]] name = "unicode-ident" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +[[package]] +name = "unicode-xid" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" + [[package]] name = "utf8parse" version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" diff --git a/locustdb-compression-utils/Cargo.toml b/locustdb-compression-utils/Cargo.toml index ee62067c..2672423b 100644 --- a/locustdb-compression-utils/Cargo.toml +++ b/locustdb-compression-utils/Cargo.toml @@ -11,6 +11,8 @@ license-file = "../LICENSE" serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" log = "0.4" +bitter = "0.6" +bitbuffer = "0.10" [dev-dependencies] clap = { version = "4", features = ["derive"] } diff --git a/locustdb-compression-utils/examples/gorilla_time.rs b/locustdb-compression-utils/examples/gorilla_time.rs index 21b88d49..3572bd49 100644 --- a/locustdb-compression-utils/examples/gorilla_time.rs +++ b/locustdb-compression-utils/examples/gorilla_time.rs @@ -50,7 +50,6 @@ fn main() { let opt = Opt::parse(); if let Some(mibibytes) = opt.benchmark { - assert!(!opt.single, "Benchmarking single precision is not supported"); assert!(opt.max_regret.len() == 1, "Benchmarking multiple max-regret values is not supported"); // create 1GiB of random floats let len = (1 << 20) * mibibytes / 8; @@ -59,7 +58,11 @@ fn main() { let start_time = std::time::Instant::now(); let mut fast_rng = rand::rngs::SmallRng::seed_from_u64(42); for _ in 0..len { - data.push(fast_rng.gen::()); + if opt.single { + data.push(fast_rng.gen::() as f64); + } else { + data.push(fast_rng.gen::()); + } } println!( "Generated {mibibytes} MiB of random data in {:?}", diff --git a/locustdb-compression-utils/src/bit_reader.rs b/locustdb-compression-utils/src/bit_reader.rs deleted file mode 100644 index 768d8595..00000000 --- a/locustdb-compression-utils/src/bit_reader.rs +++ /dev/null @@ -1,215 +0,0 @@ -use crate::bit_writer::Bit; - - -#[derive(Debug, PartialEq)] -pub enum Error { - Eof, -} - -/// BufferedReader -/// -/// BufferedReader encapsulates a buffer of bytes which can be read from. -#[derive(Debug)] -pub struct BitReader<'a> { - bytes: &'a [u8], // internal buffer of bytes - index: usize, // index into bytes - pos: u32, // position in the byte we are currenlty reading -} - -impl<'a> BitReader<'a> { - /// new creates a new `BufferedReader` from `bytes` - pub fn new(bytes: &'a [u8]) -> Self { - BitReader { - bytes, - index: 0, - pos: 0, - } - } - - fn get_byte(&mut self) -> Result { - self.bytes.get(self.index).cloned().ok_or(Error::Eof) - } - - pub fn read_bit(&mut self) -> Result { - if self.pos == 8 { - self.index += 1; - self.pos = 0; - } - - let byte = self.get_byte()?; - - let bit = if byte & 1u8.wrapping_shl(7 - self.pos) == 0 { - Bit::Zero - } else { - Bit::One - }; - - self.pos += 1; - - Ok(bit) - } - - pub fn read_byte(&mut self) -> Result { - if self.pos == 0 { - self.pos += 8; - return self.get_byte(); - } - - if self.pos == 8 { - self.index += 1; - return self.get_byte(); - } - - let mut byte = 0; - let mut b = self.get_byte()?; - - byte |= b.wrapping_shl(self.pos); - - self.index += 1; - b = self.get_byte()?; - - byte |= b.wrapping_shr(8 - self.pos); - - Ok(byte) - } - - pub fn read_bits(&mut self, mut num: u32) -> Result { - // can't read more than 64 bits into a u64 - if num > 64 { - num = 64; - } - - let mut bits: u64 = 0; - while num >= 8 { - let byte = self.read_byte().map(u64::from)?; - bits = bits.wrapping_shl(8) | byte; - num -= 8; - } - - while num > 0 { - self.read_bit() - .map(|bit| bits = bits.wrapping_shl(1) | bit.to_u64())?; - - num -= 1; - } - - Ok(bits) - } - - #[cfg(test)] - pub fn peak_bits(&mut self, num: u32) -> Result { - // save the current index and pos so we can reset them after calling `read_bits` - let index = self.index; - let pos = self.pos; - - let bits = self.read_bits(num)?; - - self.index = index; - self.pos = pos; - - Ok(bits) - } -} - -#[cfg(test)] -mod tests { - use crate::bit_reader::Error; - use crate::bit_writer::Bit; - - use super::BitReader; - - #[test] - fn read_bit() { - let bytes = vec![0b01101100, 0b11101001]; - let mut b = BitReader::new(&bytes); - - assert_eq!(b.read_bit().unwrap(), Bit::Zero); - assert_eq!(b.read_bit().unwrap(), Bit::One); - assert_eq!(b.read_bit().unwrap(), Bit::One); - assert_eq!(b.read_bit().unwrap(), Bit::Zero); - assert_eq!(b.read_bit().unwrap(), Bit::One); - assert_eq!(b.read_bit().unwrap(), Bit::One); - assert_eq!(b.read_bit().unwrap(), Bit::Zero); - assert_eq!(b.read_bit().unwrap(), Bit::Zero); - - assert_eq!(b.read_bit().unwrap(), Bit::One); - assert_eq!(b.read_bit().unwrap(), Bit::One); - assert_eq!(b.read_bit().unwrap(), Bit::One); - assert_eq!(b.read_bit().unwrap(), Bit::Zero); - assert_eq!(b.read_bit().unwrap(), Bit::One); - assert_eq!(b.read_bit().unwrap(), Bit::Zero); - assert_eq!(b.read_bit().unwrap(), Bit::Zero); - assert_eq!(b.read_bit().unwrap(), Bit::One); - - assert_eq!(b.read_bit().err().unwrap(), Error::Eof); - } - - #[test] - fn read_byte() { - let bytes = vec![100, 25, 0, 240, 240]; - let mut b = BitReader::new(&bytes); - - assert_eq!(b.read_byte().unwrap(), 100); - assert_eq!(b.read_byte().unwrap(), 25); - assert_eq!(b.read_byte().unwrap(), 0); - - // read some individual bits we can test `read_byte` when the position in the - // byte we are currently reading is non-zero - assert_eq!(b.read_bit().unwrap(), Bit::One); - assert_eq!(b.read_bit().unwrap(), Bit::One); - assert_eq!(b.read_bit().unwrap(), Bit::One); - assert_eq!(b.read_bit().unwrap(), Bit::One); - - assert_eq!(b.read_byte().unwrap(), 15); - - assert_eq!(b.read_byte().err().unwrap(), Error::Eof); - } - - #[test] - fn read_bits() { - let bytes = vec![0b01010111, 0b00011101, 0b11110101, 0b00010100]; - let mut b = BitReader::new(&bytes); - - assert_eq!(b.read_bits(3).unwrap(), 0b010); - assert_eq!(b.read_bits(1).unwrap(), 0b1); - assert_eq!(b.read_bits(20).unwrap(), 0b01110001110111110101); - assert_eq!(b.read_bits(8).unwrap(), 0b00010100); - assert_eq!(b.read_bits(4).err().unwrap(), Error::Eof); - } - - #[test] - fn read_mixed() { - let bytes = vec![0b01101101, 0b01101101]; - let mut b = BitReader::new(&bytes); - - assert_eq!(b.read_bit().unwrap(), Bit::Zero); - assert_eq!(b.read_bits(3).unwrap(), 0b110); - assert_eq!(b.read_byte().unwrap(), 0b11010110); - assert_eq!(b.read_bits(2).unwrap(), 0b11); - assert_eq!(b.read_bit().unwrap(), Bit::Zero); - assert_eq!(b.read_bits(1).unwrap(), 0b1); - assert_eq!(b.read_bit().err().unwrap(), Error::Eof); - } - - #[test] - fn peak_bits() { - let bytes = vec![0b01010111, 0b00011101, 0b11110101, 0b00010100]; - let mut b = BitReader::new(&bytes); - - assert_eq!(b.peak_bits(1).unwrap(), 0b0); - assert_eq!(b.peak_bits(4).unwrap(), 0b0101); - assert_eq!(b.peak_bits(8).unwrap(), 0b01010111); - assert_eq!(b.peak_bits(20).unwrap(), 0b01010111000111011111); - - // read some individual bits we can test `peak_bits` when the position in the - // byte we are currently reading is non-zero - assert_eq!(b.read_bits(12).unwrap(), 0b010101110001); - - assert_eq!(b.peak_bits(1).unwrap(), 0b1); - assert_eq!(b.peak_bits(4).unwrap(), 0b1101); - assert_eq!(b.peak_bits(8).unwrap(), 0b11011111); - assert_eq!(b.peak_bits(20).unwrap(), 0b11011111010100010100); - - assert_eq!(b.peak_bits(22).err().unwrap(), Error::Eof); - } -} diff --git a/locustdb-compression-utils/src/bit_writer.rs b/locustdb-compression-utils/src/bit_writer.rs deleted file mode 100644 index c4dc842c..00000000 --- a/locustdb-compression-utils/src/bit_writer.rs +++ /dev/null @@ -1,256 +0,0 @@ -/// BufferedWriter -/// -/// BufferedWriter writes bytes to a buffer. -#[derive(Debug, Default)] -pub struct BitWriter { - buf: Vec, - pos: u32, // position in the last byte in the buffer -} - -impl BitWriter { - /// new creates a new BufferedWriter - pub fn new() -> Self { - BitWriter { - buf: Vec::new(), - // set pos to 8 to indicate the buffer has no space presently since it is empty - pos: 8, - } - } - - fn grow(&mut self) { - self.buf.push(0); - } - - fn last_index(&self) -> usize { - self.buf.len() - 1 - } - - pub fn write_one(&mut self) { - if self.pos == 8 { - self.grow(); - self.pos = 0; - } - - let i = self.last_index(); - self.buf[i] |= 1u8.wrapping_shl(7 - self.pos); - self.pos += 1; - } - - pub fn write_zero(&mut self) { - if self.pos == 8 { - self.grow(); - self.pos = 0; - } - self.pos += 1; - } - - pub fn write_byte(&mut self, byte: u8) { - if self.pos == 8 { - self.grow(); - - let i = self.last_index(); - self.buf[i] = byte; - return; - } - - let i = self.last_index(); - let mut b = byte.wrapping_shr(self.pos); - self.buf[i] |= b; - - self.grow(); - - b = byte.wrapping_shl(8 - self.pos); - self.buf[i + 1] |= b; - } - - pub fn write_bits(&mut self, mut bits: u64, mut num: u32) { - // we should never write more than 64 bits for a u64 - if num > 64 { - num = 64; - } - - bits = bits.wrapping_shl(64 - num); - while num >= 8 { - let byte = bits.wrapping_shr(56); - self.write_byte(byte as u8); - - bits = bits.wrapping_shl(8); - num -= 8; - } - - while num > 0 { - let byte = bits.wrapping_shr(63); - if byte == 1 { - self.write_one(); - } else { - self.write_zero(); - } - - bits = bits.wrapping_shl(1); - num -= 1; - } - } - - pub fn close(self) -> Box<[u8]> { - self.buf.into_boxed_slice() - } -} - -#[cfg(test)] -mod tests { - use super::BitWriter; - - #[test] - fn write_bit() { - let mut b = BitWriter::new(); - - // 170 = 0b10101010 - for i in 0..8 { - if i % 2 == 0 { - b.write_one(); - continue; - } - - b.write_zero(); - } - - // 146 = 0b10010010 - for i in 0..8 { - if i % 3 == 0 { - b.write_one(); - continue; - } - - b.write_zero(); - } - - // 136 = 010001000 - for i in 0..8 { - if i % 4 == 0 { - b.write_one(); - continue; - } - - b.write_zero(); - } - - assert_eq!(b.buf.len(), 3); - - assert_eq!(b.buf[0], 170); - assert_eq!(b.buf[1], 146); - assert_eq!(b.buf[2], 136); - } - - #[test] - fn write_byte() { - let mut b = BitWriter::new(); - - b.write_byte(234); - b.write_byte(188); - b.write_byte(77); - - assert_eq!(b.buf.len(), 3); - - assert_eq!(b.buf[0], 234); - assert_eq!(b.buf[1], 188); - assert_eq!(b.buf[2], 77); - - // write some bits so we can test `write_byte` when the last byte is partially filled - b.write_one(); - b.write_one(); - b.write_one(); - b.write_one(); - b.write_byte(0b11110000); // 1111 1111 0000 - b.write_byte(0b00001111); // 1111 1111 0000 0000 1111 - b.write_byte(0b00001111); // 1111 1111 0000 0000 1111 0000 1111 - - assert_eq!(b.buf.len(), 7); - assert_eq!(b.buf[3], 255); // 0b11111111 = 255 - assert_eq!(b.buf[4], 0); // 0b00000000 = 0 - assert_eq!(b.buf[5], 240); // 0b11110000 = 240 - } - - #[test] - fn write_bits() { - let mut b = BitWriter::new(); - - // 101011 - b.write_bits(43, 6); - - // 010 - b.write_bits(2, 3); - - // 1 - b.write_bits(1, 1); - - // 1010 1100 1110 0011 1101 - b.write_bits(708157, 20); - - // 11 - b.write_bits(3, 2); - - assert_eq!(b.buf.len(), 4); - - assert_eq!(b.buf[0], 173); // 0b10101101 = 173 - assert_eq!(b.buf[1], 107); // 0b01101011 = 107 - assert_eq!(b.buf[2], 56); // 0b00111000 = 56 - assert_eq!(b.buf[3], 247); // 0b11110111 = 247 - } - - #[test] - fn write_mixed() { - let mut b = BitWriter::new(); - - // 1010 1010 - for i in 0..8 { - if i % 2 == 0 { - b.write_one(); - continue; - } - - b.write_zero(); - } - - // 0000 1001 - b.write_byte(9); - - // 1001 1100 1100 - b.write_bits(2508, 12); - - println!("{:?}", b.buf); - - // 1111 - for _ in 0..4 { - b.write_one(); - } - - assert_eq!(b.buf.len(), 4); - - println!("{:?}", b.buf); - - assert_eq!(b.buf[0], 170); // 0b10101010 = 170 - assert_eq!(b.buf[1], 9); // 0b00001001 = 9 - assert_eq!(b.buf[2], 156); // 0b10011100 = 156 - assert_eq!(b.buf[3], 207); // 0b11001111 = 207 - } -} - - -/// Bit -/// -/// An enum used to represent a single bit, can be either `Zero` or `One`. -#[derive(Debug, PartialEq)] -pub enum Bit { - Zero, - One, -} - -impl Bit { - /// Convert a bit to u64, so `Zero` becomes 0 and `One` becomes 1. - pub fn to_u64(&self) -> u64 { - match self { - Bit::Zero => 0, - Bit::One => 1, - } - } -} diff --git a/locustdb-compression-utils/src/column.rs b/locustdb-compression-utils/src/column.rs index a573985e..4e76a263 100644 --- a/locustdb-compression-utils/src/column.rs +++ b/locustdb-compression-utils/src/column.rs @@ -28,7 +28,7 @@ impl Column { let xor_compressed = xor_float::double::encode(&xs, 100, mantissa); // Require at least 1.5x compression to use xor if xor_compressed.len() * 6 > xor_compressed.len() { - Column::Xor(Vec::from(xor_compressed)) + Column::Xor(xor_compressed) } else { Column::Float(xs) } diff --git a/locustdb-compression-utils/src/lib.rs b/locustdb-compression-utils/src/lib.rs index e0afdc36..f1a55cf4 100644 --- a/locustdb-compression-utils/src/lib.rs +++ b/locustdb-compression-utils/src/lib.rs @@ -1,11 +1,4 @@ -// BitReader and BitWriter are adapted from https://github.com/jeromefroe/tsz-rs/tree/b3e2dce64707c42c10019d9159f88a0f594458af -mod bit_reader; -mod bit_writer; - -pub(crate) use bit_reader::BitReader; -pub(crate) use bit_writer::BitWriter; - pub mod xor_float; pub mod column; pub mod test_data; \ No newline at end of file diff --git a/locustdb-compression-utils/src/xor_float/double.rs b/locustdb-compression-utils/src/xor_float/double.rs index b906c260..78b39d52 100644 --- a/locustdb-compression-utils/src/xor_float/double.rs +++ b/locustdb-compression-utils/src/xor_float/double.rs @@ -1,14 +1,17 @@ -use crate::bit_reader::Error; -use crate::bit_writer::Bit; -use crate::{BitReader, BitWriter}; +use bitbuffer::{BigEndian, BitReadBuffer, BitReadStream, BitWriteStream}; -pub fn encode(floats: &[f64], max_regret: u32, mantissa: Option) -> Box<[u8]> { - let mut writer = BitWriter::new(); - writer.write_bits(floats.len() as u64, 64); +use super::Error; + + + +pub fn encode(floats: &[f64], max_regret: u32, mantissa: Option) -> Vec { + let mut write_bytes = vec![]; + let mut writer = BitWriteStream::new(&mut write_bytes, BigEndian); + writer.write_int(floats.len() as u64, 64).unwrap(); match floats.first() { - Some(first) => writer.write_bits(first.to_bits(), 64), - None => return writer.close(), - } + Some(first) => writer.write_int(first.to_bits(), 64).unwrap(), + None => return write_bytes, + }; let mut last_value = floats[0]; let mut last_leading_zeros = 65; let mut last_trailing_zeros = 65; @@ -27,67 +30,70 @@ pub fn encode(floats: &[f64], max_regret: u32, mantissa: Option) -> Box<[u8 let trailing_zeros = xor.trailing_zeros(); if trailing_zeros == 64 { - writer.write_zero(); + writer.write_int(0, 1).unwrap(); } else { let significant_bits = 64 - leading_zeros - trailing_zeros; if leading_zeros >= last_leading_zeros && trailing_zeros >= last_trailing_zeros && (regret < max_regret || significant_bits == last_significant_bits) { - writer.write_one(); - writer.write_zero(); + writer.write_int(0b10, 2).unwrap(); let xor = xor >> last_trailing_zeros; - writer.write_bits(xor, last_significant_bits); + writer.write_int(xor, last_significant_bits as usize).unwrap(); regret += last_significant_bits - significant_bits; } else { last_leading_zeros = leading_zeros; last_trailing_zeros = trailing_zeros; last_significant_bits = significant_bits; regret = 0; - writer.write_one(); - writer.write_one(); - writer.write_bits(leading_zeros as u64, 5); - writer.write_bits(significant_bits as u64 - 1, 6); + writer.write_int((0b11 << 11) | (leading_zeros << 6) | (significant_bits - 1), 13).unwrap(); let xor = xor >> last_trailing_zeros; - writer.write_bits(xor, significant_bits); + writer.write_int(xor, significant_bits as usize).unwrap(); } } last_value = f; } - writer.close() + // TODO: flush partial bits? + write_bytes } pub fn decode(data: &[u8]) -> Result, Error> { - let mut reader = BitReader::new(data); - let length = reader.read_bits(64)? as usize; + let buffer = BitReadBuffer::new(data, BigEndian); + let mut reader = BitReadStream::new(buffer); + let length = reader.read_int(64).map_err(|_| Error::Eof)?; let mut decoded = Vec::with_capacity(length); if length == 0 { return Ok(decoded); } - let first = reader.read_bits(64).unwrap(); + let first = reader.read_int(64).map_err(|_| Error::Eof)?; decoded.push(f64::from_bits(first)); let mut last = first; - let mut last_leading_zeros; - let mut last_trailing_zeros = 65; + let mut last_leading_zeros: u32; + let mut last_trailing_zeros = 65u32; let mut last_significant_bits = 0; for _ in 1..length { - match reader.read_bit()? { - Bit::Zero => { + //match reader.read_bit().ok_or(Error::Eof)? { + match reader.read_int::(1).unwrap() { + 0 => { decoded.push(f64::from_bits(last)); } - Bit::One => { - if let Bit::One = reader.read_bit()? { - last_leading_zeros = reader.read_bits(5)? as u32; - last_significant_bits = reader.read_bits(6)? as u32 + 1; + 1 => { + //if reader.read_bit().ok_or(Error::Eof)? { + if reader.read_int::(1).unwrap() == 1u8 { + last_leading_zeros = reader.read_int(5).unwrap(); + last_significant_bits = reader.read_int::(6).unwrap() + 1; last_trailing_zeros = 64 - last_leading_zeros - last_significant_bits; } - let xor = reader.read_bits(last_significant_bits)?; + let xor: u64 = reader.read_int(last_significant_bits as usize).unwrap(); last ^= xor << last_trailing_zeros; decoded.push(f64::from_bits(last)); } + _ => { + return Err(Error::Eof); + } } } @@ -95,8 +101,9 @@ pub fn decode(data: &[u8]) -> Result, Error> { } -pub fn verbose_encode(name: &str, floats: &[f64], max_regret: u32, mantissa: Option, verbose: bool) -> Box<[u8]> { - let mut writer = BitWriter::new(); +pub fn verbose_encode(name: &str, floats: &[f64], max_regret: u32, mantissa: Option, verbose: bool) -> Vec { + let mut write_bytes = vec![]; + let mut writer = BitWriteStream::new(&mut write_bytes, BigEndian); let mask = match mantissa { Some(mantissa) => { @@ -128,8 +135,8 @@ pub fn verbose_encode(name: &str, floats: &[f64], max_regret: u32, mantissa: Opt ); } - writer.write_bits(floats.len() as u64, 64); - writer.write_bits(floats[0].to_bits(), 64); + writer.write_int(floats.len(), 64).unwrap(); + writer.write_int(floats[0].to_bits(), 64).unwrap(); let mut last_value = floats[0]; let mut last_leading_zeros = 65; let mut last_trailing_zeros = 65; @@ -144,16 +151,15 @@ pub fn verbose_encode(name: &str, floats: &[f64], max_regret: u32, mantissa: Opt let mut bits_string = String::new(); if trailing_zeros == 64 { - writer.write_zero(); + writer.write_int(0, 1).unwrap(); bits_string.push_str("\x1b[1;31m0\x1b[0m"); } else { let significant_bits = 64 - leading_zeros - trailing_zeros; if leading_zeros >= last_leading_zeros && trailing_zeros >= last_trailing_zeros && (regret < max_regret || significant_bits == last_significant_bits) { - writer.write_one(); - writer.write_zero(); + writer.write_int(0b10, 2).unwrap(); bits_string.push_str("\x1b[1;31m10\x1b[0m"); let xor = xor >> last_trailing_zeros; - writer.write_bits(xor, last_significant_bits); + writer.write_int(xor, last_significant_bits as usize).unwrap(); if verbose { bits_string.push_str(&format!( "\x1b[1;33m{:0width$b}\x1b[0m", @@ -168,15 +174,14 @@ pub fn verbose_encode(name: &str, floats: &[f64], max_regret: u32, mantissa: Opt last_significant_bits = significant_bits; regret = 0; - writer.write_one(); - writer.write_one(); + writer.write_int(0b11, 2).unwrap(); bits_string.push_str("\x1b[1;31m11\x1b[0m"); - writer.write_bits(leading_zeros as u64, 5); + writer.write_int(leading_zeros, 5).unwrap(); bits_string.push_str(&format!("\x1b[1;32m{:05b}\x1b[0m", leading_zeros)); - writer.write_bits(significant_bits as u64 - 1, 6); + writer.write_int(significant_bits - 1, 6).unwrap(); bits_string.push_str(&format!("\x1b[1;34m{:06b}\x1b[0m", significant_bits - 1)); let xor = xor >> last_trailing_zeros; - writer.write_bits(xor, significant_bits); + writer.write_int(xor, significant_bits as usize).unwrap(); if verbose { bits_string.push_str(&format!( "\x1b[1;33m{:0width$b}\x1b[0m", @@ -200,12 +205,11 @@ pub fn verbose_encode(name: &str, floats: &[f64], max_regret: u32, mantissa: Opt // 8 bytes per value and 8 addtional bytes for the length let uncompressed_size = std::mem::size_of_val(floats) + 8; - let compressed = writer.close(); println!( "Compression ratio of {:.2} for {name} (max_regret={max_regret})", - uncompressed_size as f64 / compressed.len() as f64, + uncompressed_size as f64 / write_bytes.len() as f64, ); - compressed + write_bytes } diff --git a/locustdb-compression-utils/src/xor_float/mod.rs b/locustdb-compression-utils/src/xor_float/mod.rs index 2c92dec7..5e31ec33 100644 --- a/locustdb-compression-utils/src/xor_float/mod.rs +++ b/locustdb-compression-utils/src/xor_float/mod.rs @@ -1,3 +1,7 @@ pub mod double; pub mod single; +#[derive(Debug, PartialEq)] +pub enum Error { + Eof, +} \ No newline at end of file diff --git a/locustdb-compression-utils/src/xor_float/single.rs b/locustdb-compression-utils/src/xor_float/single.rs index 600b26ef..b3e9b3e5 100644 --- a/locustdb-compression-utils/src/xor_float/single.rs +++ b/locustdb-compression-utils/src/xor_float/single.rs @@ -1,11 +1,12 @@ -use crate::BitWriter; +use bitbuffer::{BigEndian, BitWriteStream}; -pub fn encode(floats: &[f32], max_regret: u32, mantissa: Option) -> Box<[u8]> { - let mut writer = BitWriter::new(); - writer.write_bits(floats.len() as u64, 64); +pub fn encode(floats: &[f32], max_regret: u32, mantissa: Option) -> Vec { + let mut write_bytes = vec![]; + let mut writer = BitWriteStream::new(&mut write_bytes, BigEndian); + writer.write_int(floats.len(), 64).unwrap(); match floats.first() { - Some(first) => writer.write_bits(first.to_bits() as u64, 64), - None => return writer.close(), + Some(first) => writer.write_int(first.to_bits(), 64).unwrap(), + None => return write_bytes, } let mut last_value = floats[0]; let mut last_leading_zeros = 65; @@ -24,39 +25,38 @@ pub fn encode(floats: &[f32], max_regret: u32, mantissa: Option) -> Box<[u8 let leading_zeros = xor.leading_zeros(); let trailing_zeros = xor.trailing_zeros(); if trailing_zeros == 64 { - writer.write_zero(); + writer.write_int(0, 1).unwrap(); } else { let significant_bits = 64 - leading_zeros - trailing_zeros; if leading_zeros >= last_leading_zeros && trailing_zeros >= last_trailing_zeros && (regret < max_regret || significant_bits == last_significant_bits) { - writer.write_one(); - writer.write_zero(); + writer.write_int(0b10, 2).unwrap(); let xor = xor >> last_trailing_zeros; - writer.write_bits(xor as u64, last_significant_bits); + writer.write_int(xor, last_significant_bits as usize).unwrap(); regret += last_significant_bits - significant_bits; } else { last_leading_zeros = leading_zeros; last_trailing_zeros = trailing_zeros; last_significant_bits = significant_bits; regret = 0; - writer.write_one(); - writer.write_one(); - writer.write_bits(leading_zeros as u64, 5); - writer.write_bits(significant_bits as u64, 5 - 1); + writer.write_int(0b11, 2).unwrap(); + writer.write_int(leading_zeros as u64, 5).unwrap(); + writer.write_int(significant_bits as u64 - 1, 6).unwrap(); let xor = xor >> last_trailing_zeros; - writer.write_bits(xor as u64, significant_bits); + writer.write_int(xor, significant_bits as usize).unwrap(); } } last_value = f; } - writer.close() + write_bytes } -pub fn verbose_encode(name: &str, floats: &[f32], max_regret: u32, mantissa: Option, verbose: bool) -> Box<[u8]> { - let mut writer = BitWriter::new(); +pub fn verbose_encode(name: &str, floats: &[f32], max_regret: u32, mantissa: Option, verbose: bool) -> Vec { + let mut write_bytes = vec![]; + let mut writer = BitWriteStream::new(&mut write_bytes, BigEndian); let mask = match mantissa { Some(mantissa) => { @@ -88,8 +88,8 @@ pub fn verbose_encode(name: &str, floats: &[f32], max_regret: u32, mantissa: Opt ); } - writer.write_bits(floats.len() as u64, 32); - writer.write_bits(floats[0].to_bits() as u64, 32); + writer.write_int(floats.len(), 64).unwrap(); + writer.write_int(floats[0].to_bits(), 32).unwrap(); let mut last_value = floats[0]; let mut last_leading_zeros = 65; let mut last_trailing_zeros = 65; @@ -104,16 +104,15 @@ pub fn verbose_encode(name: &str, floats: &[f32], max_regret: u32, mantissa: Opt let mut bits_string = String::new(); if trailing_zeros == 32 { - writer.write_zero(); + writer.write_int(0, 1).unwrap(); bits_string.push_str("\x1b[1;31m0\x1b[0m"); } else { let significant_bits = 32 - leading_zeros - trailing_zeros; if leading_zeros >= last_leading_zeros && trailing_zeros >= last_trailing_zeros && (regret < max_regret || significant_bits == last_significant_bits) { - writer.write_one(); - writer.write_zero(); + writer.write_int(0b10, 2).unwrap(); bits_string.push_str("\x1b[1;31m10\x1b[0m"); let xor = xor >> last_trailing_zeros; - writer.write_bits(xor as u64, last_significant_bits); + writer.write_int(xor, last_significant_bits as usize).unwrap(); if verbose { bits_string.push_str(&format!( "\x1b[1;33m{:0width$b}\x1b[0m", @@ -128,15 +127,14 @@ pub fn verbose_encode(name: &str, floats: &[f32], max_regret: u32, mantissa: Opt last_significant_bits = significant_bits; regret = 0; - writer.write_one(); - writer.write_one(); + writer.write_int(0b11, 2).unwrap(); bits_string.push_str("\x1b[1;31m11\x1b[0m"); - writer.write_bits(leading_zeros as u64, 5); + writer.write_int(leading_zeros as u64, 5).unwrap(); bits_string.push_str(&format!("\x1b[1;32m{:05b}\x1b[0m", leading_zeros)); - writer.write_bits(significant_bits as u64, 5); + writer.write_int(significant_bits as u64 - 1, 6).unwrap(); bits_string.push_str(&format!("\x1b[1;34m{:05b}\x1b[0m", significant_bits - 1)); let xor = xor >> last_trailing_zeros; - writer.write_bits(xor as u64, significant_bits - 1); + writer.write_int(xor, significant_bits as usize).unwrap(); if verbose { bits_string.push_str(&format!( "\x1b[1;33m{:0width$b}\x1b[0m", @@ -160,12 +158,11 @@ pub fn verbose_encode(name: &str, floats: &[f32], max_regret: u32, mantissa: Opt // 8 bytes per value and 8 addtional bytes for the length let uncompressed_size = floats.len() * std::mem::size_of::() + 8; - let compressed = writer.close(); println!( "Compression ratio of {:.2} for {name} (max_regret={max_regret})", - uncompressed_size as f64 / compressed.len() as f64, + uncompressed_size as f64 / write_bytes.len() as f64, ); - compressed + write_bytes }