From 5911d6bf75a3e5cc07fe29aeedb3ab79f9bfa81f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9ophile=20BR=C3=89ZOT?=
 <theophile.brezot@cosmian.com>
Date: Tue, 28 Jan 2025 12:20:39 +0100
Subject: [PATCH] add generic encoding

---
 src/encoding.rs | 262 +++++++++++++++++++++++++++++++++++++-----------
 src/findex.rs   |   6 +-
 src/lib.rs      |   9 +-
 3 files changed, 213 insertions(+), 64 deletions(-)
diff --git a/src/encoding.rs b/src/encoding.rs
index 752faa61..afea712b 100644
--- a/src/encoding.rs
+++ b/src/encoding.rs
@@ -3,77 +3,212 @@
 //! deletion, but there is no theoretical restriction on the kind of operation
 //! that can be used.
 
-use std::{collections::HashSet, hash::Hash};
+#[cfg(any(test, feature = "test-utils"))]
+pub mod dummy_encoding {
+    use std::{collections::HashSet, hash::Hash};
 
-use crate::Op;
+    use crate::Op;
 
-/// Blocks are the smallest unit size in block mode, 16 bytes is optimized to store UUIDs.
-const BLOCK_LENGTH: usize = 16;
+    /// Blocks are the smallest unit size in block mode, 16 bytes is optimized to
+    /// store UUIDs.
+    const BLOCK_LENGTH: usize = 16;
 
-/// The chunk length is the size of the available space in a word.
-const CHUNK_LENGTH: usize = 8 * BLOCK_LENGTH;
+    /// The chunk length is the size of the available space in a word.
+    const CHUNK_LENGTH: usize = 8 * BLOCK_LENGTH;
 
-pub const WORD_LENGTH: usize = 1 + CHUNK_LENGTH;
+    pub const WORD_LENGTH: usize = 1 + CHUNK_LENGTH;
 
-pub fn dummy_encode<const WORD_LENGTH: usize, Value: AsRef<[u8]>>(
-    op: Op,
-    vs: HashSet<Value>,
-) -> Result<Vec<[u8; WORD_LENGTH]>, String> {
-    if (u8::MAX as usize) < WORD_LENGTH {
-        return Err("WORD_LENGTH too big for this encoding".to_string());
+    pub fn dummy_encode<const WORD_LENGTH: usize, Value: AsRef<[u8]>>(
+        op: Op,
+        vs: HashSet<Value>,
+    ) -> Result<Vec<[u8; WORD_LENGTH]>, String> {
+        if (u8::MAX as usize) < WORD_LENGTH {
+            return Err("WORD_LENGTH too big for this encoding".to_string());
+        }
+
+        vs.into_iter()
+            .map(|v| {
+                let bytes = v.as_ref();
+                if WORD_LENGTH - 2 < bytes.len() {
+                    return Err(format!(
+                        "insufficient bytes in a word to fit a value of length {}",
+                        bytes.len(),
+                    ));
+                }
+                let n = bytes.len() as u8;
+                let mut res = [0; WORD_LENGTH];
+                if op == Op::Insert {
+                    res[0] = 1;
+                } else {
+                    res[0] = 0;
+                }
+                res[1] = n;
+                res[2..bytes.len() + 2].copy_from_slice(bytes);
+                Ok(res)
+            })
+            .collect()
     }
 
-    vs.into_iter()
-        .map(|v| {
-            let bytes = v.as_ref();
-            if WORD_LENGTH - 2 < bytes.len() {
+    pub fn dummy_decode<const WORD_LENGTH: usize, TryFromError: std::error::Error, Value>(
+        ws: Vec<[u8; WORD_LENGTH]>,
+    ) -> Result<HashSet<Value>, String>
+    where
+        for<'z> Value: Hash + PartialEq + Eq + TryFrom<&'z [u8], Error = TryFromError>,
+    {
+        let mut res = HashSet::with_capacity(ws.len());
+        for w in ws {
+            if !w.is_empty() {
+                let n = <usize>::from(w[1]);
+                let v = Value::try_from(&w[2..n + 2]).map_err(|e| e.to_string())?;
+                if w[0] == 1 {
+                    res.insert(v);
+                } else {
+                    res.remove(&v);
+                }
+            }
+        }
+        Ok(res)
+    }
+}
+
+pub mod generic_encoding {
+    use crate::Op;
+    use std::{collections::HashSet, fmt::Debug, hash::Hash};
+
+    const MAX_VALUE_LENGTH: usize = (1 << 16) - 1;
+
+    pub fn generic_encode<const WORD_LENGTH: usize, Value: Debug + AsRef<[u8]>>(
+        op: Op,
+        vs: HashSet<Value>,
+    ) -> Result<Vec<[u8; WORD_LENGTH]>, String> {
+        let mut ws = Vec::<[u8; WORD_LENGTH]>::new();
+        let mut w = [0; WORD_LENGTH];
+        let mut pos = 0;
+
+        // Returns the metadata to be written alongside the given value.
+        let get_metadata = |v: &Value| {
+            if v.as_ref().len() > MAX_VALUE_LENGTH {
                 return Err(format!(
-                    "insufficient bytes in a word to fit a value of length {}",
-                    bytes.len(),
+                    "values bigger than {} bytes cannot be encoded",
+                    MAX_VALUE_LENGTH
                 ));
             }
-            let n = bytes.len() as u8;
-            let mut res = [0; WORD_LENGTH];
-            if op == Op::Insert {
-                res[0] = 1;
-            } else {
-                res[0] = 0;
+            let m = ((v.as_ref().len() as u16) << 1) + if Op::Insert == op { 1 } else { 0 };
+            Ok(m.to_be_bytes())
+        };
+
+        // Writes the given bytes to the current word `w` starting at the current
+        // position `pos`, overflowing into new words if the number of bytes to be
+        // written is larger than the remaining space in the current word. Pushes
+        // all completed word into `ws`.
+        let mut write_bytes = |mut w: [u8; WORD_LENGTH], mut pos: usize, mut bytes: &[u8]| {
+            if bytes.is_empty() {
+                return Err("cannot encode values of length 0".to_string());
             }
-            res[1] = n;
-            res[2..bytes.len() + 2].copy_from_slice(bytes);
-            Ok(res)
-        })
-        .collect()
-}
+            let available = |pos| WORD_LENGTH - pos;
+            // Gets the length of the available space.
+            loop {
+                if bytes.len() < available(pos) {
+                    w[pos..pos + bytes.len()].copy_from_slice(bytes);
+                    return Ok((w, pos + bytes.len()));
+                } else {
+                    w[pos..].copy_from_slice(&bytes[..available(pos)]);
+                    ws.push(w);
+                    bytes = &bytes[available(pos)..];
+                    w = [0; WORD_LENGTH];
+                    pos = 0;
+                }
+            }
+        };
+
+        for v in vs {
+            let metadata = get_metadata(&v)?;
+            (w, pos) = write_bytes(w, pos, &metadata)?;
+            (w, pos) = write_bytes(w, pos, v.as_ref())?;
+        }
 
-pub fn dummy_decode<const WORD_LENGTH: usize, TryFromError: std::error::Error, Value>(
-    ws: Vec<[u8; WORD_LENGTH]>,
-) -> Result<HashSet<Value>, String>
-where
-    for<'z> Value: Hash + PartialEq + Eq + TryFrom<&'z [u8], Error = TryFromError>,
-{
-    let mut res = HashSet::with_capacity(ws.len());
-    for w in ws {
-        if !w.is_empty() {
-            let n = <usize>::from(w[1]);
-            let v = Value::try_from(&w[2..n + 2]).map_err(|e| e.to_string())?;
-            if w[0] == 1 {
-                res.insert(v);
+        // Do not forget to push the current word if any byte were written to it.
+        if 0 != pos {
+            ws.push(w);
+        }
+
+        Ok(ws)
+    }
+
+    pub fn generic_decode<const WORD_LENGTH: usize, TryFromError: std::error::Error, Value>(
+        ws: Vec<[u8; WORD_LENGTH]>,
+    ) -> Result<HashSet<Value>, String>
+    where
+        for<'z> Value: Hash + PartialEq + Eq + TryFrom<&'z [u8], Error = TryFromError>,
+    {
+        let mut ws = ws.into_iter();
+        let mut vs = HashSet::<Value>::new();
+        let mut w = ws.next();
+        let mut pos = 0;
+
+        // Gets the length of the available space.
+        let available = |pos| WORD_LENGTH - pos;
+
+        // Attempts reading the next `n` bytes from the position `pos`.
+        let mut read_bytes = |mut n: usize, mut pos: usize| -> (Option<Vec<u8>>, usize) {
+            let mut bytes = Vec::<u8>::with_capacity(n);
+            loop {
+                if let Some(cur_w) = w {
+                    if n < available(pos) {
+                        cur_w[pos..pos + n].iter().for_each(|b| bytes.push(*b));
+                        pos += n;
+                        return (Some(bytes), pos);
+                    } else {
+                        cur_w[pos..].iter().for_each(|b| bytes.push(*b));
+                        n -= available(pos);
+                        w = ws.next();
+                        pos = 0;
+                    }
+                } else {
+                    // If there is no more words and not enough bytes could be read,
+                    // let the caller manage.
+                    return (None, pos);
+                }
+            }
+        };
+
+        while let (Some(m), new_pos) = read_bytes(2, pos) {
+            let m = <u16>::from_be_bytes([m[0], m[1]]);
+            let op = if 1 == m % 2 { Op::Insert } else { Op::Delete };
+            let n = (m >> 1) as usize; // safe conversion
+
+            if n != 0 {
+                if let (Some(bytes), new_pos) = read_bytes(n, new_pos) {
+                    pos = new_pos;
+                    let v = Value::try_from(&bytes).map_err(|e| e.to_string())?;
+                    if Op::Insert == op {
+                        vs.insert(v);
+                    } else {
+                        vs.remove(&v);
+                    }
+                } else {
+                    return Err(format!("cannot read {} bytes from the remaining words", n));
+                }
             } else {
-                res.remove(&v);
+                // In case no value bytes were read, only advance by one!
+                if 0 == new_pos {
+                    pos = new_pos;
+                } else {
+                    pos = new_pos - 1;
+                }
             }
         }
+
+        Ok(vs)
     }
-    Ok(res)
 }
 
-#[cfg(test)]
-mod tests {
-    use crate::{Decoder, Encoder};
+#[cfg(any(test, feature = "test-utils"))]
+pub mod tests {
+    use crate::{Decoder, Encoder, Op};
 
-    use super::*;
     use rand::{RngCore, thread_rng};
-    use std::fmt::Debug;
+    use std::{collections::HashSet, fmt::Debug, hash::Hash};
 
     /// Uses fuzzing to attempt asserting that: encode ∘ decode = identity.
     ///
@@ -91,7 +226,7 @@ mod tests {
     /// in chronological order, and attempt decoding the result of this
     /// operation, comparing this result against the expected set of values
     /// built from the raw decoded operations.
-    fn test_encoding<
+    pub fn test_encoding<
         // An upper-bound on the value length is needed for the dummy encoding.
         const MAX_VALUE_LENGTH: usize,
         // Values need to implement conversion from bytes to allow for a uniform
@@ -106,13 +241,13 @@ mod tests {
     ) {
         let mut rng = thread_rng();
 
-        // Draws a random number of operations in [2,12].
-        let n_ops = rng.next_u32() % 10 + 2;
+        // Draws a random number of operations in [0,10].
+        let n_ops = rng.next_u32() % 10;
 
         let ops = (0..n_ops)
             .map(|_| {
-                // Draws a random number of values in [10,100].
-                let n_vs = rng.next_u32() % 90 + 10;
+                // Draws a random number of values in [0,10].
+                let n_vs = rng.next_u32() % 10;
 
                 (
                     // draws a random operation
@@ -124,7 +259,7 @@ mod tests {
                     // draws random values
                     (0..n_vs)
                         .map(|_| {
-                            let len = rng.next_u32() as usize % MAX_VALUE_LENGTH;
+                            let len = rng.next_u32() as usize % MAX_VALUE_LENGTH + 1;
                             let mut bytes = vec![0; len];
                             rng.fill_bytes(&mut bytes);
                             Value::from(bytes)
@@ -162,9 +297,22 @@ mod tests {
 
     #[test]
     fn test_dummy_encoding() {
+        use super::dummy_encoding::*;
         test_encoding::<{ WORD_LENGTH - 2 }, _, _, _>(
             dummy_encode::<WORD_LENGTH, Vec<u8>>,
             dummy_decode,
         );
     }
+
+    #[test]
+    fn test_better_encoding() {
+        use super::generic_encoding::*;
+
+        const WORD_LENGTH: usize = 255;
+        const MAX_VALUE_LENGTH: usize = 2000;
+        test_encoding::<MAX_VALUE_LENGTH, _, _, _>(
+            generic_encode::<WORD_LENGTH, Vec<u8>>,
+            generic_decode,
+        );
+    }
 }
diff --git a/src/findex.rs b/src/findex.rs
index e54f43ad..fa58b7c7 100644
--- a/src/findex.rs
+++ b/src/findex.rs
@@ -123,10 +123,8 @@ mod tests {
     use rand_core::SeedableRng;
 
     use crate::{
-        ADDRESS_LENGTH, Findex, InMemory, IndexADT, Value,
-        address::Address,
-        encoding::{dummy_decode, dummy_encode},
-        secret::Secret,
+        ADDRESS_LENGTH, Findex, InMemory, IndexADT, Value, address::Address, dummy_decode,
+        dummy_encode, secret::Secret,
     };
 
     const WORD_LENGTH: usize = 16;
diff --git a/src/lib.rs b/src/lib.rs
index 6a042c78..6dd26179 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -2,7 +2,6 @@
 
 mod address;
 mod adt;
-#[cfg(any(test, feature = "test-utils"))]
 mod encoding;
 mod error;
 mod findex;
@@ -18,6 +17,7 @@ mod value;
 
 pub use address::Address;
 pub use adt::{IndexADT, MemoryADT};
+pub use encoding::generic_encoding::{generic_decode, generic_encode};
 pub use error::Error;
 pub use findex::Findex;
 pub use findex::Op;
@@ -27,8 +27,11 @@ pub use value::Value;
 #[cfg(feature = "redis-mem")]
 pub use memory::redis_store::{MemoryError, RedisMemory};
 
-#[cfg(feature = "test-utils")]
-pub use encoding::{WORD_LENGTH, dummy_decode, dummy_encode};
+#[cfg(any(test, feature = "test-utils"))]
+pub use encoding::{
+    dummy_encoding::{WORD_LENGTH, dummy_decode, dummy_encode},
+    tests::test_encoding,
+};
 
 #[cfg(any(test, feature = "test-utils"))]
 pub use memory::InMemory;