Skip to content

Commit

Permalink
Reduce slicing in xxh32/64 loops
Browse files Browse the repository at this point in the history
  • Loading branch information
DoumanAsh committed Feb 19, 2024
1 parent c557f2c commit 4e5c1f1
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 98 deletions.
85 changes: 39 additions & 46 deletions src/xxh32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,36 +49,45 @@ fn finalize(mut input: u32, mut data: &[u8], is_aligned: bool) -> u32 {
avalanche(input)
}

#[inline(always)]
const fn init_v(seed: u32) -> (u32, u32, u32, u32) {
(
seed.wrapping_add(PRIME_1).wrapping_add(PRIME_2),
seed.wrapping_add(PRIME_2),
seed,
seed.wrapping_sub(PRIME_1),
)
}

macro_rules! round_loop {
($input:ident => $($v:tt)+) => {unsafe {
$($v)+.0 = round($($v)+.0, read_le_unaligned($input.as_ptr()));
$($v)+.1 = round($($v)+.1, read_le_unaligned($input.as_ptr().add(4)));
$($v)+.2 = round($($v)+.2, read_le_unaligned($input.as_ptr().add(8)));
$($v)+.3 = round($($v)+.3, read_le_unaligned($input.as_ptr().add(12)));
$input = &$input[16..];
}}
}

///Returns hash for the provided input
pub fn xxh32(mut input: &[u8], seed: u32) -> u32 {
let mut result = input.len() as u32;

if input.len() >= CHUNK_SIZE {
let mut v1 = seed.wrapping_add(PRIME_1).wrapping_add(PRIME_2);
let mut v2 = seed.wrapping_add(PRIME_2);
let mut v3 = seed;
let mut v4 = seed.wrapping_sub(PRIME_1);
let mut v = init_v(seed);

loop {
v1 = round(v1, read_le_unaligned(input.as_ptr()));
input = &input[4..];
v2 = round(v2, read_le_unaligned(input.as_ptr()));
input = &input[4..];
v3 = round(v3, read_le_unaligned(input.as_ptr()));
input = &input[4..];
v4 = round(v4, read_le_unaligned(input.as_ptr()));
input = &input[4..];

round_loop!(input => v);
if input.len() < CHUNK_SIZE {
break;
}
}

result = result.wrapping_add(
v1.rotate_left(1).wrapping_add(
v2.rotate_left(7).wrapping_add(
v3.rotate_left(12).wrapping_add(
v4.rotate_left(18)
v.0.rotate_left(1).wrapping_add(
v.1.rotate_left(7).wrapping_add(
v.2.rotate_left(12).wrapping_add(
v.3.rotate_left(18)
)
)
)
Expand All @@ -95,10 +104,7 @@ pub fn xxh32(mut input: &[u8], seed: u32) -> u32 {
pub struct Xxh32 {
total_len: u32,
is_large_len: bool,
v1: u32,
v2: u32,
v3: u32,
v4: u32,
v: (u32, u32, u32, u32),
mem: [u32; 4],
mem_size: u32,
}
Expand All @@ -110,10 +116,7 @@ impl Xxh32 {
Self {
total_len: 0,
is_large_len: false,
v1: seed.wrapping_add(PRIME_1).wrapping_add(PRIME_2),
v2: seed.wrapping_add(PRIME_2),
v3: seed,
v4: seed.wrapping_sub(PRIME_1),
v: init_v(seed),
mem: [0, 0, 0, 0],
mem_size: 0,
}
Expand Down Expand Up @@ -141,10 +144,10 @@ impl Xxh32 {
ptr::copy_nonoverlapping(input.as_ptr(), (self.mem.as_mut_ptr() as *mut u8).offset(self.mem_size as isize), fill_len)
}

self.v1 = round(self.v1, self.mem[0].to_le());
self.v2 = round(self.v2, self.mem[1].to_le());
self.v3 = round(self.v3, self.mem[2].to_le());
self.v4 = round(self.v4, self.mem[3].to_le());
self.v.0 = round(self.v.0, self.mem[0].to_le());
self.v.1 = round(self.v.1, self.mem[1].to_le());
self.v.2 = round(self.v.2, self.mem[2].to_le());
self.v.3 = round(self.v.3, self.mem[3].to_le());

input = &input[fill_len..];
self.mem_size = 0;
Expand All @@ -155,14 +158,7 @@ impl Xxh32 {
//So it is questionable whether we want to allocate local vars here.
//Streaming version is likely to be used with relatively small chunks anyway.
loop {
self.v1 = round(self.v1, read_le_unaligned(input.as_ptr()));
input = &input[4..];
self.v2 = round(self.v2, read_le_unaligned(input.as_ptr()));
input = &input[4..];
self.v3 = round(self.v3, read_le_unaligned(input.as_ptr()));
input = &input[4..];
self.v4 = round(self.v4, read_le_unaligned(input.as_ptr()));
input = &input[4..];
round_loop!(input => self.v);

if input.len() < CHUNK_SIZE {
break;
Expand All @@ -184,16 +180,16 @@ impl Xxh32 {

if self.is_large_len {
result = result.wrapping_add(
self.v1.rotate_left(1).wrapping_add(
self.v2.rotate_left(7).wrapping_add(
self.v3.rotate_left(12).wrapping_add(
self.v4.rotate_left(18)
self.v.0.rotate_left(1).wrapping_add(
self.v.1.rotate_left(7).wrapping_add(
self.v.2.rotate_left(12).wrapping_add(
self.v.3.rotate_left(18)
)
)
)
);
} else {
result = result.wrapping_add(self.v3.wrapping_add(PRIME_5));
result = result.wrapping_add(self.v.2.wrapping_add(PRIME_5));
}

let input = unsafe {
Expand All @@ -208,10 +204,7 @@ impl Xxh32 {
pub fn reset(&mut self, seed: u32) {
self.total_len = 0;
self.is_large_len = false;
self.v1 = seed.wrapping_add(PRIME_1).wrapping_add(PRIME_2);
self.v2 = seed.wrapping_add(PRIME_2);
self.v3 = seed;
self.v4 = seed.wrapping_sub(PRIME_1);
self.v = init_v(seed);
self.mem_size = 0;
}
}
98 changes: 46 additions & 52 deletions src/xxh64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,40 +79,50 @@ fn finalize(mut input: u64, mut data: &[u8], is_aligned: bool) -> u64 {
avalanche(input)
}

#[inline(always)]
const fn init_v(seed: u64) -> (u64, u64, u64, u64) {
(
seed.wrapping_add(PRIME_1).wrapping_add(PRIME_2),
seed.wrapping_add(PRIME_2),
seed,
seed.wrapping_sub(PRIME_1),
)
}

macro_rules! round_loop {
($input:ident => $($v:tt)+) => {unsafe {
$($v)+.0 = round($($v)+.0, read_64le_unaligned($input.as_ptr()));
$($v)+.1 = round($($v)+.1, read_64le_unaligned($input.as_ptr().add(8)));
$($v)+.2 = round($($v)+.2, read_64le_unaligned($input.as_ptr().add(16)));
$($v)+.3 = round($($v)+.3, read_64le_unaligned($input.as_ptr().add(24)));
$input = &$input[32..];
}}
}

///Returns hash for the provided input.
pub fn xxh64(mut input: &[u8], seed: u64) -> u64 {
let input_len = input.len() as u64;
let mut result;

if input.len() >= CHUNK_SIZE {
let mut v1 = seed.wrapping_add(PRIME_1).wrapping_add(PRIME_2);
let mut v2 = seed.wrapping_add(PRIME_2);
let mut v3 = seed;
let mut v4 = seed.wrapping_sub(PRIME_1);
let mut v = init_v(seed);

loop {
v1 = round(v1, read_64le_unaligned(input.as_ptr()));
input = &input[8..];
v2 = round(v2, read_64le_unaligned(input.as_ptr()));
input = &input[8..];
v3 = round(v3, read_64le_unaligned(input.as_ptr()));
input = &input[8..];
v4 = round(v4, read_64le_unaligned(input.as_ptr()));
input = &input[8..];
round_loop!(input => v);

if input.len() < CHUNK_SIZE {
break;
}
}

result = v1.rotate_left(1).wrapping_add(v2.rotate_left(7))
.wrapping_add(v3.rotate_left(12))
.wrapping_add(v4.rotate_left(18));
result = v.0.rotate_left(1).wrapping_add(v.1.rotate_left(7))
.wrapping_add(v.2.rotate_left(12))
.wrapping_add(v.3.rotate_left(18));

result = merge_round(result, v1);
result = merge_round(result, v2);
result = merge_round(result, v3);
result = merge_round(result, v4);
result = merge_round(result, v.0);
result = merge_round(result, v.1);
result = merge_round(result, v.2);
result = merge_round(result, v.3);
} else {
result = seed.wrapping_add(PRIME_5)
}
Expand All @@ -126,10 +136,7 @@ pub fn xxh64(mut input: &[u8], seed: u64) -> u64 {
#[derive(Clone)]
pub struct Xxh64 {
total_len: u64,
v1: u64,
v2: u64,
v3: u64,
v4: u64,
v: (u64, u64, u64, u64),
mem: [u64; 4],
mem_size: u64,
}
Expand All @@ -140,10 +147,7 @@ impl Xxh64 {
pub const fn new(seed: u64) -> Self {
Self {
total_len: 0,
v1: seed.wrapping_add(PRIME_1).wrapping_add(PRIME_2),
v2: seed.wrapping_add(PRIME_2),
v3: seed,
v4: seed.wrapping_sub(PRIME_1),
v: init_v(seed),
mem: [0, 0, 0, 0],
mem_size: 0,
}
Expand All @@ -170,10 +174,10 @@ impl Xxh64 {
ptr::copy_nonoverlapping(input.as_ptr(), (self.mem.as_mut_ptr() as *mut u8).add(self.mem_size as usize), fill_len)
}

self.v1 = round(self.v1, self.mem[0].to_le());
self.v2 = round(self.v2, self.mem[1].to_le());
self.v3 = round(self.v3, self.mem[2].to_le());
self.v4 = round(self.v4, self.mem[3].to_le());
self.v.0 = round(self.v.0, self.mem[0].to_le());
self.v.1 = round(self.v.1, self.mem[1].to_le());
self.v.2 = round(self.v.2, self.mem[2].to_le());
self.v.3 = round(self.v.3, self.mem[3].to_le());

input = &input[fill_len..];
self.mem_size = 0;
Expand All @@ -184,14 +188,7 @@ impl Xxh64 {
//So it is questionable whether we want to allocate local vars here.
//Streaming version is likely to be used with relatively small chunks anyway.
loop {
self.v1 = round(self.v1, read_64le_unaligned(input.as_ptr()));
input = &input[8..];
self.v2 = round(self.v2, read_64le_unaligned(input.as_ptr()));
input = &input[8..];
self.v3 = round(self.v3, read_64le_unaligned(input.as_ptr()));
input = &input[8..];
self.v4 = round(self.v4, read_64le_unaligned(input.as_ptr()));
input = &input[8..];
round_loop!(input => self.v);

if input.len() < CHUNK_SIZE {
break;
Expand All @@ -212,16 +209,16 @@ impl Xxh64 {
let mut result;

if self.total_len >= CHUNK_SIZE as u64 {
result = self.v1.rotate_left(1).wrapping_add(self.v2.rotate_left(7))
.wrapping_add(self.v3.rotate_left(12))
.wrapping_add(self.v4.rotate_left(18));

result = merge_round(result, self.v1);
result = merge_round(result, self.v2);
result = merge_round(result, self.v3);
result = merge_round(result, self.v4);
result = self.v.0.rotate_left(1).wrapping_add(self.v.1.rotate_left(7))
.wrapping_add(self.v.2.rotate_left(12))
.wrapping_add(self.v.3.rotate_left(18));

result = merge_round(result, self.v.0);
result = merge_round(result, self.v.1);
result = merge_round(result, self.v.2);
result = merge_round(result, self.v.3);
} else {
result = self.v3.wrapping_add(PRIME_5)
result = self.v.2.wrapping_add(PRIME_5)
}

result = result.wrapping_add(self.total_len);
Expand All @@ -237,10 +234,7 @@ impl Xxh64 {
///Resets state with provided seed.
pub fn reset(&mut self, seed: u64) {
self.total_len = 0;
self.v1 = seed.wrapping_add(PRIME_1).wrapping_add(PRIME_2);
self.v2 = seed.wrapping_add(PRIME_2);
self.v3 = seed;
self.v4 = seed.wrapping_sub(PRIME_1);
self.v = init_v(seed);
self.mem_size = 0;
}
}
Expand Down

0 comments on commit 4e5c1f1

Please sign in to comment.