From 22b76add63dccad5d213cd2212cab94bde112c15 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Wed, 8 Jan 2025 17:26:37 +0100 Subject: [PATCH 1/5] Add arrow conversions for OffsetsBuffer --- src/offset.rs | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/src/offset.rs b/src/offset.rs index 80b45d6680..a724002671 100644 --- a/src/offset.rs +++ b/src/offset.rs @@ -541,3 +541,47 @@ impl std::ops::Deref for OffsetsBuffer { self.0.as_slice() } } + +/// arrow1 -> arrow2 +#[cfg(feature = "arrow")] +impl From> + for OffsetsBuffer +{ + fn from(offset_buffer2: arrow_buffer::OffsetBuffer) -> Self { + let buffer1: arrow_buffer::Buffer = offset_buffer2.into_inner().into_inner().into(); + // SAFETY: the input buffer is guaranteed to be valid + unsafe { Self::new_unchecked(buffer1.into()) } + } +} + +/// arrow2 -> arrow1 +#[cfg(feature = "arrow")] +impl From> + for arrow_buffer::OffsetBuffer +{ + fn from(offsets_buffer: OffsetsBuffer) -> Self { + let num_elements = offsets_buffer.len(); + Self::new(arrow_buffer::ScalarBuffer::new( + offsets_buffer.into_inner().into(), + 0, + num_elements, + )) + } +} + +#[cfg(feature = "arrow")] +#[test] +fn test_arrow_offsets_buffer_conversion() { + let mut arrow2_offsets = + OffsetsBuffer::::from(Offsets::try_from(vec![0, 1, 3, 3, 12, 42]).unwrap()); + arrow2_offsets.slice(1, 4); + + assert_eq!(arrow2_offsets.as_slice(), [1, 3, 3, 12]); + + let arrow1_offsets: arrow_buffer::OffsetBuffer = arrow2_offsets.clone().into(); + assert_eq!(arrow1_offsets.as_ref(), [1, 3, 3, 12]); + + let back_again = OffsetsBuffer::from(arrow1_offsets); + assert_eq!(back_again, arrow2_offsets); + assert_eq!(back_again.as_slice(), [1, 3, 3, 12]); +} From d287b8c986a666f4600c070b43910285dc67127e Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Wed, 8 Jan 2025 20:14:20 +0100 Subject: [PATCH 2/5] Add `Bitmap::from_arrow` --- src/bitmap/immutable.rs | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 
insertions(+) diff --git a/src/bitmap/immutable.rs b/src/bitmap/immutable.rs index 49a8f14e80..305cfa0cde 100644 --- a/src/bitmap/immutable.rs +++ b/src/bitmap/immutable.rs @@ -98,6 +98,17 @@ impl Bitmap { }) } + /// Convert from `arrow-rs` `NullBuffer` + #[cfg(feature = "arrow")] + pub fn from_arrow(nulls: arrow_buffer::buffer::NullBuffer) -> Self { + let offset = nulls.offset(); + let len = nulls.len(); + let null_count = nulls.null_count(); + let bytes = crate::buffer::to_bytes(nulls.into_inner().into_inner()); + // SAFETY: the invariants are held by the input + unsafe { Self::from_inner_unchecked(bytes.into(), offset, len, null_count) } + } + /// Returns the length of the [`Bitmap`]. #[inline] pub fn len(&self) -> usize { @@ -491,3 +502,32 @@ impl From for arrow_buffer::buffer::NullBuffer { unsafe { arrow_buffer::buffer::NullBuffer::new_unchecked(buffer, null_count) } } } + +// // Can't implement this because of `impl<P: AsRef<[bool]>> From<P> for Bitmap`
+// #[cfg(feature = "arrow")] +// impl From for Bitmap { +// fn from(value: arrow_buffer::buffer::NullBuffer) -> Self { +// let buffer = value.buffer.into(); +// let null_count = value.null_count; +// // Safety: null count is accurate +// unsafe { Self::from_unchecked(buffer, null_count) } +// } +// } + +#[cfg(feature = "arrow")] +#[test] +fn test_arrow_nullbuffer_conversion() { + let mut bitmap2 = Bitmap::from([false, true, false, false, true, false, false, false, true]); + bitmap2.slice(1, 6); + + assert_eq!( + bitmap2, + Bitmap::from([true, false, false, true, false, false]) + ); + + let nulls1 = arrow_buffer::buffer::NullBuffer::from(bitmap2.clone()); + assert_eq!(nulls1.null_count(), bitmap2.null_count()); + + let back_again = Bitmap::from_arrow(nulls1); + assert_eq!(back_again, bitmap2); +} From fe864b77465a2ff96019def62780da1772a5c4fd Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Wed, 8 Jan 2025 20:14:34 +0100 Subject: [PATCH 3/5] Add arrow conversions for `ListArray` --- src/array/list/mod.rs | 137 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 136 insertions(+), 1 deletion(-) diff --git a/src/array/list/mod.rs b/src/array/list/mod.rs index 55b4875cf7..6dbf0550ee 100644 --- a/src/array/list/mod.rs +++ b/src/array/list/mod.rs @@ -7,7 +7,7 @@ use crate::{ offset::{Offset, Offsets, OffsetsBuffer}, }; -use super::{new_empty_array, specification::try_check_offsets_bounds, Array}; +use super::{new_empty_array, specification::try_check_offsets_bounds, Array, PrimitiveArray}; #[cfg(feature = "arrow")] mod data; @@ -242,3 +242,138 @@ impl Array for ListArray { Box::new(self.clone().with_validity(validity)) } } + +/// arrow2 -> arrow1 conversion +#[cfg(feature = "arrow")] +impl From> + for arrow_array::GenericListArray +{ + fn from(value: ListArray) -> Self { + let field = ListArray::::get_child_field(&value.data_type()); + let field = Arc::new(arrow_schema::Field::new( + "item", + field.clone().data_type.into(), + field.is_nullable,
)); + let offsets = value.offsets().clone().into(); + let values = value.values().clone().into(); + let nulls = value.validity().map(|x| x.clone().into()); + Self::new(field, offsets, values, nulls) + } +} + +/// arrow1 -> arrow2 conversion +#[cfg(feature = "arrow")] +impl From> + for ListArray +{ + fn from(array1: arrow_array::GenericListArray) -> Self { + let (field1, offset_buffer1, array1, nulls1) = array1.into_parts(); + let data_type1 = field1.data_type().clone(); + Self::new( + Self::default_datatype(data_type1.into()), + offset_buffer1.into(), + array1.into(), + nulls1.map(Bitmap::from_arrow), + ) + } +} + +#[cfg(feature = "arrow")] +#[test] +fn test_arrow_list_array_conversion_non_null() { + /* + We build this: + + [0_001, 0_002], + [1_001, 1_002, 1_003], + [], + [3_001, 3_002], + [4_001], + */ + let offsets = OffsetsBuffer::::from(Offsets::try_from(vec![0, 2, 5, 5, 7, 8]).unwrap()); + let values = PrimitiveArray::::from_vec(vec![ + 0_001_i16, 0_002, // + 1_001, 1_002, 1_003, // + // + 3_001, 3_002, // + 4_001, + ]); + // let bitmap = Some(Bitmap::from([true, truefalse, true])); + let bitmap = None; + + let list_array = ListArray::new( + DataType::List(Arc::new(Field::new("item", DataType::Int16, true))), + offsets, + values.boxed(), + bitmap, + ); + + // Skip first and last elements: + let list_array = list_array.sliced(1, 3); + + assert_eq!(list_array.len(), 3); + assert_eq!(list_array.value(0).len(), 3); + assert_eq!(list_array.value(1).len(), 0); + assert_eq!(list_array.value(2).len(), 2); + + let list_array_1 = arrow_array::ListArray::from(list_array.clone()); + assert_eq!(list_array_1.value_length(0), 3); + assert_eq!(list_array_1.value_length(1), 0); + assert_eq!(list_array_1.value_length(2), 2); + + let roundtripped = ListArray::from(list_array_1); + + assert_eq!(list_array, roundtripped); +} + +#[cfg(feature = "arrow")] +#[test] +fn test_arrow_list_array_conversion_nullable() { + /* + We build this: + + [0_001, 0_002], + [1_001, 1_002, 1_003], + 
[], + [3_001, 3_002], + null, + [4_001], + */ + let offsets = OffsetsBuffer::::from(Offsets::try_from(vec![0, 2, 5, 5, 7, 7, 8]).unwrap()); + let values = PrimitiveArray::::from_vec(vec![ + 0_001_i16, 0_002, // + 1_001, 1_002, 1_003, // + // [] + 3_001, 3_002, // + // null + 4_001, + ]); + let bitmap = Some(Bitmap::from([true, true, true, true, false, true])); + + let list_array = ListArray::new( + DataType::List(Arc::new(Field::new("item", DataType::Int16, true))), + offsets, + values.boxed(), + bitmap, + ); + + // Skip first and last elements: + let list_array = list_array.sliced(1, 4); + + assert_eq!(list_array.len(), 4); + assert_eq!(list_array.value(0).len(), 3); + assert_eq!(list_array.value(1).len(), 0); + assert_eq!(list_array.value(2).len(), 2); + assert_eq!(list_array.value(3).len(), 0); // null + + let list_array_1 = arrow_array::ListArray::from(list_array.clone()); + assert_eq!(list_array_1.value_length(0), 3); + assert_eq!(list_array_1.value_length(1), 0); + assert_eq!(list_array_1.value_length(2), 2); + assert_eq!(list_array_1.value_length(3), 0); // null + + let roundtripped = ListArray::from(list_array_1); + + assert_eq!(list_array, roundtripped); +} From 0e4b3dd7cd73426b1209ebe0323087452a7c8b91 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Wed, 8 Jan 2025 20:21:37 +0100 Subject: [PATCH 4/5] Clippy fixes --- src/array/list/mod.rs | 4 +++- src/offset.rs | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/array/list/mod.rs b/src/array/list/mod.rs index 6dbf0550ee..d54c541910 100644 --- a/src/array/list/mod.rs +++ b/src/array/list/mod.rs @@ -249,7 +249,7 @@ impl From> for arrow_array::GenericListArray { fn from(value: ListArray) -> Self { - let field = ListArray::::get_child_field(&value.data_type()); + let field = ListArray::::get_child_field(value.data_type()); let field = Arc::new(arrow_schema::Field::new( "item", field.clone().data_type.into(), @@ -282,6 +282,7 @@ impl From From { fn from(offset_buffer2: 
arrow_buffer::OffsetBuffer) -> Self { - let buffer1: arrow_buffer::Buffer = offset_buffer2.into_inner().into_inner().into(); + let buffer1: arrow_buffer::Buffer = offset_buffer2.into_inner().into_inner(); // SAFETY: the input buffer is guaranteed to be valid unsafe { Self::new_unchecked(buffer1.into()) } } From c9b759ef8e3b267f12a6462bf94a0098a52143d3 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Thu, 9 Jan 2025 10:20:24 +0100 Subject: [PATCH 5/5] only clone the datatype --- src/array/list/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/array/list/mod.rs b/src/array/list/mod.rs index d54c541910..0c3d6a139c 100644 --- a/src/array/list/mod.rs +++ b/src/array/list/mod.rs @@ -252,7 +252,7 @@ impl From> let field = ListArray::::get_child_field(value.data_type()); let field = Arc::new(arrow_schema::Field::new( "item", - field.clone().data_type.into(), + field.data_type.clone().into(), field.is_nullable, )); let offsets = value.offsets().clone().into();