From a663446809eac95d2a1dd4d0189772ab1210fd2e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 29 Dec 2024 21:58:05 +0100 Subject: [PATCH 1/5] chore(deps): Bump serde from 1.0.216 to 1.0.217 (#860) Bumps [serde](https://github.com/serde-rs/serde) from 1.0.216 to 1.0.217. - [Release notes](https://github.com/serde-rs/serde/releases) - [Commits](https://github.com/serde-rs/serde/compare/v1.0.216...v1.0.217) --- updated-dependencies: - dependency-name: serde dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ddb0c77b8..7e5fb5bba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5060,9 +5060,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.216" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b9781016e935a97e8beecf0c933758c97a5520d32930e460142b4cd80c6338e" +checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" dependencies = [ "serde_derive", ] @@ -5078,9 +5078,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.216" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e" +checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", From c7ecf5d55df0e93007858d6c30e49b4982659b35 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 29 Dec 2024 21:58:31 +0100 Subject: [PATCH 2/5] chore(deps): Bump reqwest from 0.12.10 to 0.12.11 (#859) Bumps [reqwest](https://github.com/seanmonstar/reqwest) from 0.12.10 to 0.12.11. - [Release notes](https://github.com/seanmonstar/reqwest/releases) - [Changelog](https://github.com/seanmonstar/reqwest/blob/master/CHANGELOG.md) - [Commits](https://github.com/seanmonstar/reqwest/compare/v0.12.10...v0.12.11) --- updated-dependencies: - dependency-name: reqwest dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7e5fb5bba..a658c57d5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1365,7 +1365,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "117725a109d387c937a1533ce01b450cbde6b88abceea8473c4d7a85853cda3c" dependencies = [ "lazy_static", - "windows-sys 0.59.0", + "windows-sys 0.48.0", ] [[package]] @@ -4658,9 +4658,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.12.10" +version = "0.12.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d3536321cfc54baa8cf3e273d5e1f63f889067829c4b410fcdbac8ca7b80994" +checksum = "7fe060fe50f524be480214aba758c71f99f90ee8c83c5a36b5e9e1d568eb4eb3" dependencies = [ "base64 0.22.1", "bytes", @@ -6472,7 +6472,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.48.0", ] [[package]] From 94c433be5891f2c0653455c2736d8bac95818359 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 29 Dec 2024 21:58:57 +0100 Subject: [PATCH 3/5] chore(deps): Bump aws-sdk-s3tables from 1.1.0 to 1.2.0 (#858) Bumps [aws-sdk-s3tables](https://github.com/awslabs/aws-sdk-rust) from 1.1.0 to 1.2.0. - [Release notes](https://github.com/awslabs/aws-sdk-rust/releases) - [Commits](https://github.com/awslabs/aws-sdk-rust/commits) --- updated-dependencies: - dependency-name: aws-sdk-s3tables dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 8 ++++---- crates/catalog/s3tables/Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a658c57d5..8a456f274 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -670,9 +670,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.5.1" +version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "300a12520b4e6d08b73f77680f12c16e8ae43250d55100e0b2be46d78da16a48" +checksum = "44f6f1124d6e19ab6daf7f2e615644305dc6cb2d706892a8a8c0b98db35de020" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -717,9 +717,9 @@ dependencies = [ [[package]] name = "aws-sdk-s3tables" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc0cc08edc80d70edb091fad02537a719ed293ef871553ef8df192c92c415e4d" +checksum = "2111e5117b6e6bbe8c89ddca58e5c1339accc74a47757ab1e39db4f26999a426" dependencies = [ "aws-credential-types", "aws-runtime", diff --git a/crates/catalog/s3tables/Cargo.toml b/crates/catalog/s3tables/Cargo.toml index 64b332611..772b328f3 100644 --- a/crates/catalog/s3tables/Cargo.toml +++ b/crates/catalog/s3tables/Cargo.toml @@ -32,7 +32,7 @@ keywords = ["iceberg", "sql", "catalog"] anyhow = { workspace = true } async-trait = { workspace = true } aws-config = { workspace = true } -aws-sdk-s3tables = "1.0.0" +aws-sdk-s3tables = "1.2.0" iceberg = { workspace = true } serde_json = { workspace = true } uuid = { workspace = true, features = ["v4"] } From 044750f8909c4f42d486c3e0af59ffb37a0dcbc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=9E=97=E4=BC=9F?= Date: Mon, 30 Dec 2024 11:13:14 +0800 Subject: [PATCH 4/5] Add orbstack guide (#856) --- CONTRIBUTING.md | 2 +- docs/contributing/orbstack.md | 39 +++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 docs/contributing/orbstack.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 71a222e14..79a3e6ba6 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -116,7 +116,7 @@ cargo 1.69.0 (6e9a83356 2023-04-12) Currently, iceberg-rust uses Docker to set up environment for integration tests. Native Docker has some limitations, please check (https://github.com/apache/iceberg-rust/pull/748). Please use Orbstack or Podman. -For MacOS users, you can install [OrbStack](https://orbstack.dev/) as a docker alternative. +For MacOS users, you can install [OrbStack as a docker alternative](docs/contributing/orbstack.md). For podman, refer to [Using Podman instead of Docker](docs/contributing/podman.md) diff --git a/docs/contributing/orbstack.md b/docs/contributing/orbstack.md new file mode 100644 index 000000000..29eb09dc5 --- /dev/null +++ b/docs/contributing/orbstack.md @@ -0,0 +1,39 @@ + + +# OrbStack as a docker alternative on macOS +1. Install OrbStack by downloading [installer](https://orbstack.dev/download) or using Homebrew. + ```shell + brew install orbstack + ``` + +2. Migrate Docker data + ```shell + orbstack migrate docker + ``` + +3. (Optional) Add registry mirrors + + You can edit the config directly at `~/.orbstack/config/docker.json` and restart the engine with `orb restart docker`. + + ``` + { + "registry-mirrors": ["https://registry.docker.ir", "https://docker.iranserver.com"] + } + ``` \ No newline at end of file From 328e18e2bae552f42b28bcd2bf926df7cbc652b7 Mon Sep 17 00:00:00 2001 From: xxchan Date: Mon, 30 Dec 2024 23:24:08 +0800 Subject: [PATCH 5/5] feat: support metadata table "snapshots" (#822) --- Cargo.lock | 17 ++ Cargo.toml | 1 + crates/iceberg/Cargo.toml | 1 + crates/iceberg/src/lib.rs | 1 + crates/iceberg/src/metadata_scan.rs | 256 ++++++++++++++++++++++++++++ crates/iceberg/src/scan.rs | 9 +- crates/iceberg/src/spec/snapshot.rs | 12 ++ crates/iceberg/src/table.rs | 7 + 8 files changed, 300 insertions(+), 4 deletions(-) create mode 100644 crates/iceberg/src/metadata_scan.rs diff --git a/Cargo.lock b/Cargo.lock index 8a456f274..42388f85c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2135,6 +2135,12 @@ dependencies = [ "syn 2.0.92", ] +[[package]] +name = "dissimilar" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59f8e79d1fbf76bdfbde321e902714bf6c49df88a7dda6fc682fc2979226962d" + [[package]] name = "dlv-list" version = "0.5.2" @@ -2236,6 +2242,16 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "expect-test" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63af43ff4431e848fb47472a920f14fa71c24de13255a5692e93d4e90302acb0" +dependencies = [ + "dissimilar", + "once_cell", +] + [[package]] name = "fastrand" version = "2.3.0" @@ -2868,6 +2884,7 @@ dependencies = [ "chrono", "ctor", "derive_builder", + "expect-test", "fnv", "futures", "iceberg-catalog-memory", diff --git a/Cargo.toml b/Cargo.toml index b796308be..5b1dca422 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -101,3 +101,4 @@ volo-thrift = "0.10" hive_metastore = "0.1" tera = "1" zstd = "0.13.2" +expect-test = "1" diff --git a/crates/iceberg/Cargo.toml b/crates/iceberg/Cargo.toml index f84e7ab67..7f323722f 100644 --- a/crates/iceberg/Cargo.toml +++ b/crates/iceberg/Cargo.toml @@ -86,6 +86,7 @@ zstd = { workspace = true } [dev-dependencies] ctor = { workspace = true } +expect-test = { workspace = true } iceberg-catalog-memory = { workspace = true } iceberg_test_utils = { path = "../test_utils", features = ["tests"] } pretty_assertions = { workspace = true } diff --git a/crates/iceberg/src/lib.rs b/crates/iceberg/src/lib.rs index eaecfea60..1946f35f3 100644 --- a/crates/iceberg/src/lib.rs +++ b/crates/iceberg/src/lib.rs @@ -73,6 +73,7 @@ mod avro; pub mod io; pub mod spec; +pub mod metadata_scan; pub mod scan; pub mod expr; diff --git a/crates/iceberg/src/metadata_scan.rs b/crates/iceberg/src/metadata_scan.rs new file mode 100644 index 000000000..942d7605c --- /dev/null +++ b/crates/iceberg/src/metadata_scan.rs @@ -0,0 +1,256 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Metadata table api. + +use std::sync::Arc; + +use arrow_array::builder::{MapBuilder, PrimitiveBuilder, StringBuilder}; +use arrow_array::types::{Int64Type, TimestampMillisecondType}; +use arrow_array::RecordBatch; +use arrow_schema::{DataType, Field, Schema, TimeUnit}; + +use crate::spec::TableMetadata; +use crate::table::Table; +use crate::Result; + +/// Metadata table is used to inspect a table's history, snapshots, and other metadata as a table. +/// +/// References: +/// - +/// - +/// - +#[derive(Debug)] +pub struct MetadataTable(Table); + +impl MetadataTable { + /// Creates a new metadata scan. + pub(super) fn new(table: Table) -> Self { + Self(table) + } + + /// Get the snapshots table. + pub fn snapshots(&self) -> SnapshotsTable { + SnapshotsTable { + metadata_table: self, + } + } + + fn metadata(&self) -> &TableMetadata { + self.0.metadata() + } +} + +/// Snapshots table. +pub struct SnapshotsTable<'a> { + metadata_table: &'a MetadataTable, +} + +impl<'a> SnapshotsTable<'a> { + /// Returns the schema of the snapshots table. + pub fn schema(&self) -> Schema { + Schema::new(vec![ + Field::new( + "committed_at", + DataType::Timestamp(TimeUnit::Millisecond, Some("+00:00".into())), + false, + ), + Field::new("snapshot_id", DataType::Int64, false), + Field::new("parent_id", DataType::Int64, true), + Field::new("operation", DataType::Utf8, false), + Field::new("manifest_list", DataType::Utf8, false), + Field::new( + "summary", + DataType::Map( + Arc::new(Field::new( + "entries", + DataType::Struct( + vec![ + Field::new("keys", DataType::Utf8, false), + Field::new("values", DataType::Utf8, true), + ] + .into(), + ), + false, + )), + false, + ), + false, + ), + ]) + } + + /// Scans the snapshots table. + pub fn scan(&self) -> Result { + let mut committed_at = + PrimitiveBuilder::::new().with_timezone("+00:00"); + let mut snapshot_id = PrimitiveBuilder::::new(); + let mut parent_id = PrimitiveBuilder::::new(); + let mut operation = StringBuilder::new(); + let mut manifest_list = StringBuilder::new(); + let mut summary = MapBuilder::new(None, StringBuilder::new(), StringBuilder::new()); + + for snapshot in self.metadata_table.metadata().snapshots() { + committed_at.append_value(snapshot.timestamp_ms()); + snapshot_id.append_value(snapshot.snapshot_id()); + parent_id.append_option(snapshot.parent_snapshot_id()); + manifest_list.append_value(snapshot.manifest_list()); + operation.append_value(snapshot.summary().operation.as_str()); + for (key, value) in &snapshot.summary().additional_properties { + summary.keys().append_value(key); + summary.values().append_value(value); + } + summary.append(true)?; + } + + Ok(RecordBatch::try_new(Arc::new(self.schema()), vec![ + Arc::new(committed_at.finish()), + Arc::new(snapshot_id.finish()), + Arc::new(parent_id.finish()), + Arc::new(operation.finish()), + Arc::new(manifest_list.finish()), + Arc::new(summary.finish()), + ])?) + } +} + +#[cfg(test)] +mod tests { + use expect_test::{expect, Expect}; + use itertools::Itertools; + + use super::*; + use crate::scan::tests::TableTestFixture; + + /// Snapshot testing to check the resulting record batch. + /// + /// - `expected_schema/data`: put `expect![[""]]` as a placeholder, + /// and then run test with `UPDATE_EXPECT=1 cargo test` to automatically update the result, + /// or use rust-analyzer (see [video](https://github.com/rust-analyzer/expect-test)). + /// Check the doc of [`expect_test`] for more details. + /// - `ignore_check_columns`: Some columns are not stable, so we can skip them. + /// - `sort_column`: The order of the data might be non-deterministic, so we can sort it by a column. + fn check_record_batch( + record_batch: RecordBatch, + expected_schema: Expect, + expected_data: Expect, + ignore_check_columns: &[&str], + sort_column: Option<&str>, + ) { + let mut columns = record_batch.columns().to_vec(); + if let Some(sort_column) = sort_column { + let column = record_batch.column_by_name(sort_column).unwrap(); + let indices = arrow_ord::sort::sort_to_indices(column, None, None).unwrap(); + columns = columns + .iter() + .map(|column| arrow_select::take::take(column.as_ref(), &indices, None).unwrap()) + .collect_vec(); + } + + expected_schema.assert_eq(&format!( + "{}", + record_batch.schema().fields().iter().format(",\n") + )); + expected_data.assert_eq(&format!( + "{}", + record_batch + .schema() + .fields() + .iter() + .zip_eq(columns) + .map(|(field, column)| { + if ignore_check_columns.contains(&field.name().as_str()) { + format!("{}: (skipped)", field.name()) + } else { + format!("{}: {:?}", field.name(), column) + } + }) + .format(",\n") + )); + } + + #[test] + fn test_snapshots_table() { + let table = TableTestFixture::new().table; + let record_batch = table.metadata_table().snapshots().scan().unwrap(); + check_record_batch( + record_batch, + expect![[r#" + Field { name: "committed_at", data_type: Timestamp(Millisecond, Some("+00:00")), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, + Field { name: "snapshot_id", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, + Field { name: "parent_id", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, + Field { name: "operation", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, + Field { name: "manifest_list", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, + Field { name: "summary", data_type: Map(Field { name: "entries", data_type: Struct([Field { name: "keys", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "values", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }"#]], + expect![[r#" + committed_at: PrimitiveArray + [ + 2018-01-04T21:22:35.770+00:00, + 2019-04-12T20:29:15.770+00:00, + ], + snapshot_id: PrimitiveArray + [ + 3051729675574597004, + 3055729675574597004, + ], + parent_id: PrimitiveArray + [ + null, + 3051729675574597004, + ], + operation: StringArray + [ + "append", + "append", + ], + manifest_list: (skipped), + summary: MapArray + [ + StructArray + -- validity: + [ + ] + [ + -- child 0: "keys" (Utf8) + StringArray + [ + ] + -- child 1: "values" (Utf8) + StringArray + [ + ] + ], + StructArray + -- validity: + [ + ] + [ + -- child 0: "keys" (Utf8) + StringArray + [ + ] + -- child 1: "values" (Utf8) + StringArray + [ + ] + ], + ]"#]], + &["manifest_list"], + Some("committed_at"), + ); + } +} diff --git a/crates/iceberg/src/scan.rs b/crates/iceberg/src/scan.rs index 7a100b346..cb3e5d8c8 100644 --- a/crates/iceberg/src/scan.rs +++ b/crates/iceberg/src/scan.rs @@ -961,7 +961,7 @@ impl FileScanTask { } #[cfg(test)] -mod tests { +pub mod tests { use std::collections::HashMap; use std::fs; use std::fs::File; @@ -990,13 +990,14 @@ mod tests { use crate::table::Table; use crate::TableIdent; - struct TableTestFixture { + pub struct TableTestFixture { table_location: String, - table: Table, + pub table: Table, } impl TableTestFixture { - fn new() -> Self { + #[allow(clippy::new_without_default)] + pub fn new() -> Self { let tmp_dir = TempDir::new().unwrap(); let table_location = tmp_dir.path().join("table1"); let manifest_list1_location = table_location.join("metadata/manifests_list_1.avro"); diff --git a/crates/iceberg/src/spec/snapshot.rs b/crates/iceberg/src/spec/snapshot.rs index 81fd6eae6..f24a3c26b 100644 --- a/crates/iceberg/src/spec/snapshot.rs +++ b/crates/iceberg/src/spec/snapshot.rs @@ -52,6 +52,18 @@ pub enum Operation { Delete, } +impl Operation { + /// Returns the string representation (lowercase) of the operation. + pub fn as_str(&self) -> &str { + match self { + Operation::Append => "append", + Operation::Replace => "replace", + Operation::Overwrite => "overwrite", + Operation::Delete => "delete", + } + } +} + #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone)] /// Summarises the changes in the snapshot. pub struct Summary { diff --git a/crates/iceberg/src/table.rs b/crates/iceberg/src/table.rs index 406f9dd65..fa5304855 100644 --- a/crates/iceberg/src/table.rs +++ b/crates/iceberg/src/table.rs @@ -22,6 +22,7 @@ use std::sync::Arc; use crate::arrow::ArrowReaderBuilder; use crate::io::object_cache::ObjectCache; use crate::io::FileIO; +use crate::metadata_scan::MetadataTable; use crate::scan::TableScanBuilder; use crate::spec::{TableMetadata, TableMetadataRef}; use crate::{Error, ErrorKind, Result, TableIdent}; @@ -200,6 +201,12 @@ impl Table { TableScanBuilder::new(self) } + /// Creates a metadata table which provides table-like APIs for inspecting metadata. + /// See [`MetadataTable`] for more details. + pub fn metadata_table(self) -> MetadataTable { + MetadataTable::new(self) + } + /// Returns the flag indicating whether the `Table` is readonly or not pub fn readonly(&self) -> bool { self.readonly