From 6bc3bdc4de2d0bb59bdf91144d552f663e28bd78 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Wed, 28 Aug 2024 11:41:03 +0200 Subject: [PATCH 1/9] Add "remove_item" function to annotation storages --- CHANGELOG.md | 7 +++++ core/src/annostorage/inmemory.rs | 34 ++++++++++++++++++++++++ core/src/annostorage/mod.rs | 11 ++++++++ core/src/graph/mod.rs | 11 ++------ core/src/graph/storage/adjacencylist.rs | 5 +--- core/src/graph/storage/disk_adjacency.rs | 5 +--- 6 files changed, 56 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f4ff7f3c..60b356c47 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- New method `remove_item()` for annotation storages that allows for more + efficient removal if not only a single annotation, but the whole item should + be deleted. This is used in when applying a `DeleteNode` or `DeleteEdge` + event. + ## [3.4.0] - 2024-08-20 ### Added diff --git a/core/src/annostorage/inmemory.rs b/core/src/annostorage/inmemory.rs index b9bca4f97..21652539a 100644 --- a/core/src/annostorage/inmemory.rs +++ b/core/src/annostorage/inmemory.rs @@ -307,6 +307,40 @@ where Ok(result) } + fn remove_item(&mut self, item: &T) -> Result { + let mut result = false; + + if let Some(all_annos) = self.by_container.remove(item) { + for anno in all_annos { + // since value was found, also remove the item from the other containers + self.remove_element_from_by_anno(&anno, item); + + if let Some(resolved_key) = self.anno_keys.get_value(anno.key) { + // decrease the annotation count for this key + let new_key_count: usize = + if let Some(num_of_keys) = self.anno_key_sizes.get_mut(&resolved_key) { + *num_of_keys -= 1; + *num_of_keys + } else { + 0 + }; + // if annotation count dropped to zero remove the key + if new_key_count == 0 { + self.by_anno.remove(&anno.key); + self.anno_key_sizes.remove(&resolved_key); + self.anno_keys.remove(anno.key); + } + } + + result = true; + self.check_and_remove_value_symbol(anno.val); + self.total_number_of_annos -= 1; + } + } + + Ok(result) + } + fn clear(&mut self) -> Result<()> { self.clear_internal(); Ok(()) diff --git a/core/src/annostorage/mod.rs b/core/src/annostorage/mod.rs index 9c42515f0..51e1ac6d9 100644 --- a/core/src/annostorage/mod.rs +++ b/core/src/annostorage/mod.rs @@ -187,6 +187,17 @@ where /// Returns the value for that annotation, if it existed. fn remove_annotation_for_item(&mut self, item: &T, key: &AnnoKey) -> Result>>; + /// Remove all annotations for the given item. Returns whether the item had + /// any annotations. + fn remove_item(&mut self, item: &T) -> Result { + let mut result = false; + for a in self.get_annotations_for_item(item)? { + self.remove_annotation_for_item(item, &a.key)?; + result = true; + } + Ok(result) + } + /// Remove all annotations. fn clear(&mut self) -> Result<()>; diff --git a/core/src/graph/mod.rs b/core/src/graph/mod.rs index e3702c379..806a941c7 100644 --- a/core/src/graph/mod.rs +++ b/core/src/graph/mod.rs @@ -412,15 +412,8 @@ impl Graph { &mut node_id_cache, )? { // delete all annotations - { - for a in self - .node_annos - .get_annotations_for_item(&existing_node_id)? - { - self.node_annos - .remove_annotation_for_item(&existing_node_id, &a.key)?; - } - } + self.node_annos.remove_item(&existing_node_id)?; + // delete all edges pointing to this node either as source or target for c in all_components.iter() { if let Ok(gs) = self.get_or_create_writable(c) { diff --git a/core/src/graph/storage/adjacencylist.rs b/core/src/graph/storage/adjacencylist.rs index 42c2dbf0f..9a105acc9 100644 --- a/core/src/graph/storage/adjacencylist.rs +++ b/core/src/graph/storage/adjacencylist.rs @@ -288,10 +288,7 @@ impl WriteableGraphStorage for AdjacencyListStorage { ingoing.remove(idx); } } - let annos = self.annos.get_annotations_for_item(edge)?; - for a in annos { - self.annos.remove_annotation_for_item(edge, &a.key)?; - } + self.annos.remove_item(edge)?; Ok(()) } diff --git a/core/src/graph/storage/disk_adjacency.rs b/core/src/graph/storage/disk_adjacency.rs index 034fda877..d86fcee22 100644 --- a/core/src/graph/storage/disk_adjacency.rs +++ b/core/src/graph/storage/disk_adjacency.rs @@ -323,10 +323,7 @@ impl WriteableGraphStorage for DiskAdjacencyListStorage { self.edges.remove(edge)?; self.inverse_edges.remove(&edge.inverse())?; - let annos = self.annos.get_annotations_for_item(edge)?; - for a in annos { - self.annos.remove_annotation_for_item(edge, &a.key)?; - } + self.annos.remove_item(edge)?; Ok(()) } From 9182b119fe551727f51ac46a8df57afc918848fb Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Wed, 28 Aug 2024 12:07:30 +0200 Subject: [PATCH 2/9] Provide specialized remove_item function for on disk annotation storage --- core/src/annostorage/mod.rs | 9 +----- core/src/annostorage/ondisk.rs | 51 ++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 8 deletions(-) diff --git a/core/src/annostorage/mod.rs b/core/src/annostorage/mod.rs index 51e1ac6d9..35d4c203c 100644 --- a/core/src/annostorage/mod.rs +++ b/core/src/annostorage/mod.rs @@ -189,14 +189,7 @@ where /// Remove all annotations for the given item. Returns whether the item had /// any annotations. - fn remove_item(&mut self, item: &T) -> Result { - let mut result = false; - for a in self.get_annotations_for_item(item)? { - self.remove_annotation_for_item(item, &a.key)?; - result = true; - } - Ok(result) - } + fn remove_item(&mut self, item: &T) -> Result; /// Remove all annotations. fn clear(&mut self) -> Result<()>; diff --git a/core/src/annostorage/ondisk.rs b/core/src/annostorage/ondisk.rs index 779b591ee..c6dc047b9 100644 --- a/core/src/annostorage/ondisk.rs +++ b/core/src/annostorage/ondisk.rs @@ -411,6 +411,57 @@ where Ok(result) } + fn remove_item(&mut self, item: &T) -> Result { + let mut result = false; + + let start = create_by_container_key(item.clone(), usize::MIN); + let end = create_by_container_key(item.clone(), usize::MAX); + + let annos_in_container: Result> = self.by_container.range(start..=end).collect(); + let annos_in_container = annos_in_container?; + + for (by_container_key, _) in annos_in_container { + let symbol_id = usize::parse_key(&by_container_key[T::key_size()..])?; + let key = self + .anno_key_symbols + .get_value(symbol_id) + .unwrap_or_default(); + + if let Some(val) = self.by_container.remove(&by_container_key)? { + // remove annotation from by_anno_qname + let anno = Annotation { + key: key.as_ref().clone(), + val: val.into(), + }; + + self.by_anno_qname.remove(&create_by_anno_qname_key( + item.clone(), + symbol_id, + &anno.val, + ))?; + // decrease the annotation count for this key + let new_key_count: usize = + if let Some(num_of_keys) = self.anno_key_sizes.get_mut(&key) { + *num_of_keys -= 1; + *num_of_keys + } else { + 0 + }; + // if annotation count dropped to zero remove the key + if new_key_count == 0 { + self.anno_key_sizes.remove(&key); + if let Some(id) = self.anno_key_symbols.get_symbol(&key) { + self.anno_key_symbols.remove(id); + } + } + + result = true; + } + } + + Ok(result) + } + fn remove_annotation_for_item(&mut self, item: &T, key: &AnnoKey) -> Result>> { // remove annotation from by_container if let Some(symbol_id) = self.anno_key_symbols.get_symbol(key) { From 934ee5ebe439ba3442ae7a1a323a56af269fe20f Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Wed, 28 Aug 2024 12:09:22 +0200 Subject: [PATCH 3/9] Add some simple tests for remove_item --- core/src/annostorage/inmemory/tests.rs | 23 +++++++++++++++++++++++ core/src/annostorage/ondisk/tests.rs | 24 ++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/core/src/annostorage/inmemory/tests.rs b/core/src/annostorage/inmemory/tests.rs index 8fbbeb613..4bcfce94a 100644 --- a/core/src/annostorage/inmemory/tests.rs +++ b/core/src/annostorage/inmemory/tests.rs @@ -104,6 +104,29 @@ fn remove() { assert_eq!(&0, a.anno_key_sizes.get(&test_anno.key).unwrap_or(&0)); } +#[test] +fn remove_item() { + let test_anno = Annotation { + key: AnnoKey { + name: "anno1".into(), + ns: "annis1".into(), + }, + val: "test".into(), + }; + + let mut a: AnnoStorageImpl = AnnoStorageImpl::new(); + a.insert(1, test_anno.clone()).unwrap(); + + assert_eq!(1, a.number_of_annotations().unwrap()); + assert_eq!(1, a.anno_key_sizes.len()); + assert_eq!(&1, a.anno_key_sizes.get(&test_anno.key).unwrap()); + + a.remove_item(&1).unwrap(); + + assert_eq!(0, a.number_of_annotations().unwrap()); + assert_eq!(&0, a.anno_key_sizes.get(&test_anno.key).unwrap_or(&0)); +} + #[test] fn get_node_id_from_name() { let key = NODE_NAME_KEY.as_ref().clone(); diff --git a/core/src/annostorage/ondisk/tests.rs b/core/src/annostorage/ondisk/tests.rs index d00bd4e8c..82f3df121 100644 --- a/core/src/annostorage/ondisk/tests.rs +++ b/core/src/annostorage/ondisk/tests.rs @@ -112,6 +112,30 @@ fn remove() { assert_eq!(&0, a.anno_key_sizes.get(&test_anno.key).unwrap_or(&0)); } +#[test] +fn remove_item() { + LOGGER_INIT.call_once(env_logger::init); + let test_anno = Annotation { + key: AnnoKey { + name: "anno1".into(), + ns: "annis1".into(), + }, + val: "test".into(), + }; + + let mut a = AnnoStorageImpl::new(None).unwrap(); + a.insert(1, test_anno.clone()).unwrap(); + + assert_eq!(1, a.number_of_annotations().unwrap()); + assert_eq!(1, a.anno_key_sizes.len()); + assert_eq!(&1, a.anno_key_sizes.get(&test_anno.key).unwrap()); + + a.remove_item(&1).unwrap(); + + assert_eq!(0, a.number_of_annotations().unwrap()); + assert_eq!(&0, a.anno_key_sizes.get(&test_anno.key).unwrap_or(&0)); +} + #[test] fn get_node_id_from_name() { let key = NODE_NAME_KEY.as_ref().clone(); From 0c52577027b00d6418fb09971a71f807272d5f84 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Wed, 28 Aug 2024 17:18:58 +0200 Subject: [PATCH 4/9] Insert values sorted and only once --- core/src/annostorage/inmemory.rs | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/core/src/annostorage/inmemory.rs b/core/src/annostorage/inmemory.rs index 21652539a..92ebd8e14 100644 --- a/core/src/annostorage/inmemory.rs +++ b/core/src/annostorage/inmemory.rs @@ -11,7 +11,7 @@ use rustc_hash::FxHashSet; use smartstring::alias::String; use smartstring::{LazyCompact, SmartString}; use std::borrow::Cow; -use std::collections::{BTreeMap, HashMap}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::hash::Hash; use std::path::Path; use std::sync::Arc; @@ -230,12 +230,18 @@ where // inserts a new relation between the annotation and the item // if set is not existing yet it is created - self.by_anno + let item_list_for_value = self + .by_anno .entry(anno.key) .or_default() .entry(anno.val) - .or_default() - .push(item.clone()); + .or_default(); + // Insert the items sorted, so we can more easily find a specific + // item later. For annotation key/value combinations with a lot of + // items, this can make an impact on performance. + if let Err(idx) = item_list_for_value.binary_search(&item) { + item_list_for_value.insert(idx, item.clone()); + } if existing_anno.is_none() { // a new annotation entry was inserted and did not replace an existing one @@ -311,6 +317,12 @@ where let mut result = false; if let Some(all_annos) = self.by_container.remove(item) { + // Collect relevevant annotation symbols to remove them en-bloc + let all_affected_annos: HashSet<_> = all_annos.iter().map(|a| a.key).collect(); + for anno_key in all_affected_annos { + if let Some(by_anno_entry) = self.by_anno.get_mut(&anno_key) {} + } + for anno in all_annos { // since value was found, also remove the item from the other containers self.remove_element_from_by_anno(&anno, item); From af2b764bc8f93c7d82d2b227b3502e9c60614317 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Thu, 29 Aug 2024 11:37:12 +0200 Subject: [PATCH 5/9] Use binary search for removing the annotation --- core/src/annostorage/inmemory.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/core/src/annostorage/inmemory.rs b/core/src/annostorage/inmemory.rs index 92ebd8e14..54d6862ea 100644 --- a/core/src/annostorage/inmemory.rs +++ b/core/src/annostorage/inmemory.rs @@ -86,7 +86,9 @@ impl Date: Thu, 29 Aug 2024 11:41:08 +0200 Subject: [PATCH 6/9] Remove unused code --- core/src/annostorage/inmemory.rs | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/core/src/annostorage/inmemory.rs b/core/src/annostorage/inmemory.rs index 54d6862ea..f0ed9c3b8 100644 --- a/core/src/annostorage/inmemory.rs +++ b/core/src/annostorage/inmemory.rs @@ -11,7 +11,7 @@ use rustc_hash::FxHashSet; use smartstring::alias::String; use smartstring::{LazyCompact, SmartString}; use std::borrow::Cow; -use std::collections::{BTreeMap, HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap}; use std::hash::Hash; use std::path::Path; use std::sync::Arc; @@ -319,12 +319,6 @@ where let mut result = false; if let Some(all_annos) = self.by_container.remove(item) { - // Collect relevevant annotation symbols to remove them en-bloc - let all_affected_annos: HashSet<_> = all_annos.iter().map(|a| a.key).collect(); - for anno_key in all_affected_annos { - if let Some(by_anno_entry) = self.by_anno.get_mut(&anno_key) {} - } - for anno in all_annos { // since value was found, also remove the item from the other containers self.remove_element_from_by_anno(&anno, item); From 82c94f1bebb9e12b5eea0e37b418070fbebe5fea Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Mon, 2 Sep 2024 15:47:42 +0200 Subject: [PATCH 7/9] Extend benchmark to also including delete operation --- graphannis/benches/graphannis.rs | 144 ++++++++++++++----------------- 1 file changed, 64 insertions(+), 80 deletions(-) diff --git a/graphannis/benches/graphannis.rs b/graphannis/benches/graphannis.rs index 226f164a9..2a8504b52 100644 --- a/graphannis/benches/graphannis.rs +++ b/graphannis/benches/graphannis.rs @@ -228,15 +228,11 @@ fn deserialize_gum(bench: &mut Criterion) { }); } -fn apply_update_inmemory(bench: &mut Criterion) { - if CORPUS_STORAGE.is_none() { - return; - } - - let cs = CORPUS_STORAGE.as_ref().unwrap(); - +/// Creates graph updates for adding some token (first result) and deleting them +/// again (second result item). +fn create_example_update() -> (GraphUpdate, GraphUpdate) { // Create a set of graph updates to apply - let mut u = GraphUpdate::default(); + let mut updates_add = GraphUpdate::default(); // Generate a lot of tokens made of fake strings (using names) let mut token_names: Vec = Vec::with_capacity(TOK_COUNT); @@ -249,101 +245,89 @@ fn apply_update_inmemory(bench: &mut Criterion) { token_names.push(node_name.clone()); // Create token node - u.add_event(UpdateEvent::AddNode { - node_name: node_name.clone(), - node_type: "node".to_string(), - }) - .unwrap(); - u.add_event(UpdateEvent::AddNodeLabel { - node_name: node_name.clone(), - anno_ns: "annis".to_string(), - anno_name: "tok".to_string(), - anno_value: t.to_string(), - }) - .unwrap(); + updates_add + .add_event(UpdateEvent::AddNode { + node_name: node_name.clone(), + node_type: "node".to_string(), + }) + .unwrap(); + updates_add + .add_event(UpdateEvent::AddNodeLabel { + node_name: node_name.clone(), + anno_ns: "annis".to_string(), + anno_name: "tok".to_string(), + anno_value: t.to_string(), + }) + .unwrap(); // add the order relation if let Some(previous_token_name) = previous_token_name { - u.add_event(UpdateEvent::AddEdge { - source_node: previous_token_name, - target_node: node_name.clone(), - layer: "annis".to_string(), - component_type: "Ordering".to_string(), - component_name: "".to_string(), - }) - .unwrap(); + updates_add + .add_event(UpdateEvent::AddEdge { + source_node: previous_token_name, + target_node: node_name.clone(), + layer: "annis".to_string(), + component_type: "Ordering".to_string(), + component_name: "".to_string(), + }) + .unwrap(); } previous_token_name = Some(node_name); } - cs.create_empty_corpus("apply_update_test_corpus", false) - .unwrap(); - - bench.bench_function("apply_update_inmemory", move |b| { - b.iter(|| cs.apply_update("apply_update_test_corpus", &mut u).unwrap()); - }); - - cs.delete("apply_update_test_corpus").unwrap(); + // Add update events to delete the just added token + let mut updates_delete = GraphUpdate::new(); + for t in token_names { + updates_delete + .add_event(UpdateEvent::DeleteNode { + node_name: t.clone(), + }) + .unwrap(); + } + (updates_add, updates_delete) } -fn apply_update_ondisk(bench: &mut Criterion) { +fn apply_update_inmemory(bench: &mut Criterion) { if CORPUS_STORAGE.is_none() { return; } let cs = CORPUS_STORAGE.as_ref().unwrap(); - // Create a set of graph updates to apply - let mut u = GraphUpdate::default(); - - // Generate a lot of tokens made of fake strings (using names) - let mut token_names: Vec = Vec::with_capacity(TOK_COUNT); - let mut previous_token_name = None; - for i in 0..TOK_COUNT { - let node_name = format!("n{}", i); - - let t: &str = LastName(EN).fake(); - - token_names.push(node_name.clone()); - - // Create token node - u.add_event(UpdateEvent::AddNode { - node_name: node_name.clone(), - node_type: "node".to_string(), - }) - .unwrap(); - u.add_event(UpdateEvent::AddNodeLabel { - node_name: node_name.clone(), - anno_ns: "annis".to_string(), - anno_name: "tok".to_string(), - anno_value: t.to_string(), - }) - .unwrap(); - - // add the order relation - if let Some(previous_token_name) = previous_token_name { - u.add_event(UpdateEvent::AddEdge { - source_node: previous_token_name, - target_node: node_name.clone(), - layer: "annis".to_string(), - component_type: "Ordering".to_string(), - component_name: "".to_string(), - }) + bench.bench_function("apply_update_inmemory", move |b| { + cs.create_empty_corpus("apply_update_test_corpus", false) .unwrap(); - } + let (mut u1, mut u2) = create_example_update(); + b.iter(|| { + cs.apply_update("apply_update_test_corpus", &mut u1) + .unwrap(); + cs.apply_update("apply_update_test_corpus", &mut u2) + .unwrap() + }); + cs.delete("apply_update_test_corpus").unwrap(); + }); +} - previous_token_name = Some(node_name); +fn apply_update_ondisk(bench: &mut Criterion) { + if CORPUS_STORAGE.is_none() { + return; } - cs.create_empty_corpus("apply_update_test_corpus", true) - .unwrap(); + let cs = CORPUS_STORAGE.as_ref().unwrap(); bench.bench_function("apply_update_ondisk", move |b| { - b.iter(|| cs.apply_update("apply_update_test_corpus", &mut u).unwrap()); + cs.create_empty_corpus("apply_update_test_corpus", false) + .unwrap(); + let (mut u1, mut u2) = create_example_update(); + b.iter(|| { + cs.apply_update("apply_update_test_corpus", &mut u1) + .unwrap(); + cs.apply_update("apply_update_test_corpus", &mut u2) + .unwrap() + }); + cs.delete("apply_update_test_corpus").unwrap(); }); - - cs.delete("apply_update_test_corpus").unwrap(); } criterion_group!(name=default; config= Criterion::default().sample_size(50); targets = From 925ddbeb6e242dde1aecaa17b57e0636f47a01cb Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Mon, 2 Sep 2024 16:17:00 +0200 Subject: [PATCH 8/9] Use a BTreeSet to store the items belonging to an annotation value. This is much efficient if there are many items than moving it around in a vector. Since serde sees sets as sequences, the serialization should be the same. --- core/src/annostorage/inmemory.rs | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/core/src/annostorage/inmemory.rs b/core/src/annostorage/inmemory.rs index f0ed9c3b8..6d881061f 100644 --- a/core/src/annostorage/inmemory.rs +++ b/core/src/annostorage/inmemory.rs @@ -11,7 +11,7 @@ use rustc_hash::FxHashSet; use smartstring::alias::String; use smartstring::{LazyCompact, SmartString}; use std::borrow::Cow; -use std::collections::{BTreeMap, HashMap}; +use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::hash::Hash; use std::path::Path; use std::sync::Arc; @@ -22,7 +22,7 @@ struct SparseAnnotation { val: usize, } -type ValueItemMap = HashMap>; +type ValueItemMap = HashMap>; #[derive(Serialize, Deserialize, Clone, Default)] pub struct AnnoStorageImpl { @@ -86,9 +86,7 @@ impl Date: Mon, 2 Sep 2024 16:26:58 +0200 Subject: [PATCH 9/9] Actually use the on disk representation in benchmark --- graphannis/benches/graphannis.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphannis/benches/graphannis.rs b/graphannis/benches/graphannis.rs index 2a8504b52..1e474bf24 100644 --- a/graphannis/benches/graphannis.rs +++ b/graphannis/benches/graphannis.rs @@ -317,7 +317,7 @@ fn apply_update_ondisk(bench: &mut Criterion) { let cs = CORPUS_STORAGE.as_ref().unwrap(); bench.bench_function("apply_update_ondisk", move |b| { - cs.create_empty_corpus("apply_update_test_corpus", false) + cs.create_empty_corpus("apply_update_test_corpus", true) .unwrap(); let (mut u1, mut u2) = create_example_update(); b.iter(|| {