From cf70fcd54ab6257cd3cb87d2e603a58d0a67e1e6 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 20 Aug 2024 15:01:13 +0200 Subject: [PATCH 1/6] Add support for coverage edges between spans and segmentation nodes --- CHANGELOG.md | 10 +++ cli/src/bin/annis.rs | 2 +- graphannis/src/annis/db/aql/model.rs | 103 ++++++++++++----------- graphannis/src/annis/db/corpusstorage.rs | 3 +- graphannis/src/annis/util/mod.rs | 1 + graphannis/src/lib.rs | 3 - 6 files changed, 69 insertions(+), 53 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 136bcd5a6..3c18c8fc2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added + +- Added support for coverage edges between span nodes and segmentation nodes when + calculating the AQL model index. + +### Fixed + +- Do not use recursion to calculate the indirect coverage edges in the model + index, since this could fail for deeply nested structures. 
+ ## [3.3.3] - 2024-07-12 ### Fixed diff --git a/cli/src/bin/annis.rs b/cli/src/bin/annis.rs index 587acba82..a055918e2 100644 --- a/cli/src/bin/annis.rs +++ b/cli/src/bin/annis.rs @@ -175,7 +175,7 @@ impl AnnisRunner { let readline = rl.readline(&prompt); match readline { Ok(line) => { - rl.add_history_entry(&line.clone()); + rl.add_history_entry(line.clone()); if !self.exec(&line) { break; } diff --git a/graphannis/src/annis/db/aql/model.rs b/graphannis/src/annis/db/aql/model.rs index 4ffeee063..20fbbc28c 100644 --- a/graphannis/src/annis/db/aql/model.rs +++ b/graphannis/src/annis/db/aql/model.rs @@ -112,60 +112,66 @@ fn calculate_inherited_coverage_edges( graph: &mut AnnotationGraph, n: NodeID, all_cov_components: &[AnnotationComponent], - all_dom_gs: &[Arc], + all_dom_components: &[AnnotationComponent], ) -> std::result::Result, ComponentTypeError> { - let mut directly_covered_token = FxHashSet::default(); - - for c in all_cov_components.iter() { - if let Some(gs) = graph.get_graphstorage_as_ref(c) { - let out: Result, graphannis_core::errors::GraphAnnisCoreError> = - gs.get_outgoing_edges(n).collect(); - directly_covered_token.extend(out?); - } - } - - if directly_covered_token.is_empty() { - let has_token_anno = graph - .get_node_annos() - .get_value_for_item(&n, &TOKEN_KEY)? - .is_some(); - if has_token_anno { - // Even if technically a token does not cover itself, if we need to abort the recursion - // with the basic case - directly_covered_token.insert(n); + // Iterate over all all nodes that are somehow covered (by coverage or + // dominance edges) starting from the given node. 
+ let all_text_coverage_components: Vec = + [all_cov_components, all_dom_components].concat(); + + let all_text_coverage_gs: Vec<_> = all_text_coverage_components + .iter() + .filter_map(|c| graph.get_graphstorage_as_ref(c)) + .map(|gs| gs.as_edgecontainer()) + .collect(); + let combined_gs = UnionEdgeContainer::new(all_text_coverage_gs); + + // Remember the non-token and token nodes, so we can connect all non-token + // nodes to the covered token. + let mut indirectly_covered_nodes = FxHashSet::default(); + indirectly_covered_nodes.insert(n); + let mut covered_token = FxHashSet::default(); + + let tok_helper = TokenHelper::new(graph)?; + + for step in CycleSafeDFS::new(&combined_gs, n, 1, usize::MAX) { + let step = step?; + if tok_helper.is_token(step.node)? { + covered_token.insert(step.node); + } else { + indirectly_covered_nodes.insert(step.node); } } - let mut indirectly_covered_token = FxHashSet::default(); - // recursivly get the covered token from all children connected by a dominance relation - for dom_gs in all_dom_gs { - for out in dom_gs.get_outgoing_edges(n) { - let out = out?; - indirectly_covered_token.extend(calculate_inherited_coverage_edges( - graph, - out, - all_cov_components, - all_dom_gs, - )?); - } - } + let coverage_gs = tok_helper.get_gs_coverage().clone(); if let Ok(gs_cov) = graph.get_or_create_writable(&AnnotationComponent::new( AnnotationComponentType::Coverage, ANNIS_NS.into(), "inherited-coverage".into(), )) { - // Ignore all already directly covered token when creating the inherited coverage edges - for t in indirectly_covered_token.difference(&directly_covered_token) { - gs_cov.add_edge(Edge { - source: n, - target: *t, - })?; + // Connect all non-token nodes to the covered token nodes if no such direct coverage already exists + for source in indirectly_covered_nodes { + for target in &covered_token { + let mut needs_edge = true; + for gs in coverage_gs.iter() { + if gs.is_connected(source, *target, 1, std::ops::Bound::Included(1))? 
{ + needs_edge = false; + break; + } + } + + if needs_edge { + gs_cov.add_edge(Edge { + source, + target: *target, + })?; + } + } } } - directly_covered_token.extend(indirectly_covered_token); - Ok(directly_covered_token) + Ok(covered_token) } pub struct AQLUpdateGraphIndex { @@ -276,17 +282,18 @@ impl AQLUpdateGraphIndex { let all_cov_components = graph.get_all_components(Some(AnnotationComponentType::Coverage), None); - let all_dom_gs: Vec> = graph - .get_all_components(Some(AnnotationComponentType::Dominance), Some("")) - .into_iter() - .filter_map(|c| graph.get_graphstorage(&c)) - .collect(); + let all_dom_components = + graph.get_all_components(Some(AnnotationComponentType::Dominance), Some("")); // go over each node and calculate the left-most and right-most token for invalid in self.invalid_nodes.iter()? { let (n, _) = invalid?; - let covered_token = - calculate_inherited_coverage_edges(graph, n, &all_cov_components, &all_dom_gs)?; + let covered_token = calculate_inherited_coverage_edges( + graph, + n, + &all_cov_components, + &all_dom_components, + )?; self.calculate_token_alignment( graph, n, diff --git a/graphannis/src/annis/db/corpusstorage.rs b/graphannis/src/annis/db/corpusstorage.rs index fbb27b798..f0bac4506 100644 --- a/graphannis/src/annis/db/corpusstorage.rs +++ b/graphannis/src/annis/db/corpusstorage.rs @@ -1078,7 +1078,7 @@ impl CorpusStorage { for node in file_nodes? { // Get the linked file for this node if let Some(original_path) = node_annos.get_value_for_item(&node, &linked_file_key)? { - let original_path = old_base_path.join(&PathBuf::from(original_path.as_ref())); + let original_path = old_base_path.join(PathBuf::from(original_path.as_ref())); if original_path.is_file() { if let Some(node_name) = node_annos.get_value_for_item(&node, &NODE_NAME_KEY)? { // Create a new file name based on the node name and copy the file @@ -1583,6 +1583,7 @@ impl CorpusStorage { /// Count the number of results for a `query`. 
/// - `query` - The search query definition. + /// /// Returns the count as number. pub fn count>(&self, query: SearchQuery) -> Result { let timeout = TimeoutCheck::new(query.timeout); diff --git a/graphannis/src/annis/util/mod.rs b/graphannis/src/annis/util/mod.rs index 295929745..dac24db37 100644 --- a/graphannis/src/annis/util/mod.rs +++ b/graphannis/src/annis/util/mod.rs @@ -73,6 +73,7 @@ impl From for SearchDef { /// Returns a vector over all query definitions defined in a CSV file. /// - `file` - The CSV file path. /// - `panic_on_invalid` - If true, an invalid query definition will trigger a panic, otherwise it will be ignored. +/// /// Can be used if this query is called in a test case to fail the test. pub fn get_queries_from_csv(file: &Path, panic_on_invalid: bool) -> Vec { if let Ok(mut reader) = csv::Reader::from_path(file) { diff --git a/graphannis/src/lib.rs b/graphannis/src/lib.rs index 0f59d3cb6..b10706d5b 100644 --- a/graphannis/src/lib.rs +++ b/graphannis/src/lib.rs @@ -22,9 +22,6 @@ extern crate lazy_static; #[macro_use] extern crate lalrpop_util; -#[cfg(feature = "c-api")] -extern crate simplelog; - mod annis; pub use crate::annis::db::corpusstorage::CorpusStorage; From 700b11b579e786aa1bc39d5880db235a11a188bc Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 20 Aug 2024 15:12:44 +0200 Subject: [PATCH 2/6] Add exception to clippy rule as a workaround. 
This will probably be fixed in clippy 1.81 (https://github.com/rust-lang/rust-clippy/pull/12892) --- graphannis/src/annis/db/corpusstorage.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/graphannis/src/annis/db/corpusstorage.rs b/graphannis/src/annis/db/corpusstorage.rs index f0bac4506..5064fecdd 100644 --- a/graphannis/src/annis/db/corpusstorage.rs +++ b/graphannis/src/annis/db/corpusstorage.rs @@ -1269,6 +1269,7 @@ impl CorpusStorage { }; let config_as_str: Option<&str> = config_as_str.as_deref(); + #[allow(clippy::needless_borrows_for_generic_args)] graphannis_core::graph::serialization::graphml::export( graph, config_as_str, From b249354018fecdddf8b57709c3eb882934525e2e Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 20 Aug 2024 16:50:18 +0200 Subject: [PATCH 3/6] Add tests and only add the inherited edges for the invalid node itself. All covered nodes of an invalidated node will also be invalidated, so we don't need to implicitly add the edges when collecting the covered tokens of the invalid node. --- graphannis/src/annis/db/aql/model.rs | 78 +++--- graphannis/src/annis/db/aql/model/tests.rs | 255 +++++++++++++++++- .../src/annis/db/corpusstorage/tests.rs | 4 +- graphannis/src/annis/db/example_generator.rs | 221 ++++++++++++++- 4 files changed, 516 insertions(+), 42 deletions(-) diff --git a/graphannis/src/annis/db/aql/model.rs b/graphannis/src/annis/db/aql/model.rs index 20fbbc28c..6d7513ce9 100644 --- a/graphannis/src/annis/db/aql/model.rs +++ b/graphannis/src/annis/db/aql/model.rs @@ -126,47 +126,53 @@ fn calculate_inherited_coverage_edges( .collect(); let combined_gs = UnionEdgeContainer::new(all_text_coverage_gs); - // Remember the non-token and token nodes, so we can connect all non-token - // nodes to the covered token. 
- let mut indirectly_covered_nodes = FxHashSet::default(); - indirectly_covered_nodes.insert(n); let mut covered_token = FxHashSet::default(); - let tok_helper = TokenHelper::new(graph)?; - - for step in CycleSafeDFS::new(&combined_gs, n, 1, usize::MAX) { - let step = step?; - if tok_helper.is_token(step.node)? { - covered_token.insert(step.node); - } else { - indirectly_covered_nodes.insert(step.node); - } - } - - let coverage_gs = tok_helper.get_gs_coverage().clone(); - - if let Ok(gs_cov) = graph.get_or_create_writable(&AnnotationComponent::new( + let inherited_cov_component = AnnotationComponent::new( AnnotationComponentType::Coverage, ANNIS_NS.into(), "inherited-coverage".into(), - )) { - // Connect all non-token nodes to the covered token nodes if no such direct coverage already exists - for source in indirectly_covered_nodes { - for target in &covered_token { - let mut needs_edge = true; - for gs in coverage_gs.iter() { - if gs.is_connected(source, *target, 1, std::ops::Bound::Included(1))? { - needs_edge = false; - break; - } - } + ); - if needs_edge { - gs_cov.add_edge(Edge { - source, - target: *target, - })?; - } + { + let tok_helper = TokenHelper::new(graph)?; + for step in CycleSafeDFS::new(&combined_gs, n, 1, usize::MAX) { + let step = step?; + if tok_helper.is_token(step.node)? 
{ + covered_token.insert(step.node); + } + } + }; + let other_coverage_gs: Vec> = graph + .get_all_components(Some(AnnotationComponentType::Coverage), None) + .into_iter() + .filter(|c| c != &inherited_cov_component) + .filter_map(|c| graph.get_graphstorage(&c)) + .filter(|gs| { + if let Some(stats) = gs.get_statistics() { + stats.nodes > 0 + } else { + true + } + }) + .collect(); + + // Connect all non-token nodes to the covered token nodes if no such direct coverage already exists + let mut direct_coverage_targets = FxHashSet::default(); + for gs in other_coverage_gs.iter() { + for target in gs.get_outgoing_edges(n) { + direct_coverage_targets.insert(target?); + } + } + let gs_cov = graph.get_or_create_writable(&inherited_cov_component)?; + + for target in &covered_token { + if n != *target { + if !direct_coverage_targets.contains(target) { + gs_cov.add_edge(Edge { + source: n, + target: *target, + })?; } } } @@ -283,7 +289,7 @@ impl AQLUpdateGraphIndex { let all_cov_components = graph.get_all_components(Some(AnnotationComponentType::Coverage), None); let all_dom_components = - graph.get_all_components(Some(AnnotationComponentType::Dominance), Some("")); + graph.get_all_components(Some(AnnotationComponentType::Dominance), None); // go over each node and calculate the left-most and right-most token for invalid in self.invalid_nodes.iter()? 
{ diff --git a/graphannis/src/annis/db/aql/model/tests.rs b/graphannis/src/annis/db/aql/model/tests.rs index a0f0ff93d..6c44b026d 100644 --- a/graphannis/src/annis/db/aql/model/tests.rs +++ b/graphannis/src/annis/db/aql/model/tests.rs @@ -1,7 +1,19 @@ use std::{fs::File, path::PathBuf}; -use crate::{annis::db::aql::model::CorpusSize, AnnotationGraph}; +use crate::{ + annis::db::{aql::model::CorpusSize, example_generator}, + model::AnnotationComponent, + AnnotationGraph, +}; use assert_matches::assert_matches; +use graphannis_core::graph::{ + storage::GraphStorage, + update::{GraphUpdate, UpdateEvent}, + NODE_NAME_KEY, +}; +use itertools::Itertools; + +use super::AnnotationComponentType::Coverage; #[test] fn global_stats_token_count() { @@ -21,3 +33,244 @@ fn global_stats_token_count() { && *segmentation_count.get("diplomatic").unwrap() == 11 && *segmentation_count.get("norm").unwrap() == 13); } + +#[test] +fn inherited_cov_edges_simple_tokenization() { + // Add a simple dominance node structure above the example sentence. 
+ let mut u = GraphUpdate::new(); + example_generator::create_corpus_structure_simple(&mut u); + example_generator::create_tokens(&mut u, Some("root/doc1"), Some("root/doc1")); + example_generator::make_span( + &mut u, + "root/doc1#span1", + &["root/doc1#tok1", "root/doc1#tok2", "root/doc1#tok3"], + true, + ); + example_generator::make_span( + &mut u, + "root/doc1#span2", + &["root/doc1#tok4", "root/doc1#tok5"], + true, + ); + u.add_event(UpdateEvent::AddNode { + node_name: "root/doc1#struct1".to_string(), + node_type: "node".to_string(), + }) + .unwrap(); + u.add_event(UpdateEvent::AddNodeLabel { + node_name: "root/doc1#struct1".to_string(), + anno_ns: "test".to_string(), + anno_name: "cat".to_string(), + anno_value: "P".to_string(), + }) + .unwrap(); + u.add_event(UpdateEvent::AddEdge { + source_node: "root/doc1#struct1".to_string(), + target_node: "root/doc1#span1".to_string(), + layer: "test".to_string(), + component_type: "Dominance".to_string(), + component_name: "edge".to_string(), + }) + .unwrap(); + u.add_event(UpdateEvent::AddEdge { + source_node: "root/doc1#struct1".to_string(), + target_node: "root/doc1#span2".to_string(), + layer: "test".to_string(), + component_type: "Dominance".to_string(), + component_name: "edge".to_string(), + }) + .unwrap(); + u.add_event(UpdateEvent::AddNode { + node_name: "root/doc1#struct2".to_string(), + node_type: "node".to_string(), + }) + .unwrap(); + u.add_event(UpdateEvent::AddNodeLabel { + node_name: "root/doc1#struct2".to_string(), + anno_ns: "test".to_string(), + anno_name: "cat".to_string(), + anno_value: "ROOT".to_string(), + }) + .unwrap(); + u.add_event(UpdateEvent::AddEdge { + source_node: "root/doc1#struct2".to_string(), + target_node: "root/doc1#struct1".to_string(), + layer: "test".to_string(), + component_type: "Dominance".to_string(), + component_name: "edge".to_string(), + }) + .unwrap(); + + let mut g = AnnotationGraph::with_default_graphstorages(false).unwrap(); + g.apply_update(&mut u, |_| {}).unwrap(); 
+ + // Check that the inherited coverage edges have been created + let gs = g + .get_graphstorage_as_ref(&AnnotationComponent::new( + Coverage, + "annis".into(), + "inherited-coverage".into(), + )) + .unwrap(); + let sources: Vec<_> = gs + .source_nodes() + .map(|n| { + g.get_node_annos() + .get_value_for_item(&n.unwrap(), &NODE_NAME_KEY) + .unwrap() + .unwrap() + .to_string() + }) + .sorted() + .collect(); + assert_eq!(sources, vec!["root/doc1#struct1", "root/doc1#struct2"]); + + // Also check that the edges target the right token + assert_out_edges( + &g, + gs, + "root/doc1#struct1", + &[ + "root/doc1#tok1", + "root/doc1#tok2", + "root/doc1#tok3", + "root/doc1#tok4", + "root/doc1#tok5", + ], + ); + assert_out_edges( + &g, + gs, + "root/doc1#struct2", + &[ + "root/doc1#tok1", + "root/doc1#tok2", + "root/doc1#tok3", + "root/doc1#tok4", + "root/doc1#tok5", + ], + ); +} + +#[test] +fn inherited_cov_edges_multiple_segmentation() { + let mut u = GraphUpdate::new(); + example_generator::create_corpus_structure_simple(&mut u); + example_generator::create_multiple_segmentations(&mut u, "root/doc1"); + // Add a simple dominance node structure above the "a" segmentation + example_generator::make_span( + &mut u, + "root/doc1#span1", + &["root/doc1#a1", "root/doc1#a2", "root/doc1#a3"], + true, + ); + example_generator::make_span(&mut u, "root/doc1#span2", &["root/doc1#a4"], true); + u.add_event(UpdateEvent::AddNode { + node_name: "root/doc1#struct1".to_string(), + node_type: "node".to_string(), + }) + .unwrap(); + u.add_event(UpdateEvent::AddNodeLabel { + node_name: "root/doc1#struct1".to_string(), + anno_ns: "test".to_string(), + anno_name: "cat".to_string(), + anno_value: "ROOT".to_string(), + }) + .unwrap(); + u.add_event(UpdateEvent::AddEdge { + source_node: "root/doc1#struct1".to_string(), + target_node: "root/doc1#span1".to_string(), + layer: "test".to_string(), + component_type: "Dominance".to_string(), + component_name: "edge".to_string(), + }) + .unwrap(); + 
u.add_event(UpdateEvent::AddEdge { + source_node: "root/doc1#struct1".to_string(), + target_node: "root/doc1#span2".to_string(), + layer: "test".to_string(), + component_type: "Dominance".to_string(), + component_name: "edge".to_string(), + }) + .unwrap(); + + let mut g = AnnotationGraph::with_default_graphstorages(false).unwrap(); + g.apply_update(&mut u, |_| {}).unwrap(); + + // TODO Check that the inherited coverage edges have been created + let gs = g + .get_graphstorage_as_ref(&AnnotationComponent::new( + Coverage, + "annis".into(), + "inherited-coverage".into(), + )) + .unwrap(); + + let sources: Vec<_> = gs + .source_nodes() + .map(|n| { + g.get_node_annos() + .get_value_for_item(&n.unwrap(), &NODE_NAME_KEY) + .unwrap() + .unwrap() + .to_string() + }) + .sorted() + .collect(); + assert_eq!( + sources, + vec!["root/doc1#span1", "root/doc1#span2", "root/doc1#struct1"] + ); + + // Also check that the edges target the right timeline items (and not the segmentation nodes) + assert_out_edges( + &g, + gs, + "root/doc1#span1", + &[ + "root/doc1#tli1", + "root/doc1#tli2", + "root/doc1#tli3", + "root/doc1#tli4", + ], + ); + assert_out_edges(&g, gs, "root/doc1#span2", &["root/doc1#tli5"]); + assert_out_edges( + &g, + gs, + "root/doc1#struct1", + &[ + "root/doc1#tli1", + "root/doc1#tli2", + "root/doc1#tli3", + "root/doc1#tli4", + "root/doc1#tli5", + ], + ); +} + +fn assert_out_edges( + graph: &AnnotationGraph, + gs: &dyn GraphStorage, + source: &str, + expected: &[&str], +) { + let out: Vec<_> = gs + .get_outgoing_edges( + graph + .get_node_annos() + .get_node_id_from_name(source) + .unwrap() + .unwrap(), + ) + .map(|t| { + graph + .get_node_annos() + .get_value_for_item(&t.unwrap(), &NODE_NAME_KEY) + .unwrap() + .unwrap() + .to_string() + }) + .collect(); + assert_eq!(out, expected); +} diff --git a/graphannis/src/annis/db/corpusstorage/tests.rs b/graphannis/src/annis/db/corpusstorage/tests.rs index 8744e9a1d..e44b83303 100644 --- 
a/graphannis/src/annis/db/corpusstorage/tests.rs +++ b/graphannis/src/annis/db/corpusstorage/tests.rs @@ -676,7 +676,7 @@ fn subgraph_with_segmentation() { ]; for (i, t) in seg_tokens.iter().enumerate() { let node_name = format!("root/doc1#seg{}", i); - example_generator::create_token_node(&mut g, &node_name, t, Some("root/doc1")); + example_generator::create_token_node(&mut g, &node_name, t, None, None, Some("root/doc1")); g.add_event(UpdateEvent::AddNodeLabel { node_name, anno_ns: "default_ns".to_string(), @@ -1488,6 +1488,8 @@ fn reoptimize_corpussizeconfig() { &mut u, "rootCorpus/subCorpus1/doc1#sTok12", "!", + None, + None, Some("rootCorpus/subCorpus1/doc1#sText1"), ); u.add_event(UpdateEvent::AddEdge { diff --git a/graphannis/src/annis/db/example_generator.rs b/graphannis/src/annis/db/example_generator.rs index c414216c1..7c7ce3e9b 100644 --- a/graphannis/src/annis/db/example_generator.rs +++ b/graphannis/src/annis/db/example_generator.rs @@ -1,8 +1,10 @@ use graphannis_core::graph::{ update::{GraphUpdate, UpdateEvent}, - ANNIS_NS, + ANNIS_NS, DEFAULT_NS, }; +use crate::model::AnnotationComponentType; + /// Create update events for the following corpus structure: /// /// ``` @@ -208,7 +210,14 @@ pub(crate) fn create_tokens( "?", ]; for (i, t) in token_strings.iter().enumerate() { - create_token_node(update, &format!("{}tok{}", prefix, i), t, parent_node); + create_token_node( + update, + &format!("{}tok{}", prefix, i), + t, + None, + None, + parent_node, + ); } // add the order relations @@ -225,11 +234,140 @@ pub(crate) fn create_tokens( } } +/// Creates two segmentation layers that cover the same timeline. +/// +/// ```text +/// a: [Another ] [ex] [ample] [text] +/// b: [An] [other] [example ] [text] +/// ``` +/// +/// The timeline items have the name `tli1`, `tli2`, ..., `tli5`. 
+pub(crate) fn create_multiple_segmentations(update: &mut GraphUpdate, document_node: &str) { + let prefix = format!("{}#", document_node); + + // Timeline items + for i in 1..=5 { + create_token_node( + update, + &format!("{prefix}tli{i}"), + " ", + None, + None, + Some(document_node), + ) + } + + // Segmentation `a` + make_segmentation_span( + update, + &format!("{prefix}a1"), + document_node, + "a", + "Another", + &[&format!("{prefix}tli1"), &format!("{prefix}tli2")], + ); + make_segmentation_span( + update, + &format!("{prefix}a2"), + document_node, + "a", + "ex", + &[&format!("{prefix}tli3")], + ); + make_segmentation_span( + update, + &format!("{prefix}a3"), + document_node, + "a", + "ample", + &[&format!("{prefix}tli4")], + ); + + make_segmentation_span( + update, + &format!("{prefix}a4"), + document_node, + "a", + "text", + &[&format!("{prefix}tli5")], + ); + + // Segmentation `b` + make_segmentation_span( + update, + &format!("{prefix}b1"), + document_node, + "b", + "An", + &[&format!("{prefix}tli1")], + ); + make_segmentation_span( + update, + &format!("{prefix}b2"), + document_node, + "b", + "other", + &[&format!("{prefix}tli2")], + ); + make_segmentation_span( + update, + &format!("{prefix}b3"), + document_node, + "b", + "example", + &[&format!("{prefix}tli3"), &format!("{prefix}tli4")], + ); + + make_segmentation_span( + update, + &format!("{prefix}b4"), + document_node, + "b", + "text", + &[&format!("{prefix}tli5")], + ); + + // add the order relations + for i in 1..5 { + update + .add_event(UpdateEvent::AddEdge { + source_node: format!("{prefix}tli{}", i), + target_node: format!("{prefix}tli{}", i + 1), + layer: ANNIS_NS.to_string(), + component_type: "Ordering".to_string(), + component_name: "".to_string(), + }) + .unwrap(); + } + for i in 1..4 { + update + .add_event(UpdateEvent::AddEdge { + source_node: format!("{prefix}a{}", i), + target_node: format!("{prefix}a{}", i + 1), + layer: DEFAULT_NS.to_string(), + component_type: 
"Ordering".to_string(), + component_name: "a".to_string(), + }) + .unwrap(); + update + .add_event(UpdateEvent::AddEdge { + source_node: format!("{prefix}b{}", i), + target_node: format!("{prefix}b{}", i + 1), + layer: DEFAULT_NS.to_string(), + component_type: "Ordering".to_string(), + component_name: "b".to_string(), + }) + .unwrap(); + } +} + pub(crate) fn create_token_node( update: &mut GraphUpdate, node_name: &str, token_value: &str, - parent_node: Option<&str>, + whitespace_before: Option<&str>, + whitespace_after: Option<&str>, + document_node: Option<&str>, ) { update .add_event(UpdateEvent::AddNode { @@ -246,7 +384,28 @@ pub(crate) fn create_token_node( }) .unwrap(); - if let Some(parent_node) = parent_node { + if let Some(ws) = whitespace_before { + update + .add_event(UpdateEvent::AddNodeLabel { + node_name: node_name.to_string(), + anno_ns: ANNIS_NS.to_string(), + anno_name: "tok-whitespace-before".to_string(), + anno_value: ws.to_string(), + }) + .unwrap(); + } + if let Some(ws) = whitespace_after { + update + .add_event(UpdateEvent::AddNodeLabel { + node_name: node_name.to_string(), + anno_ns: ANNIS_NS.to_string(), + anno_name: "tok-whitespace-after".to_string(), + anno_value: ws.to_string(), + }) + .unwrap(); + } + + if let Some(parent_node) = document_node { // add the token node to the document update .add_event(UpdateEvent::AddEdge { @@ -286,3 +445,57 @@ pub(crate) fn make_span( .unwrap(); } } + +pub(crate) fn make_segmentation_span( + update: &mut GraphUpdate, + node_name: &str, + parent_node_name: &str, + segmentation_name: &str, + segmentation_value: &str, + covered_token_names: &[&str], +) { + update + .add_event(UpdateEvent::AddNode { + node_name: node_name.to_string(), + node_type: "node".to_string(), + }) + .unwrap(); + + update + .add_event(UpdateEvent::AddNodeLabel { + node_name: node_name.into(), + anno_ns: ANNIS_NS.into(), + anno_name: "tok".into(), + anno_value: segmentation_value.into(), + }) + .unwrap(); + update + 
.add_event(UpdateEvent::AddNodeLabel { + node_name: node_name.into(), + anno_ns: "".into(), + anno_name: segmentation_name.into(), + anno_value: segmentation_value.into(), + }) + .unwrap(); + + for c in covered_token_names { + update + .add_event(UpdateEvent::AddEdge { + source_node: node_name.to_string(), + target_node: c.to_string(), + layer: "".to_string(), + component_type: "Coverage".to_string(), + component_name: "".to_string(), + }) + .unwrap(); + } + update + .add_event(UpdateEvent::AddEdge { + source_node: node_name.into(), + target_node: parent_node_name.into(), + layer: ANNIS_NS.into(), + component_type: AnnotationComponentType::PartOf.to_string(), + component_name: "".into(), + }) + .unwrap(); +} From 7e7787bcacb6c9890c8469cd307d798c211813b6 Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 20 Aug 2024 17:12:04 +0200 Subject: [PATCH 4/6] Pre-calculate some of the repeatedly used components --- graphannis/src/annis/db/aql/model.rs | 66 +++++++++++++--------------- 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/graphannis/src/annis/db/aql/model.rs b/graphannis/src/annis/db/aql/model.rs index 6d7513ce9..b13684dde 100644 --- a/graphannis/src/annis/db/aql/model.rs +++ b/graphannis/src/annis/db/aql/model.rs @@ -111,65 +111,44 @@ pub struct AQLGlobalStatistics { fn calculate_inherited_coverage_edges( graph: &mut AnnotationGraph, n: NodeID, - all_cov_components: &[AnnotationComponent], - all_dom_components: &[AnnotationComponent], + other_cov_gs: &[Arc], + all_text_coverage_components: &[AnnotationComponent], + inherited_cov_component: &AnnotationComponent, ) -> std::result::Result, ComponentTypeError> { // Iterate over all all nodes that are somehow covered (by coverage or // dominance edges) starting from the given node. 
- let all_text_coverage_components: Vec = - [all_cov_components, all_dom_components].concat(); - - let all_text_coverage_gs: Vec<_> = all_text_coverage_components + let all_text_cov_components_gs: Vec<_> = all_text_coverage_components .iter() .filter_map(|c| graph.get_graphstorage_as_ref(c)) .map(|gs| gs.as_edgecontainer()) .collect(); - let combined_gs = UnionEdgeContainer::new(all_text_coverage_gs); - - let mut covered_token = FxHashSet::default(); - let inherited_cov_component = AnnotationComponent::new( - AnnotationComponentType::Coverage, - ANNIS_NS.into(), - "inherited-coverage".into(), - ); + let all_text_cov_components_combined = UnionEdgeContainer::new(all_text_cov_components_gs); + let mut covered_token = FxHashSet::default(); { let tok_helper = TokenHelper::new(graph)?; - for step in CycleSafeDFS::new(&combined_gs, n, 1, usize::MAX) { + for step in CycleSafeDFS::new(&all_text_cov_components_combined, n, 1, usize::MAX) { let step = step?; if tok_helper.is_token(step.node)? { covered_token.insert(step.node); } } }; - let other_coverage_gs: Vec> = graph - .get_all_components(Some(AnnotationComponentType::Coverage), None) - .into_iter() - .filter(|c| c != &inherited_cov_component) - .filter_map(|c| graph.get_graphstorage(&c)) - .filter(|gs| { - if let Some(stats) = gs.get_statistics() { - stats.nodes > 0 - } else { - true - } - }) - .collect(); // Connect all non-token nodes to the covered token nodes if no such direct coverage already exists let mut direct_coverage_targets = FxHashSet::default(); - for gs in other_coverage_gs.iter() { + for gs in other_cov_gs.iter() { for target in gs.get_outgoing_edges(n) { direct_coverage_targets.insert(target?); } } - let gs_cov = graph.get_or_create_writable(&inherited_cov_component)?; + let inherited_gs_cov = graph.get_or_create_writable(&inherited_cov_component)?; for target in &covered_token { if n != *target { if !direct_coverage_targets.contains(target) { - gs_cov.add_edge(Edge { + inherited_gs_cov.add_edge(Edge { 
source: n, target: *target, })?; @@ -286,10 +265,26 @@ impl AQLUpdateGraphIndex { ) -> std::result::Result<(), ComponentTypeError> { self.clear_left_right_token(graph)?; - let all_cov_components = - graph.get_all_components(Some(AnnotationComponentType::Coverage), None); + let inherited_cov_component = AnnotationComponent::new( + AnnotationComponentType::Coverage, + ANNIS_NS.into(), + "inherited-coverage".into(), + ); + let all_cov_components: Vec<_> = graph + .get_all_components(Some(AnnotationComponentType::Coverage), None) + .into_iter() + .filter(|c| c != &inherited_cov_component) + .collect(); + + let all_cov_gs: Vec<_> = all_cov_components + .iter() + .filter_map(|c| graph.get_graphstorage(c)) + .collect(); + let all_dom_components = graph.get_all_components(Some(AnnotationComponentType::Dominance), None); + let all_text_coverage_components: Vec = + [all_cov_components, all_dom_components].concat(); // go over each node and calculate the left-most and right-most token for invalid in self.invalid_nodes.iter()? 
{ @@ -297,8 +292,9 @@ impl AQLUpdateGraphIndex { let covered_token = calculate_inherited_coverage_edges( graph, n, - &all_cov_components, - &all_dom_components, + &all_cov_gs, + &all_text_coverage_components, + &inherited_cov_component, )?; self.calculate_token_alignment( graph, From 16c338ee1b877526d3b08e8db6379f6d77ce2a3f Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 20 Aug 2024 17:16:32 +0200 Subject: [PATCH 5/6] Fix clippy issue --- graphannis/src/annis/db/aql/model.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/graphannis/src/annis/db/aql/model.rs b/graphannis/src/annis/db/aql/model.rs index b13684dde..fd1e9f5ab 100644 --- a/graphannis/src/annis/db/aql/model.rs +++ b/graphannis/src/annis/db/aql/model.rs @@ -143,16 +143,14 @@ fn calculate_inherited_coverage_edges( direct_coverage_targets.insert(target?); } } - let inherited_gs_cov = graph.get_or_create_writable(&inherited_cov_component)?; + let inherited_gs_cov = graph.get_or_create_writable(inherited_cov_component)?; for target in &covered_token { - if n != *target { - if !direct_coverage_targets.contains(target) { - inherited_gs_cov.add_edge(Edge { - source: n, - target: *target, - })?; - } + if n != *target && !direct_coverage_targets.contains(target) { + inherited_gs_cov.add_edge(Edge { + source: n, + target: *target, + })?; } } From bc93aad8b8e722c2736bac6803403e917a16236d Mon Sep 17 00:00:00 2001 From: Thomas Krause Date: Tue, 20 Aug 2024 17:25:01 +0200 Subject: [PATCH 6/6] Use authenticated access to get the latest release --- .github/workflows/release_capi.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release_capi.yml b/.github/workflows/release_capi.yml index a16e26141..75f3218cd 100644 --- a/.github/workflows/release_capi.yml +++ b/.github/workflows/release_capi.yml @@ -2,9 +2,9 @@ on: release: types: [published] workflow_run: - workflows: + workflows: - Release - types: + types: - completed pull_request: 
types: [labeled] @@ -20,6 +20,7 @@ jobs: uses: pozetroninc/github-action-get-latest-release@v0.7.0 with: repository: ${{ github.repository }} + token: ${{ secrets.GITHUB_TOKEN }} - uses: actions/checkout@v2 - uses: actions-rs/toolchain@v1.0.6 with: @@ -47,6 +48,7 @@ jobs: uses: pozetroninc/github-action-get-latest-release@v0.7.0 with: repository: ${{ github.repository }} + token: ${{ secrets.GITHUB_TOKEN }} - uses: actions/checkout@v2 - uses: actions-rs/toolchain@v1.0.6 with: @@ -74,6 +76,7 @@ jobs: uses: pozetroninc/github-action-get-latest-release@v0.7.0 with: repository: ${{ github.repository }} + token: ${{ secrets.GITHUB_TOKEN }} - uses: actions/checkout@v2 - uses: actions-rs/toolchain@v1.0.6 with: