From 6357ccaa0e5268d57f20d6d7444f66144dc15c36 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Tue, 17 Oct 2023 21:44:07 +0800 Subject: [PATCH] rework exports move snippet and advice make indexer pub, remove indexer reexports --- examples/snippet.rs | 3 +- src/core/index.rs | 2 +- src/directory/mmap_directory.rs | 4 +- src/indexer/merge_operation.rs | 3 ++ src/indexer/mod.rs | 32 +++++++++------- src/indexer/segment_writer.rs | 2 + src/lib.rs | 10 +---- src/snippet/mod.rs | 65 +++++++++++++++++++++++++++++++-- src/termdict/mod.rs | 5 +-- tokenizer-api/src/lib.rs | 2 +- 10 files changed, 95 insertions(+), 33 deletions(-) diff --git a/examples/snippet.rs b/examples/snippet.rs index 263da6eb47..31bd2c166f 100644 --- a/examples/snippet.rs +++ b/examples/snippet.rs @@ -10,7 +10,8 @@ use tantivy::collector::TopDocs; use tantivy::query::QueryParser; use tantivy::schema::*; -use tantivy::{doc, Index, IndexWriter, Snippet, SnippetGenerator}; +use tantivy::snippet::{Snippet, SnippetGenerator}; +use tantivy::{doc, Index, IndexWriter}; use tempfile::TempDir; fn main() -> tantivy::Result<()> { diff --git a/src/core/index.rs b/src/core/index.rs index 01b24f4b4d..40905a8d54 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -18,11 +18,11 @@ use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_L use crate::error::{DataCorruption, TantivyError}; use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_BUDGET_NUM_BYTES_MIN}; use crate::indexer::segment_updater::save_metas; +use crate::indexer::IndexWriter; use crate::reader::{IndexReader, IndexReaderBuilder}; use crate::schema::document::Document; use crate::schema::{Field, FieldType, Schema}; use crate::tokenizer::{TextAnalyzer, TokenizerManager}; -use crate::IndexWriter; fn load_metas( directory: &dyn Directory, diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index b8a62f83fd..781fe7e204 100644 --- a/src/directory/mmap_directory.rs +++ 
b/src/directory/mmap_directory.rs @@ -8,6 +8,8 @@ use std::sync::{Arc, RwLock, Weak}; use common::StableDeref; use fs4::FileExt; +#[cfg(all(feature = "mmap", unix))] +pub use memmap2::Advice; use memmap2::Mmap; use serde::{Deserialize, Serialize}; use tempfile::TempDir; @@ -21,8 +23,6 @@ use crate::directory::{ AntiCallToken, Directory, DirectoryLock, FileHandle, Lock, OwnedBytes, TerminatingWrite, WatchCallback, WatchHandle, WritePtr, }; -#[cfg(unix)] -use crate::Advice; pub type ArcBytes = Arc + Send + Sync + 'static>; pub type WeakArcBytes = Weak + Send + Sync + 'static>; diff --git a/src/indexer/merge_operation.rs b/src/indexer/merge_operation.rs index 6d547ff3e9..90e5ee86ca 100644 --- a/src/indexer/merge_operation.rs +++ b/src/indexer/merge_operation.rs @@ -63,10 +63,13 @@ impl MergeOperation { } } + /// Returns the opstamp up to which we want to consume the delete queue and reflect their + /// deletes. pub fn target_opstamp(&self) -> Opstamp { self.inner.target_opstamp } + /// Returns the list of segments to be merged. pub fn segment_ids(&self) -> &[SegmentId] { &self.inner.segment_ids[..] } diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index 79b0c826fd..d5ae094b5c 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -1,23 +1,29 @@ -pub mod delete_queue; +//! Indexing and merging data. +//! +//! Contains code to create and merge segments. +//! `IndexWriter` is the main entry point for that, which is created from +//! [`Index::writer`](crate::Index::writer). 
-pub mod doc_id_mapping; +pub(crate) mod delete_queue; + +pub(crate) mod doc_id_mapping; mod doc_opstamp_mapping; mod flat_map_with_buffer; -pub mod index_writer; -mod index_writer_status; +pub(crate) mod index_writer; +pub(crate) mod index_writer_status; mod log_merge_policy; mod merge_operation; -pub mod merge_policy; -pub mod merger; +pub(crate) mod merge_policy; +pub(crate) mod merger; mod merger_sorted_index_test; -pub mod operation; -pub mod prepared_commit; +pub(crate) mod operation; +pub(crate) mod prepared_commit; mod segment_entry; mod segment_manager; mod segment_register; -pub mod segment_serializer; -pub mod segment_updater; -mod segment_writer; +pub(crate) mod segment_serializer; +pub(crate) mod segment_updater; +pub(crate) mod segment_writer; mod stamper; use crossbeam_channel as channel; @@ -27,10 +33,10 @@ pub use self::index_writer::IndexWriter; pub use self::log_merge_policy::LogMergePolicy; pub use self::merge_operation::MergeOperation; pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy}; +pub use self::operation::UserOperation; pub use self::prepared_commit::PreparedCommit; pub use self::segment_entry::SegmentEntry; -pub use self::segment_manager::SegmentManager; -pub use self::segment_serializer::SegmentSerializer; +pub(crate) use self::segment_serializer::SegmentSerializer; pub use self::segment_updater::{merge_filtered_segments, merge_indices}; pub use self::segment_writer::SegmentWriter; use crate::indexer::operation::AddOperation; diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 214d3ded4e..bb7d66077a 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -155,6 +155,8 @@ impl SegmentWriter { Ok(doc_opstamps) } + /// Returns an estimation of the current memory usage of the segment writer. + /// If the mem usage exceeds the `memory_budget`, the segment will be serialized. 
pub fn mem_usage(&self) -> usize { self.ctx.mem_usage() + self.fieldnorms_writer.mem_usage() diff --git a/src/lib.rs b/src/lib.rs index fb05b2eb73..ab99bcb7b1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -137,7 +137,7 @@ pub use crate::future_result::FutureResult; pub type Result = std::result::Result; mod core; -mod indexer; +pub mod indexer; #[allow(unused_doc_comments)] pub mod error; @@ -161,8 +161,7 @@ pub mod termdict; mod reader; pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy, Warmer}; -mod snippet; -pub use self::snippet::{Snippet, SnippetGenerator}; +pub mod snippet; mod docset; use std::fmt; @@ -181,8 +180,6 @@ pub use crate::core::{ SegmentReader, SingleSegmentIndexWriter, }; pub use crate::directory::Directory; -pub use crate::indexer::operation::UserOperation; -pub use crate::indexer::{merge_filtered_segments, merge_indices, IndexWriter, PreparedCommit}; pub use crate::postings::Postings; #[allow(deprecated)] pub use crate::schema::DatePrecision; @@ -191,9 +188,6 @@ pub use crate::schema::{DateOptions, DateTimePrecision, Document, TantivyDocumen /// Index format version. const INDEX_FORMAT_VERSION: u32 = 5; -#[cfg(all(feature = "mmap", unix))] -pub use memmap2::Advice; - /// Structure version for the index. #[derive(Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct Version { diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs index c5884d6aea..6542df5a3e 100644 --- a/src/snippet/mod.rs +++ b/src/snippet/mod.rs @@ -1,3 +1,59 @@ +//! [`SnippetGenerator`] +//! Generates a text snippet for a given document, and some highlighted parts inside it. +//! Imagine you are doing a text search in a document +//! and want to show a preview of where in the document the search terms occur, +//! along with some surrounding text to give context, and the search terms highlighted. +//! +//! [`SnippetGenerator`] serves this purpose. +//! It scans a document and constructs a snippet, which consists of sections where the search terms +//! 
have been found, stitched together with "..." in between sections if necessary. +//! +//! ## Example +//! +//! ```rust +//! # use tantivy::query::QueryParser; +//! # use tantivy::schema::{Schema, TEXT}; +//! # use tantivy::{doc, Index}; +//! use tantivy::snippet::SnippetGenerator; +//! +//! # fn main() -> tantivy::Result<()> { +//! # let mut schema_builder = Schema::builder(); +//! # let text_field = schema_builder.add_text_field("text", TEXT); +//! # let schema = schema_builder.build(); +//! # let index = Index::create_in_ram(schema); +//! # let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?; +//! # let doc = doc!(text_field => r#"Comme je descendais des Fleuves impassibles, +//! # Je ne me sentis plus guidé par les haleurs : +//! # Des Peaux-Rouges criards les avaient pris pour cibles, +//! # Les ayant cloués nus aux poteaux de couleurs. +//! # +//! # J'étais insoucieux de tous les équipages, +//! # Porteur de blés flamands ou de cotons anglais. +//! # Quand avec mes haleurs ont fini ces tapages, +//! # Les Fleuves m'ont laissé descendre où je voulais. +//! # "#); +//! # index_writer.add_document(doc.clone())?; +//! # index_writer.commit()?; +//! # let query_parser = QueryParser::for_index(&index, vec![text_field]); +//! // ... +//! let query = query_parser.parse_query("haleurs flamands").unwrap(); +//! # let reader = index.reader()?; +//! # let searcher = reader.searcher(); +//! let mut snippet_generator = SnippetGenerator::create(&searcher, &*query, text_field)?; +//! snippet_generator.set_max_num_chars(100); +//! let snippet = snippet_generator.snippet_from_doc(&doc); +//! let snippet_html: String = snippet.to_html(); +//! assert_eq!(snippet_html, "Comme je descendais des Fleuves impassibles,\n Je ne me sentis plus guidé par les haleurs :\n Des"); +//! # Ok(()) +//! # } +//! ``` +//! +//! You can also specify the maximum number of characters for the snippets generated with the +//! `set_max_num_chars` method. 
By default, this limit is set to 150. +//! +//! SnippetGenerator needs to be created from the `Searcher` and the query, and the field on which +//! the `SnippetGenerator` should generate the snippets. + use std::cmp::Ordering; use std::collections::{BTreeMap, BTreeSet}; use std::ops::Range; @@ -16,7 +72,7 @@ const DEFAULT_SNIPPET_PREFIX: &str = ""; const DEFAULT_SNIPPET_POSTFIX: &str = ""; #[derive(Debug)] -pub struct FragmentCandidate { +pub(crate) struct FragmentCandidate { score: Score, start_offset: usize, stop_offset: usize, @@ -256,7 +312,7 @@ fn is_sorted(mut it: impl Iterator) -> bool { /// # use tantivy::query::QueryParser; /// # use tantivy::schema::{Schema, TEXT}; /// # use tantivy::{doc, Index}; -/// use tantivy::SnippetGenerator; +/// use tantivy::snippet::SnippetGenerator; /// /// # fn main() -> tantivy::Result<()> { /// # let mut schema_builder = Schema::builder(); @@ -346,7 +402,7 @@ impl SnippetGenerator { }) } - /// Sets a maximum number of chars. + /// Sets a maximum number of chars. Default is 150. pub fn set_max_num_chars(&mut self, max_num_chars: usize) { self.max_num_chars = max_num_chars; } @@ -398,8 +454,9 @@ mod tests { use super::{collapse_overlapped_ranges, search_fragments, select_best_fragment_combination}; use crate::query::QueryParser; use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT}; + use crate::snippet::SnippetGenerator; use crate::tokenizer::{NgramTokenizer, SimpleTokenizer}; - use crate::{Index, SnippetGenerator}; + use crate::Index; const TEST_TEXT: &str = r#"Rust is a systems programming language sponsored by Mozilla which describes it as a "safe, concurrent, practical language", supporting functional and diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index f0d2a3b9cd..cb546c5fce 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -1,5 +1,5 @@ //! The term dictionary main role is to associate the sorted [`Term`s](crate::Term) to -//! 
a [`TermInfo`](crate::postings::TermInfo) struct that contains some meta-information +//! a [`TermInfo`] struct that contains some meta-information //! about the term. //! //! Internally, the term dictionary relies on the `fst` crate to store @@ -16,8 +16,7 @@ //! `f64`-terms are transformed to `u64` using a mapping that preserve order, and are then treated //! as `u64`. //! -//! A second datastructure makes it possible to access a -//! [`TermInfo`](crate::postings::TermInfo). +//! A second datastructure makes it possible to access a [`TermInfo`]. #[cfg(not(feature = "quickwit"))] mod fst_termdict; diff --git a/tokenizer-api/src/lib.rs b/tokenizer-api/src/lib.rs index 1a95ca7b33..1ed4c9b8af 100644 --- a/tokenizer-api/src/lib.rs +++ b/tokenizer-api/src/lib.rs @@ -2,7 +2,7 @@ //! ready for indexing. This is an seperate crate from tantivy, so implementors don't need to update //! for each new tantivy version. //! -//! To add support for a tokenizer, implement the [`Tokenizer`](crate::Tokenizer) trait. +//! To add support for a tokenizer, implement the [`Tokenizer`] trait. //! Checkout the [tantivy repo](https://github.com/quickwit-oss/tantivy/tree/main/src/tokenizer) for some examples. use std::borrow::{Borrow, BorrowMut};