From f1db1b0f9b08ebc5ab47f690c65d76c2768ce909 Mon Sep 17 00:00:00 2001 From: cncases <156569830+cncases@users.noreply.github.com> Date: Fri, 25 Oct 2024 11:52:53 +0000 Subject: [PATCH 1/6] fjall --- Cargo.lock | 318 +++++++++++++++++++++++++++++---------------- Cargo.toml | 2 +- config.toml | 2 +- src/bin/convert.rs | 30 +++-- src/bin/main.rs | 15 +-- src/lib.rs | 4 +- 6 files changed, 237 insertions(+), 134 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ca75db6..24205b3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -213,26 +213,6 @@ dependencies = [ "serde", ] -[[package]] -name = "bindgen" -version = "0.69.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" -dependencies = [ - "bitflags", - "cexpr", - "clang-sys", - "itertools", - "lazy_static", - "lazycell", - "proc-macro2", - "quote", - "regex", - "rustc-hash", - "shlex", - "syn", -] - [[package]] name = "bitflags" version = "2.6.0" @@ -266,17 +246,6 @@ version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ac0150caa2ae65ca5bd83f25c7de183dea78d4d366469f148435e2acfbad0da" -[[package]] -name = "bzip2-sys" -version = "0.1.11+1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - [[package]] name = "cases" version = "0.0.6" @@ -286,9 +255,9 @@ dependencies = [ "basic-toml", "bincode", "csv", + "fjall", "indexmap", "jieba-rs", - "rocksdb", "serde", "stop-words", "tantivy", @@ -327,32 +296,12 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4f4c707c6a209cbe82d10abd08e1ea8995e9ea937d2550646e02798948992be0" -[[package]] -name = "cexpr" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" -dependencies = [ - "nom", -] - [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" -[[package]] -name = "clang-sys" -version = "1.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" -dependencies = [ - "glob", - "libc", - "libloading", -] - [[package]] name = "crc32fast" version = "1.4.2" @@ -390,6 +339,16 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-skiplist" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df29de440c58ca2cc6e587ec3d22347551a32435fbde9d2bff64e78a9ffa151b" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.20" @@ -458,6 +417,20 @@ dependencies = [ "syn", ] +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + [[package]] name = "deranged" version = "0.3.11" @@ -521,6 +494,12 @@ dependencies = [ "syn", ] +[[package]] +name = "double-ended-peekable" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0d05e1c0dbad51b52c38bda7adceef61b9efc2baf04acfe8726a8c4630a6f57" + [[package]] name = "downcast-rs" version = "1.2.1" @@ -533,6 +512,18 @@ version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +[[package]] +name = "enum_dispatch" +version = "0.3.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa18ce2bc66555b3218614519ac839ddb759a7d6720732f979ef8d13be147ecd" +dependencies = [ + "once_cell", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -561,6 +552,22 @@ version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" +[[package]] +name = "fjall" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "454ffab4307857262156b33ec6d3e61c07a7cda1062b26d368d75aca6d2d16d8" +dependencies = [ + "byteorder", + "dashmap", + "log", + "lsm-tree", + "path-absolutize", + "std-semaphore", + "tempfile", + "xxhash-rust", +] + [[package]] name = "flate2" version = "1.0.34" @@ -668,10 +675,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" [[package]] -name = "glob" -version = "0.3.1" +name = "guardian" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" +checksum = "493913a18c0d7bebb75127a26a432162c59edbe06f6cf712001e3e769345e8b5" [[package]] name = "h2" @@ -692,6 +699,12 @@ dependencies = [ "tracing", ] +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + [[package]] name = "hashbrown" version = "0.15.0" @@ -810,7 +823,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" dependencies = [ "equivalent", - "hashbrown", + "hashbrown 0.15.0", ] [[package]] @@ -879,12 +892,6 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" -[[package]] -name = "lazycell" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" - [[package]] name = "levenshtein_automata" version = "0.2.1" @@ -897,16 +904,6 @@ version = "0.2.161" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1" -[[package]] -name = "libloading" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4979f22fdb869068da03c9f7528f8297c6fd2606bc3a4affe42e6a823fdb8da4" -dependencies = [ - "cfg-if", - "windows-targets", -] - [[package]] name = "libm" version = "0.2.8" @@ -914,36 +911,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" [[package]] -name = "librocksdb-sys" -version = "0.16.0+8.10.0" +name = "linux-raw-sys" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce3d60bc059831dc1c83903fb45c103f75db65c5a7bf22272764d9cc683e348c" -dependencies = [ - "bindgen", - "bzip2-sys", - "cc", - "glob", - "libc", - "libz-sys", -] +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] -name = "libz-sys" -version = "1.1.20" +name = "lock_api" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2d16453e800a8cf6dd2fc3eb4bc99b786a9b90c663b8559a5b1a041bf89e472" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" dependencies = [ - "cc", - "pkg-config", - "vcpkg", + "autocfg", + "scopeguard", ] -[[package]] -name = "linux-raw-sys" -version = "0.4.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" - [[package]] name = "lockfree-object-pool" version = "0.1.6" @@ -962,7 +944,31 @@ version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" dependencies = [ - "hashbrown", + "hashbrown 0.15.0", +] + +[[package]] +name = "lsm-tree" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be3590edb2af2fe0c4272ba25fa7c86bf7e242c8e979980c1f114f683f34aa23" +dependencies = [ + "byteorder", + "crossbeam-skiplist", + "double-ended-peekable", + "enum_dispatch", + "guardian", + "log", + "miniz_oxide", + "path-absolutize", + "quick_cache", + "rustc-hash 2.0.0", + "self_cell", + "smallvec", + "tempfile", + "value-log", + "varint-rs", + "xxhash-rust", ] [[package]] @@ -1027,6 +1033,12 @@ dependencies = [ "unicase", ] +[[package]] +name = "min-max-heap" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2687e6cf9c00f48e9284cf9fd15f2ef341d03cc7743abf9df4c5f07fdee50b18" + [[package]] name = "minimal-lexical" version = "0.2.1" @@ -1142,6 +1154,37 @@ dependencies = [ "stable_deref_trait", ] +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "path-absolutize" +version = "3.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4af381fe79fa195b4909485d99f73a80792331df0625188e707854f0b3383f5" +dependencies = [ + "path-dedot", +] + +[[package]] +name = "path-dedot" +version = "3.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07ba0ad7e047712414213ff67533e6dd477af0a4e1d14fb52343e53d30ea9397" +dependencies = [ + "once_cell", +] + [[package]] name = "percent-encoding" version = "2.3.1" @@ -1228,6 +1271,16 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "quick_cache" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d7c94f8935a9df96bb6380e8592c70edf497a643f94bd23b2f76b399385dbf4" +dependencies = [ + "equivalent", + "hashbrown 0.14.5", +] + [[package]] name = "quote" version = "1.0.37" @@ -1297,6 +1350,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "redox_syscall" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" +dependencies = [ + "bitflags", +] + [[package]] name = "regex" version = "1.11.0" @@ -1341,16 +1403,6 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" -[[package]] -name = "rocksdb" -version = "0.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bd13e55d6d7b8cd0ea569161127567cd587676c99f4472f779a0279aa60a7a7" -dependencies = [ - "libc", - "librocksdb-sys", -] - [[package]] name = "rust-stemmers" version = "1.2.0" @@ -1373,6 +1425,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +[[package]] +name = "rustc-hash" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" + [[package]] name = "rustix" version = "0.38.37" @@ -1398,6 +1456,18 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "self_cell" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d369a96f978623eb3dc28807c4852d6cc617fed53da5d3c400feff1ef34a714a" + [[package]] name = "serde" version = "1.0.213" @@ -1509,6 +1579,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "std-semaphore" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ae9eec00137a8eed469fb4148acd9fc6ac8c3f9b110f52cd34698c8b5bfa0e" + [[package]] name = "stop-words" version = "0.8.0" @@ -1579,7 +1655,7 @@ dependencies = [ "rayon", "regex", "rust-stemmers", - "rustc-hash", + "rustc-hash 1.1.0", "serde", "serde_json", "sketches-ddsketch", @@ -1956,10 +2032,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" [[package]] -name = "vcpkg" -version = "0.2.15" +name = "value-log" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +checksum = "701aa53f40cdebc413fd3a1e6637c118e6a8d36e40736206f374e3722f0ddf53" +dependencies = [ + "byteorder", + "log", + "min-max-heap", + "path-absolutize", + "quick_cache", + "rustc-hash 2.0.0", + "tempfile", + "xxhash-rust", +] + +[[package]] +name = "varint-rs" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f54a172d0620933a27a4360d3db3e2ae0dd6cceae9730751a036bbf182c4b23" [[package]] name = "wasi" @@ -2136,6 +2228,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "xxhash-rust" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a5cbf750400958819fb6178eaa83bee5cd9c29a26a40cc241df8c70fdd46984" + [[package]] name = "zerocopy" version = "0.7.35" diff --git a/Cargo.toml b/Cargo.toml index 68c5ae7..7966a09 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,8 +9,8 @@ axum = { version = "0.7", features = ["http2", "query", "tokio", "http1"], defau basic-toml = "*" bincode = "1.3.3" csv = "1" +fjall = { version = "2.2.0", default-features = false, features = ["bloom", "single_writer_tx", "miniz"] } indexmap = "2" -rocksdb = { version = "0.22.0", default-features = false, features = ["snappy"] } serde = { version = "1", features = ["derive"] } stop-words = "0.8.0" tantivy = "0.22.0" diff --git a/config.toml b/config.toml index 3541a7a..e3c2de2 100644 --- a/config.toml +++ b/config.toml @@ -1,4 +1,4 @@ -db = "./rocksdb" # path to store rocksdb +db = "fjall" # path to store rocksdb index_path ="./search_index" # path to store index index_with_full_text = false # whether establish full-text index addr = "127.0.0.1:8081" # If allow LAN access, change it to "0.0.0.0:port". diff --git a/src/bin/convert.rs b/src/bin/convert.rs index b3e49d6..568a62e 100644 --- a/src/bin/convert.rs +++ b/src/bin/convert.rs @@ -1,5 +1,5 @@ use cases::{Case, CONFIG}; -use rocksdb::{WriteBatchWithTransaction, DB}; +use fjall::Config; use std::fs; use tracing::info; @@ -12,7 +12,10 @@ fn convert(raw_path: &str, db_path: &str) { let time = std::time::Instant::now(); let mut ft = Vec::with_capacity(1024); let mut id: u32 = 0; - let db = DB::open_default(db_path).unwrap(); + let keyspace = Config::new(db_path).open().unwrap(); + let db = keyspace + .open_partition("cases", Default::default()) + .unwrap(); for subdir in fs::read_dir(raw_path).unwrap() { let subdir = subdir.unwrap(); let subdir_path = subdir.path().to_str().unwrap().to_string(); @@ -28,7 +31,7 @@ fn convert(raw_path: &str, db_path: &str) { let mut rdr = csv::Reader::from_reader(file); for result in rdr.deserialize() { id += 1; - if db.key_may_exist(id.to_be_bytes()) { + if db.contains_key(id.to_be_bytes()).unwrap() { info!("skipping {}", id); continue; } @@ -45,13 +48,17 @@ fn convert(raw_path: &str, db_path: &str) { }); ft.push((id, case)); - if ft.len() >= 1024 { + if ft.len() >= 10240 { info!("inserting {id}, time: {}", time.elapsed().as_secs()); - let mut batch = WriteBatchWithTransaction::::default(); + let mut batch = keyspace.batch(); for (id, case) in ft.iter() { - batch.put((*id).to_be_bytes(), bincode::serialize(case).unwrap()); + batch.insert( + &db, + (*id).to_be_bytes(), + bincode::serialize(case).unwrap(), + ); } - db.write(batch).unwrap(); + batch.commit().unwrap(); ft.clear(); } } @@ -63,13 +70,12 @@ fn convert(raw_path: &str, db_path: &str) { } if !ft.is_empty() { - info!("inserting {id}"); - let mut batch = WriteBatchWithTransaction::::default(); + info!("inserting {id}, time: {}", time.elapsed().as_secs()); + let mut batch = keyspace.batch(); for (id, case) in ft.iter() { - batch.put((*id).to_be_bytes(), bincode::serialize(case).unwrap()); + batch.insert(&db, (*id).to_be_bytes(), bincode::serialize(case).unwrap()); } - db.write(batch).unwrap(); + batch.commit().unwrap(); ft.clear(); - drop(db); } } diff --git a/src/bin/main.rs b/src/bin/main.rs index 2f930fe..4e173a4 100644 --- a/src/bin/main.rs +++ b/src/bin/main.rs @@ -1,6 +1,6 @@ use axum::{routing::get, Router}; use cases::{case, logo, search, style, AppState, Tan, CONFIG}; -use rocksdb::{Options, DB}; +use fjall::Config; use std::{net::SocketAddr, sync::Arc, time::Duration}; use tokio::net::TcpListener; use tower::ServiceBuilder; @@ -20,15 +20,13 @@ async fn main() { .init(); let addr: SocketAddr = CONFIG.addr.parse().unwrap(); - info!("listening on http://{}", addr); - let searcher = Arc::new(Tan::searcher().unwrap()); - let db = DB::open_for_read_only(&Options::default(), CONFIG.db.as_str(), true).unwrap(); - let app_state = AppState { - db: Arc::new(db), - searcher, - }; + let keyspace = Config::new(CONFIG.db.as_str()).open().unwrap(); + let db = keyspace + .open_partition("cases", Default::default()) + .unwrap(); + let app_state = AppState { db, searcher }; let middleware_stack = ServiceBuilder::new() .layer(CompressionLayer::new()) @@ -43,6 +41,7 @@ async fn main() { .layer(middleware_stack) .with_state(app_state); + info!("listening on http://{}", addr); let listener = TcpListener::bind(addr).await.unwrap(); axum::serve(listener, app).await.unwrap(); } diff --git a/src/lib.rs b/src/lib.rs index 6ab08b5..39926d7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,6 @@ pub use config::CONFIG; pub use controller::{case, logo, search, style}; -use rocksdb::DB; +use fjall::PartitionHandle; use serde::{Deserialize, Serialize}; use std::sync::Arc; use tantivy::Searcher; @@ -12,7 +12,7 @@ mod tantivy; #[derive(Clone)] pub struct AppState { - pub db: Arc, + pub db: PartitionHandle, pub searcher: Arc, } From 8f0a56f4bdbe4d9a38c2120f3ce089368eeffce5 Mon Sep 17 00:00:00 2001 From: cncases <156569830+cncases@users.noreply.github.com> Date: Fri, 25 Oct 2024 19:54:24 +0800 Subject: [PATCH 2/6] Update ci.yml --- .github/workflows/ci.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e355ba0..4344d77 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,9 +1,6 @@ name: CHECK on: - push: - branches: - - main pull_request: branches: - '**' @@ -49,4 +46,4 @@ jobs: with: sarif_file: rust-clippy-results.sarif wait-for-processing: true - \ No newline at end of file + From ded0014969fd7249a539877373cd165a74bba131 Mon Sep 17 00:00:00 2001 From: cncases <156569830+cncases@users.noreply.github.com> Date: Sat, 26 Oct 2024 08:18:14 +0000 Subject: [PATCH 3/6] fjall --- Cargo.lock | 7 ++----- Cargo.toml | 2 +- src/bin/convert.rs | 42 +++++++++++++++++++++++++++++------------- 3 files changed, 32 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 24205b3..1dc8f62 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -555,8 +555,7 @@ checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" [[package]] name = "fjall" version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "454ffab4307857262156b33ec6d3e61c07a7cda1062b26d368d75aca6d2d16d8" +source = "git+https://github.com/fjall-rs/fjall?branch=casetest#c79674803c2ac1fc5279edb17d5814e6be8c8d8d" dependencies = [ "byteorder", "dashmap", @@ -950,8 +949,7 @@ dependencies = [ [[package]] name = "lsm-tree" version = "2.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be3590edb2af2fe0c4272ba25fa7c86bf7e242c8e979980c1f114f683f34aa23" +source = "git+https://github.com/fjall-rs/lsm-tree?branch=63#1b15c4d9384d22cfc72d81867cd8e84aa025ce29" dependencies = [ "byteorder", "crossbeam-skiplist", @@ -964,7 +962,6 @@ dependencies = [ "quick_cache", "rustc-hash 2.0.0", "self_cell", - "smallvec", "tempfile", "value-log", "varint-rs", diff --git a/Cargo.toml b/Cargo.toml index 7966a09..22536fe 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ axum = { version = "0.7", features = ["http2", "query", "tokio", "http1"], defau basic-toml = "*" bincode = "1.3.3" csv = "1" -fjall = { version = "2.2.0", default-features = false, features = ["bloom", "single_writer_tx", "miniz"] } +fjall = { git = "https://github.com/fjall-rs/fjall", branch = "casetest", default-features = false, features = ["bloom", "single_writer_tx", "miniz"] } indexmap = "2" serde = { version = "1", features = ["derive"] } stop-words = "0.8.0" diff --git a/src/bin/convert.rs b/src/bin/convert.rs index 568a62e..c094e48 100644 --- a/src/bin/convert.rs +++ b/src/bin/convert.rs @@ -1,5 +1,5 @@ use cases::{Case, CONFIG}; -use fjall::Config; +use fjall::{Config, KvSeparationOptions, PartitionCreateOptions}; use std::fs; use tracing::info; @@ -12,9 +12,21 @@ fn convert(raw_path: &str, db_path: &str) { let time = std::time::Instant::now(); let mut ft = Vec::with_capacity(1024); let mut id: u32 = 0; - let keyspace = Config::new(db_path).open().unwrap(); + let keyspace = Config::new(db_path) + .max_write_buffer_size(256_000_000) + .open() + .unwrap(); let db = keyspace - .open_partition("cases", Default::default()) + .open_partition( + "cases", + PartitionCreateOptions::default() + .max_memtable_size(128_000_000) + .with_kv_separation( + KvSeparationOptions::default() + .separation_threshold(750) + .file_target_size(256_000_000), + ), + ) .unwrap(); for subdir in fs::read_dir(raw_path).unwrap() { let subdir = subdir.unwrap(); @@ -24,6 +36,7 @@ fn convert(raw_path: &str, db_path: &str) { let file = fs::File::open(&subdir_path).unwrap(); let mut archive = zip::ZipArchive::new(file).unwrap(); + let mut buf = String::new(); for i in 0..archive.len() { let file = archive.by_index(i).unwrap(); let raw_name = file.name(); @@ -32,20 +45,23 @@ fn convert(raw_path: &str, db_path: &str) { for result in rdr.deserialize() { id += 1; if db.contains_key(id.to_be_bytes()).unwrap() { - info!("skipping {}", id); + if id % 10000 == 0 { + info!("skipping {}", id); + } continue; } let mut case: Case = result.unwrap(); - case.full_text = - case.full_text - .split_whitespace() - .fold(String::new(), |mut acc, x| { - acc.push_str("

"); - acc.push_str(x); - acc.push_str("

"); - acc - }); + + case.full_text.split_whitespace().for_each(|word| { + buf.push_str("

"); + buf.push_str(word); + buf.push_str("

"); + }); + + case.full_text = buf.clone(); + buf.clear(); + ft.push((id, case)); if ft.len() >= 10240 { From ce138e44b6d6317f22b0c864918b18b185c2dfbe Mon Sep 17 00:00:00 2001 From: cncases <156569830+cncases@users.noreply.github.com> Date: Mon, 28 Oct 2024 07:59:57 +0000 Subject: [PATCH 4/6] update --- .gitignore | 3 ++- Cargo.lock | 21 ++++++++++----------- Cargo.toml | 2 +- src/bin/main.rs | 9 ++------- 4 files changed, 15 insertions(+), 20 deletions(-) diff --git a/.gitignore b/.gitignore index 05d70ea..c0faee2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ /target /search_index /rocksdb -裁判文书全量数据(已完成) \ No newline at end of file +裁判文书全量数据(已完成) +/fjall \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 1dc8f62..4f15c54 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -905,9 +905,9 @@ checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1" [[package]] name = "libm" -version = "0.2.8" +version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" +checksum = "a00419de735aac21d53b0de5ce2c03bd3627277cf471300f27ebc89f7d828047" [[package]] name = "linux-raw-sys" @@ -1228,9 +1228,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.14" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" +checksum = "915a1e146535de9163f3987b8944ed8cf49a18bb0056bcebcdcece385cece4ff" [[package]] name = "pin-utils" @@ -1358,9 +1358,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.11.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38200e5ee88914975b69f657f0801b6f6dccafd44fd9326302a4aaeecfacb1d8" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", @@ -1430,9 +1430,9 @@ checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" [[package]] name = "rustix" -version = "0.38.37" +version = "0.38.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8acb788b847c24f28525660c4d7758620a7210875711f79e7f663cc152726811" +checksum = "aa260229e6538e52293eeb577aabd09945a09d6d9cc0fc550ed7529056c2e32a" dependencies = [ "bitflags", "errno", @@ -1918,7 +1918,6 @@ dependencies = [ "tokio-util", "tower-layer", "tower-service", - "tracing", ] [[package]] @@ -2030,9 +2029,9 @@ checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" [[package]] name = "value-log" -version = "1.1.0" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "701aa53f40cdebc413fd3a1e6637c118e6a8d36e40736206f374e3722f0ddf53" +checksum = "71f2774a6212a226657ca9ce1aae573b711eb014ce7590b48fc3b483d7d47bbe" dependencies = [ "byteorder", "log", diff --git a/Cargo.toml b/Cargo.toml index 22536fe..77e4264 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,7 +17,7 @@ tantivy = "0.22.0" tantivy-jieba = "0.11.0" tokio = { version = "1", features = ["macros", "rt-multi-thread"] } tower = "0.5.1" -tower-http = { version = "0.6.1", features = ["compression-zstd", "trace", "timeout"] } +tower-http = { version = "0.6.1", features = ["compression-zstd", "timeout"] } tracing = { version = "0.1", features = ["release_max_level_info", "max_level_info"] } tracing-subscriber = { version = "0.3", features = ["env-filter"] } zip = { version = "2.2.0", default-features = false, features = ["deflate"] } diff --git a/src/bin/main.rs b/src/bin/main.rs index 4e173a4..1bcb6fd 100644 --- a/src/bin/main.rs +++ b/src/bin/main.rs @@ -4,12 +4,8 @@ use fjall::Config; use std::{net::SocketAddr, sync::Arc, time::Duration}; use tokio::net::TcpListener; use tower::ServiceBuilder; -use tower_http::{ - compression::CompressionLayer, - timeout::TimeoutLayer, - trace::{DefaultMakeSpan, TraceLayer}, -}; -use tracing::{info, Level}; +use tower_http::{compression::CompressionLayer, timeout::TimeoutLayer}; +use tracing::info; use tracing_subscriber::{layer::SubscriberExt, util::SubscriberInitExt}; #[tokio::main] @@ -30,7 +26,6 @@ async fn main() { let middleware_stack = ServiceBuilder::new() .layer(CompressionLayer::new()) - .layer(TraceLayer::new_for_http().make_span_with(DefaultMakeSpan::new().level(Level::INFO))) .layer(TimeoutLayer::new(Duration::from_secs(10))); let app = Router::new() From 2b0b2a5dfe5aabbd7d29f572e965c59b61aa3776 Mon Sep 17 00:00:00 2001 From: cncases <156569830+cncases@users.noreply.github.com> Date: Sat, 2 Nov 2024 06:02:15 +0000 Subject: [PATCH 5/6] update --- Cargo.lock | 26 ++++++++++++++------------ Cargo.toml | 2 +- src/bin/convert.rs | 6 +++++- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4f15c54..61294c1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -554,8 +554,9 @@ checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" [[package]] name = "fjall" -version = "2.2.0" -source = "git+https://github.com/fjall-rs/fjall?branch=casetest#c79674803c2ac1fc5279edb17d5814e6be8c8d8d" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b96d3979f932eb4a7d42e5cd1c64bc21e42ae5f7f79876decbbf3d213cddf222" dependencies = [ "byteorder", "dashmap", @@ -795,9 +796,9 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.9" +version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41296eb09f183ac68eec06e03cdbea2e759633d4067b2f6552fc2e009bcad08b" +checksum = "df2dcfbe0677734ab2f3ffa7fa7bfd4706bfdc1ef393f2ee30184aed67e631b4" dependencies = [ "bytes", "futures-util", @@ -905,9 +906,9 @@ checksum = "8e9489c2807c139ffd9c1794f4af0ebe86a828db53ecdc7fea2111d0fed085d1" [[package]] name = "libm" -version = "0.2.10" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a00419de735aac21d53b0de5ce2c03bd3627277cf471300f27ebc89f7d828047" +checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" [[package]] name = "linux-raw-sys" @@ -948,8 +949,9 @@ dependencies = [ [[package]] name = "lsm-tree" -version = "2.1.1" -source = "git+https://github.com/fjall-rs/lsm-tree?branch=63#1b15c4d9384d22cfc72d81867cd8e84aa025ce29" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "530c059d339bf7664e38f3483a01c6daaba18a8fcdff1cef6973491ccd45e38b" dependencies = [ "byteorder", "crossbeam-skiplist", @@ -1467,18 +1469,18 @@ checksum = "d369a96f978623eb3dc28807c4852d6cc617fed53da5d3c400feff1ef34a714a" [[package]] name = "serde" -version = "1.0.213" +version = "1.0.214" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ea7893ff5e2466df8d720bb615088341b295f849602c6956047f8f80f0e9bc1" +checksum = "f55c3193aca71c12ad7890f1785d2b73e1b9f63a0bbc353c08ef26fe03fc56b5" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.213" +version = "1.0.214" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e85ad2009c50b58e87caa8cd6dac16bdf511bbfb7af6c33df902396aa480fa5" +checksum = "de523f781f095e28fa605cdce0f8307e451cc0fd14e2eb4cd2e98a355b147766" dependencies = [ "proc-macro2", "quote", diff --git a/Cargo.toml b/Cargo.toml index 77e4264..d1c3c81 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ axum = { version = "0.7", features = ["http2", "query", "tokio", "http1"], defau basic-toml = "*" bincode = "1.3.3" csv = "1" -fjall = { git = "https://github.com/fjall-rs/fjall", branch = "casetest", default-features = false, features = ["bloom", "single_writer_tx", "miniz"] } +fjall = { version = "2.3.1", default-features = false, features = ["bloom", "single_writer_tx", "miniz"] } indexmap = "2" serde = { version = "1", features = ["derive"] } stop-words = "0.8.0" diff --git a/src/bin/convert.rs b/src/bin/convert.rs index c094e48..ed4f22d 100644 --- a/src/bin/convert.rs +++ b/src/bin/convert.rs @@ -2,9 +2,13 @@ use cases::{Case, CONFIG}; use fjall::{Config, KvSeparationOptions, PartitionCreateOptions}; use std::fs; use tracing::info; +use tracing_subscriber::{layer::SubscriberExt as _, util::SubscriberInitExt}; fn main() { - tracing_subscriber::fmt().init(); + tracing_subscriber::registry() + .with(tracing_subscriber::EnvFilter::new("info,fjall=warn")) + .with(tracing_subscriber::fmt::layer()) + .init(); convert(CONFIG.raw_data_path.as_ref().unwrap(), &CONFIG.db); } From 536b82e622fa4a0db42416c0370c61e9f2cb5845 Mon Sep 17 00:00:00 2001 From: cncases <156569830+cncases@users.noreply.github.com> Date: Sat, 2 Nov 2024 06:06:52 +0000 Subject: [PATCH 6/6] update --- config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.toml b/config.toml index e3c2de2..5802311 100644 --- a/config.toml +++ b/config.toml @@ -1,5 +1,5 @@ db = "fjall" # path to store rocksdb -index_path ="./search_index" # path to store index +index_path ="search_index" # path to store index index_with_full_text = false # whether establish full-text index addr = "127.0.0.1:8081" # If allow LAN access, change it to "0.0.0.0:port".