Skip to content

Commit

Permalink
v0.0.8
Browse files Browse the repository at this point in the history
  • Loading branch information
cncases committed Jan 19, 2025
1 parent 9da21d5 commit 2c82c04
Show file tree
Hide file tree
Showing 8 changed files with 401 additions and 285 deletions.
597 changes: 334 additions & 263 deletions Cargo.lock

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,29 +1,29 @@
[package]
name = "cases"
version = "0.0.6"
version = "0.0.8"
edition = "2021"

[dependencies]
askama = { version = "0.12", default-features = false }
axum = { version = "0.7", features = ["http2", "query", "tokio", "http1"], default-features = false }
axum = { version = "0.8.1", features = ["http2", "query", "tokio", "http1"], default-features = false }
basic-toml = "*"
bincode = "1.3.3"
bincode = "2.0.0-rc.3"
csv = "1"
fjall = { version = "2.3.1", default-features = false, features = ["bloom", "single_writer_tx", "miniz"] }
fjall = { version = "2.5.0", default-features = false, features = ["bloom", "single_writer_tx", "miniz"] }
indexmap = "2"
serde = { version = "1", features = ["derive"] }
stop-words = "0.8.0"
tantivy = "0.22.0"
tantivy-jieba = "0.11.0"
tokio = { version = "1", features = ["macros", "rt-multi-thread"] }
tower = "0.5.1"
tower = "0.5.2"
tower-http = { version = "0.6.1", features = ["compression-zstd", "timeout"] }
tracing = { version = "0.1", features = ["release_max_level_info", "max_level_info"] }
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
zip = { version = "2.2.0", default-features = false, features = ["deflate"] }

[dev-dependencies]
jieba-rs = "0.7.0"
jieba-rs = "0.7.1"

[profile.release]
lto = "fat"
Expand Down
8 changes: 4 additions & 4 deletions config.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
db = "fjall" # path to store rocksdb
index_path ="search_index" # path to store index
index_with_full_text = false # whether establish full-text index
addr = "127.0.0.1:8081" # If allow LAN access, change it to "0.0.0.0:port".
db = "fjall" # path to store the fjall database
index_path = "search_index" # path to store index
index_with_full_text = false # whether to build a full-text index
addr = "127.0.0.1:8081" # To allow LAN access, change it to "0.0.0.0:port".

# Path to the raw data you downloaded from the torrent. Do NOT unzip it.
raw_data_path = "裁判文书全量数据(已完成)"
38 changes: 36 additions & 2 deletions src/bin/convert.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use bincode::config::standard;
use cases::{Case, CONFIG};
use fjall::{Config, KvSeparationOptions, PartitionCreateOptions};
use std::fs;
Expand Down Expand Up @@ -32,6 +33,11 @@ fn convert(raw_path: &str, db_path: &str) {
),
)
.unwrap();
let doc_ids = keyspace
.open_partition("doc_ids", PartitionCreateOptions::default())
.unwrap();
let mut no_doc_id = 0;

for subdir in fs::read_dir(raw_path).unwrap() {
let subdir = subdir.unwrap();
let subdir_path = subdir.path().to_str().unwrap().to_string();
Expand All @@ -56,6 +62,13 @@ fn convert(raw_path: &str, db_path: &str) {
}

let mut case: Case = result.unwrap();
// https://wenshu.court.gov.cn/website/wenshu/181107ANFZ0BXSK4/index.html?docId=964fc681687d4e47a0a9ace500096dde
case.doc_id = case
.doc_id
.rsplit_once("=")
.unwrap_or_default()
.1
.to_string();

case.full_text.split_whitespace().for_each(|word| {
buf.push_str("<p>");
Expand All @@ -75,8 +88,16 @@ fn convert(raw_path: &str, db_path: &str) {
batch.insert(
&db,
(*id).to_be_bytes(),
bincode::serialize(case).unwrap(),
bincode::encode_to_vec(case, standard()).unwrap(),
);
let has_full_text = case.full_text.is_empty() as u32;
if !case.doc_id.is_empty() {
let value =
[(*id).to_be_bytes(), has_full_text.to_be_bytes()].concat();
batch.insert(&doc_ids, &case.doc_id, value);
} else {
no_doc_id += 1;
}
}
batch.commit().unwrap();
ft.clear();
Expand All @@ -93,9 +114,22 @@ fn convert(raw_path: &str, db_path: &str) {
info!("inserting {id}, time: {}", time.elapsed().as_secs());
let mut batch = keyspace.batch();
for (id, case) in ft.iter() {
batch.insert(&db, (*id).to_be_bytes(), bincode::serialize(case).unwrap());
batch.insert(
&db,
(*id).to_be_bytes(),
bincode::encode_to_vec(case, standard()).unwrap(),
);
let has_full_text = case.full_text.is_empty() as u32;
if !case.doc_id.is_empty() {
let value = [(*id).to_be_bytes(), has_full_text.to_be_bytes()].concat();
batch.insert(&doc_ids, &case.doc_id, value);
} else {
no_doc_id += 1;
}
}
batch.commit().unwrap();
ft.clear();
}

info!("Done, no url counts {}", no_doc_id);
}
15 changes: 12 additions & 3 deletions src/bin/main.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use axum::{routing::get, Router};
use cases::{case, logo, search, style, AppState, Tan, CONFIG};
use fjall::Config;
use fjall::{Config, KvSeparationOptions, PartitionCreateOptions};
use std::{net::SocketAddr, sync::Arc, time::Duration};
use tokio::net::TcpListener;
use tower::ServiceBuilder;
Expand All @@ -20,7 +20,16 @@ async fn main() {

let keyspace = Config::new(CONFIG.db.as_str()).open().unwrap();
let db = keyspace
.open_partition("cases", Default::default())
.open_partition(
"cases",
PartitionCreateOptions::default()
.max_memtable_size(128_000_000)
.with_kv_separation(
KvSeparationOptions::default()
.separation_threshold(750)
.file_target_size(256_000_000),
),
)
.unwrap();
let app_state = AppState { db, searcher };

Expand All @@ -30,7 +39,7 @@ async fn main() {

let app = Router::new()
.route("/", get(search))
.route("/case/:id", get(case))
.route("/case/{id}", get(case))
.route("/style.css", get(style))
.route("/logo.png", get(logo))
.layer(middleware_stack)
Expand Down
7 changes: 4 additions & 3 deletions src/controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use axum::{
http::{self, header, Response, StatusCode},
response::IntoResponse,
};
use bincode::config::standard;
use indexmap::IndexSet;
use serde::Deserialize;
use tantivy::{
Expand All @@ -24,7 +25,7 @@ pub struct CasePage {

pub async fn case(State(state): State<AppState>, Path(id): Path<u32>) -> impl IntoResponse {
if let Some(v) = state.db.get(id.to_be_bytes()).unwrap() {
let case: Case = bincode::deserialize(&v).unwrap();
let (case, _): (Case, _) = bincode::decode_from_slice(&v, standard()).unwrap();
let case = CasePage { case };
into_response(&case)
} else {
Expand Down Expand Up @@ -93,7 +94,7 @@ pub async fn search(
let mut cases = Vec::with_capacity(ids.len());
for id in ids {
if let Some(v) = state.db.get(id.to_be_bytes()).unwrap() {
let mut case: Case = bincode::deserialize(&v).unwrap();
let (mut case, _): (Case, _) = bincode::decode_from_slice(&v, standard()).unwrap();
case.full_text = case.full_text.replace("<p>", " ").replace("</p>", " ");
cases.push((id, case));
}
Expand Down Expand Up @@ -124,7 +125,7 @@ pub async fn search(
for (id, case) in &cases {
wtr.write_record([
&id.to_string(),
&case.url,
&case.doc_id,
&case.case_id,
&case.case_name,
&case.court,
Expand Down
7 changes: 4 additions & 3 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
use bincode::{Decode, Encode};
pub use config::CONFIG;
pub use controller::{case, logo, search, style};
use fjall::PartitionHandle;
use serde::{Deserialize, Serialize};
use serde::Deserialize;
use std::sync::Arc;
use tantivy::Searcher;
pub use tantivy::Tan;
Expand All @@ -16,10 +17,10 @@ pub struct AppState {
pub searcher: Arc<Searcher>,
}

#[derive(Debug, Deserialize, Serialize)]
#[derive(Debug, Encode, Decode, Deserialize)]
pub struct Case {
#[serde(rename(deserialize = "原始链接"))]
pub url: String,
pub doc_id: String,
#[serde(rename(deserialize = "案号"))]
pub case_id: String,
#[serde(rename(deserialize = "案件名称"))]
Expand Down
2 changes: 1 addition & 1 deletion templates/case.html
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ <h3 class="underlineonhover">{{ case.case_name }}</h3>
<p><b>所属地区</b>:{{ case.region }}</p>
<p><b>当事人</b>:{{ case.parties }}</p>
<p><b>案由</b>:{{ case.cause }}</p>
<p><b>原始链接</b>: <a href="{{case.url}}" target="_blank">{{ case.url }}</a></p>
<p><b>原始链接</b>: <a href="https://md5.caseopen.org/{{case.doc_id}}" target="_blank">{{ case.doc_id }}</a></p>
<p><b>法律依据</b>:{{ case.legal_basis }}</p>
<p></p>
<p></p>
Expand Down

0 comments on commit 2c82c04

Please sign in to comment.