-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathdvc.yaml
70 lines (60 loc) · 2.48 KB
/
dvc.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# Template variables; stages reference them with ${...} interpolation.
vars:
  - s2orc:
      # Location of the prefiltered parquet, used as ${s2orc.prefiltered_small}.
      prefiltered_small: 'data/s2orc/s2orc_ai_prefiltered.parquet'
# Pipeline stages. Long commands are written as folded scalars (`>-`), which
# YAML joins with single spaces into one command line.
stages:
  # ---- small AI subset ----

  # Runs the prepare script. Declared input is the metadata directory and the
  # declared output is the parquet named by ${s2orc.prefiltered_small}.
  # NOTE(review): the command passes no paths, so the script presumably
  # resolves its own input/output locations - confirm they match deps/outs.
  prepare-s2orc:
    cmd: python scripts/s2orc/prepare_parquet_s2orc.py
    deps:
      - mars/s2orc/metadata/ai/
      - scripts/s2orc/prepare_parquet_s2orc.py
    outs:
      - ${s2orc.prefiltered_small}

  # Passes the prefiltered parquet and a joblib output path to the spaCy
  # processing script, selecting the en_core_web_sm model.
  process-spacy-s2orc:
    cmd: >-
      python scripts/s2orc/process_spacy_docs.py
      ${s2orc.prefiltered_small}
      data/s2orc/small_ai_spacy_docs.joblib
      --spacy-model-name en_core_web_sm
    deps:
      - ${s2orc.prefiltered_small}
      - scripts/s2orc/process_spacy_docs.py
    outs:
      - data/s2orc/small_ai_spacy_docs.joblib

  # Consumes the joblib file produced by process-spacy-s2orc and writes the
  # extracted-phrases parquet.
  extract-phrases-small-s2orc:
    cmd: >-
      python scripts/s2orc/extract_phrases.py
      data/s2orc/small_ai_spacy_docs.joblib
      data/s2orc/extracted_phrases.parquet
    deps:
      - scripts/s2orc/extract_phrases.py
      - data/s2orc/small_ai_spacy_docs.joblib
    outs:
      - data/s2orc/extracted_phrases.parquet

  # Consumes the extracted-phrases parquet and writes a JSON file under
  # data/s2orc/embeddings/, with --sentences-embedding set to all-MiniLM-L6-v2.
  # NOTE(review): "embedd_noun_chunkgs.py" looks misspelled, but it has to
  # match the script's real filename on disk - confirm before renaming.
  embedd-noun-chunks-small-s2orc:
    cmd: >-
      python scripts/s2orc/embedd_noun_chunkgs.py
      data/s2orc/extracted_phrases.parquet
      data/s2orc/embeddings/small_ai_all-MiniLM-L6-v2.json
      --sentences-embedding="all-MiniLM-L6-v2"
    deps:
      - scripts/s2orc/embedd_noun_chunkgs.py
      - data/s2orc/extracted_phrases.parquet
    outs:
      - data/s2orc/embeddings/small_ai_all-MiniLM-L6-v2.json

  # Disabled draft stage, kept for reference; its deps are still a TODO.
  # cluster-small-s2orc:
  #   cmd: python scripts/s2orc/cluster.py data/s2orc/embeddings/small_ai_all-MiniLM-L6-v2.json data/s2orc/cluterings/ sbert1.parquet
  #   deps:
  #     - #TODO

  # ---- big-ai-s2orc ----
  # data/s2orc/big_ai_dataset.parquet is not listed in the outs of any stage
  # in this file, so it is an external input to the stages below.

  # spaCy processing of the big dataset with the en_core_web_md model.
  process-spacy-docs-big-s2orc:
    cmd: >-
      python scripts/s2orc/process_spacy_docs.py
      data/s2orc/big_ai_dataset.parquet
      data/s2orc/big_ai_spacy_docs.joblib
      --spacy-model-name en_core_web_md
    deps:
      - scripts/s2orc/process_spacy_docs.py
      - data/s2orc/big_ai_dataset.parquet
    outs:
      - data/s2orc/big_ai_spacy_docs.joblib

  # Same script and input as the stage above, but with the en_core_web_sm
  # model and a separate output file.
  process-spacy-sm-docs-big-s2orc:
    cmd: >-
      python scripts/s2orc/process_spacy_docs.py
      data/s2orc/big_ai_dataset.parquet
      data/s2orc/big_ai_spacy_sm_docs.joblib
      --spacy-model-name en_core_web_sm
    deps:
      - scripts/s2orc/process_spacy_docs.py
      - data/s2orc/big_ai_dataset.parquet
    outs:
      - data/s2orc/big_ai_spacy_sm_docs.joblib

  # Consumes the en_core_web_md output (big_ai_spacy_docs.joblib), not the
  # sm variant, and writes the noun-chunks parquet.
  extract-noun-chunks-big-s2orc:
    cmd: >-
      python scripts/s2orc/extract_phrases.py
      data/s2orc/big_ai_spacy_docs.joblib
      data/s2orc/big_ai_with_noun_chunks.parquet
    deps:
      - scripts/s2orc/extract_phrases.py
      - data/s2orc/big_ai_spacy_docs.joblib
    outs:
      - data/s2orc/big_ai_with_noun_chunks.parquet

  # Disabled draft stage; the command was never completed.
  # embedd-noun-chunks-big-s2orc:
  #   cmd: python