-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
127 lines (104 loc) · 3.28 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
from pathlib import Path
import typer
from extract import extract_export, extract_enhg_gold, extract_terminals, extract_conlluplus
from pos import compare_pos_tags, replace_pos_tags
from data import get_splits, add_root
from wordpiece import wp_512_check
# Typer application object; every function below decorated with
# ``@app.command()`` is registered on it as a CLI sub-command.
app = typer.Typer()
@app.command()
def extract_mhg_export(
    treebank: Path,
    output_file: str,
):
    """Read the export.txt file generated from the MHG treebank and convert
    it to PTB style.

    Args:
        treebank: path to the export-format treebank file
        output_file: output location, passed through to ``extract_export``

    Raises:
        typer.BadParameter: if ``treebank`` does not exist
    """
    # Explicit check rather than ``assert``: assertions are stripped under
    # ``python -O`` and otherwise surface as a bare AssertionError traceback
    # instead of a readable CLI error.
    if not treebank.exists():
        raise typer.BadParameter(
            f"file does not exist: {treebank}", param_hint="treebank"
        )
    extract_export(treebank, output_file)
@app.command()
def extract_enhg(
    treebank: Path,
    output_file: str,
):
    """Read a gold annotated ENHG treebank and convert it to preprocessed,
    single line PTB formatted trees.

    Args:
        treebank: path to the gold annotated ENHG treebank
        output_file: output location, passed through to ``extract_enhg_gold``

    Raises:
        typer.BadParameter: if ``treebank`` does not exist
    """
    # Explicit check rather than ``assert``: assertions are stripped under
    # ``python -O`` and give no useful message to a CLI user.
    if not treebank.exists():
        raise typer.BadParameter(
            f"file does not exist: {treebank}", param_hint="treebank"
        )
    extract_enhg_gold(treebank, output_file)
@app.command()
def convert_conlluplus(
    treebank: Path,
    morph: bool = typer.Option(False, "--morph", help="if to attach morph and lemma"),
):
    """Convert a conlluplus file into single line PTB formatted trees.

    Note that the result is a 100% flat tree since no syntactic information
    is provided.

    Args:
        treebank: path to the conlluplus file
        morph: whether to attach morph and lemma information

    Raises:
        typer.BadParameter: if ``treebank`` does not exist
    """
    # Explicit check rather than ``assert``: assertions are stripped under
    # ``python -O`` and give no useful message to a CLI user.
    if not treebank.exists():
        raise typer.BadParameter(
            f"file does not exist: {treebank}", param_hint="treebank"
        )
    extract_conlluplus(treebank, morph)
@app.command()
def wordpiece_512check(
    sentences: Path,
    tokenizer: str,
):
    """Read a PTB treebank and a tokenizer, then calculate the number of
    wordpieces to see which sentences are > 512.

    Args:
        sentences: path to the PTB treebank file
        tokenizer: tokenizer identifier, e.g. dbmdz/bert-base-german-cased

    Raises:
        typer.BadParameter: if ``sentences`` does not exist
    """
    # Explicit check rather than ``assert``: assertions are stripped under
    # ``python -O`` and give no useful message to a CLI user.
    if not sentences.exists():
        raise typer.BadParameter(
            f"file does not exist: {sentences}", param_hint="sentences"
        )
    wp_512_check(sentences, tokenizer)
@app.command()
def replace_pos(
    treebank1: Path,
    treebank2: Path,
):
    """Replace the POS tags of one treebank with those of the other treebank.

    Args:
        treebank1: path to the first treebank
        treebank2: path to the second treebank

    Raises:
        typer.BadParameter: if either treebank file does not exist
    """
    # NOTE(review): which treebank supplies the tags and which one receives
    # them is decided inside replace_pos_tags - confirm the argument order.
    # Explicit checks rather than ``assert``: assertions are stripped under
    # ``python -O`` and do not tell the user which path was wrong.
    for param_name, path in (("treebank1", treebank1), ("treebank2", treebank2)):
        if not path.exists():
            raise typer.BadParameter(
                f"file does not exist: {path}", param_hint=param_name
            )
    replace_pos_tags(treebank1, treebank2)
@app.command()
def compare_pos(
    treebank1: Path,
    treebank2: Path,
):
    """Compare the POS accuracy of one treebank against the other.

    The output file (pos_acc.txt) is currently hardcoded.

    Args:
        treebank1: path to the first treebank
        treebank2: path to the second treebank

    Raises:
        typer.BadParameter: if either treebank file does not exist
    """
    # Explicit checks rather than ``assert``: assertions are stripped under
    # ``python -O`` and do not tell the user which path was wrong.
    for param_name, path in (("treebank1", treebank1), ("treebank2", treebank2)):
        if not path.exists():
            raise typer.BadParameter(
                f"file does not exist: {path}", param_hint=param_name
            )
    compare_pos_tags(treebank1, treebank2)
@app.command()
def extract_treebank_terminals(
    treebank: Path,
):
    """Extract the terminals of a treebank.

    Args:
        treebank: path to the treebank file

    Raises:
        typer.BadParameter: if ``treebank`` does not exist
    """
    # NOTE(review): the previous help text was copied from compare-pos and
    # described POS accuracy / pos_acc.txt, which this command does not
    # touch. Where extract_terminals writes its result is not visible
    # here - confirm and document.
    # Explicit check rather than ``assert``: assertions are stripped under
    # ``python -O`` and give no useful message to a CLI user.
    if not treebank.exists():
        raise typer.BadParameter(
            f"file does not exist: {treebank}", param_hint="treebank"
        )
    extract_terminals(treebank)
@app.command()
def add_treebank_root(
    treebank: Path,
):
    """Add a root to the trees of a treebank.

    Args:
        treebank: path to the treebank file

    Raises:
        typer.BadParameter: if ``treebank`` does not exist
    """
    # NOTE(review): the previous help text was copied from compare-pos and
    # described POS accuracy / pos_acc.txt, which this command does not
    # touch. The exact root label and output location are decided by
    # add_root in data.py - confirm and document.
    # Explicit check rather than ``assert``: assertions are stripped under
    # ``python -O`` and give no useful message to a CLI user.
    if not treebank.exists():
        raise typer.BadParameter(
            f"file does not exist: {treebank}", param_hint="treebank"
        )
    add_root(treebank)
@app.command()
def make_splits(
    treebank: Path,
):
    """Create train, dev, and test splits for a treebank.

    Currently the splits are hard coded and should be changed in get_splits
    in data.py (randomization is default).

    todo: option for random, option for split inputs

    Args:
        treebank: path to treebank file

    Returns:
        Train, dev, and test splits for a treebank

    Raises:
        typer.BadParameter: if ``treebank`` does not exist
    """
    # NOTE(review): the previous opening sentence was copied from
    # extract-mhg-export (export.txt -> PTB) and did not describe this
    # command. Nothing is returned to the caller here; the splits are
    # produced inside get_splits.
    # Explicit check rather than ``assert``: assertions are stripped under
    # ``python -O`` and give no useful message to a CLI user.
    if not treebank.exists():
        raise typer.BadParameter(
            f"file does not exist: {treebank}", param_hint="treebank"
        )
    get_splits(treebank)
if __name__ == "__main__":
    # Run the Typer app only when executed as a script, so importing this
    # module does not trigger command-line parsing.
    app()