forked from vrandezo/lexicographic_coverage
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathdump-parse.py
56 lines (43 loc) · 1.16 KB
/
dump-parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import gzip
import json
import sys
import meta
# Extract form representations from a gzipped, line-delimited lexeme JSON dump.
#
# For every language code in meta.languages a file
# "<meta.output_dir>/formlist-<language>.txt" is written, one form
# representation per line.  A lexeme is used when its "language" value is a
# key of meta.mapq and the mapped code is listed in meta.languages.

count = 0       # lexeme records parsed from the dump
dictcount = 0   # lexemes whose language is one of the requested languages
errorcount = 0  # forms whose representations could not be read
outputs = {}    # language code -> open output file

try:
    for language in meta.languages:
        filename = meta.output_dir + "/" + "formlist-" + language + ".txt"
        # Explicit UTF-8: representations are arbitrary Unicode text and must
        # not depend on the platform's default locale encoding.
        outputs[language] = open(filename, "w", encoding="utf-8")

    try:
        fh = gzip.open(meta.local_dump_file)
    except OSError:
        print("Couldn't read {}".format(meta.local_dump_file))
        sys.exit(1)

    with fh:
        for raw_line in fh:
            line = raw_line.decode("utf-8").strip()
            # Not long enough to contain lexeme data (e.g. a lone bracket or
            # an empty line); such lines are not counted as lexemes.
            if len(line) < 2:
                continue
            count += 1
            # Remove trailing comma
            if line.endswith(","):
                line = line[:-1]
            lexeme = json.loads(line)

            language_id = lexeme["language"]
            if language_id not in meta.mapq:
                continue
            target = meta.mapq[language_id]
            if target not in meta.languages:
                continue

            dictcount += 1
            out = outputs[target]
            for form in lexeme["forms"]:
                try:
                    representations = form["representations"]
                    for lcode in representations:
                        out.write(representations[lcode]["value"] + "\n")
                except (KeyError, TypeError):
                    # Malformed form entry: report it and carry on with the
                    # next form.  I/O errors are deliberately not caught here.
                    errorcount += 1
                    print(errorcount)
                    print(lexeme["id"])
                    print(lexeme["lemmas"])
                    print("")
finally:
    # Close every output that was opened, even when parsing aborts early
    # (including the sys.exit above).
    for handle in outputs.values():
        handle.close()

print("{:,} Lexemes total, {:,} used".format(count, dictcount))