forked from vrandezo/lexicographic_coverage
-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathcorpora-download.py
44 lines (32 loc) · 1.09 KB
/
corpora-download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import gzip
import os.path
import re
import tarfile
import urllib.request
import meta
# Download files
#
# For every configured language, fetch its remote corpus archive into
# meta.corpora_dir unless a file with that name is already present.
for language in meta.languages:
    local_file = meta.corpora_dir + "/" + meta.data[language]["remotefile"]
    if os.path.exists(local_file):
        # An existing file is treated as a completed download.
        continue
    remote_file = meta.data[language]["remoteurl"]
    # Download to a temporary name and rename only on success: if the
    # transfer is interrupted, no truncated archive is left under the final
    # name, where the existence check above would mistake it for a complete
    # download on every later run.
    partial_file = local_file + ".part"
    urllib.request.urlretrieve(remote_file, partial_file)
    os.replace(partial_file, local_file)
    print("Downloaded " + language)
# Convert Uni Leipzig files
#
# For every language whose source is "unileipzig", read the
# "<name>/<name>-sentences.txt" member of the downloaded "<name>.tar.gz"
# archive and write the second tab-separated field of each line to
# "<corpora_dir>/<language>.txt.gz".
for language in meta.languages:
    if meta.data[language]["source"] != "unileipzig":
        continue
    remote_file = meta.data[language]["remotefile"]
    local_file = meta.corpora_dir + "/" + remote_file
    new_local_file = meta.corpora_dir + "/" + language + ".txt.gz"
    if os.path.exists(new_local_file):
        # An existing output is treated as a completed conversion.
        continue
    # Derive the archive member name from the archive file name.
    # Both dots of the ".tar.gz" suffix are escaped so they match literally.
    filename = re.sub(r"^(.*)\.tar\.gz$", r"\1/\1-sentences.txt", remote_file)
    # Write to a temporary name and rename only on success, so an interrupted
    # or failed conversion never leaves a file that the existence check above
    # would mistake for a finished one.
    partial_file = new_local_file + ".part"
    with tarfile.open(local_file, "r:gz") as f_in:
        # Check the archive BEFORE creating any output file; otherwise a
        # missing member would leave behind an empty output that blocks
        # every later attempt for this language.
        if filename not in f_in.getnames():
            print(filename + " not found in archive")
            continue
        sentences = f_in.extractfile(filename)
        with gzip.open(partial_file, "wb") as f_out:
            # extractfile() returns None for members that are not regular
            # files; as before, that case produces an empty output.
            for line in sentences or []:
                fields = line.split(b"\t")
                if len(fields) < 2:
                    # Skip lines without a tab (e.g. a blank trailing line)
                    # instead of aborting the conversion with an IndexError.
                    continue
                f_out.write(fields[1])
    os.replace(partial_file, new_local_file)
    print("Converted " + language)