-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathprepareData.py
61 lines (49 loc) · 1.55 KB
/
prepareData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/python3
import argparse
import sys, os
import nltk
import json
# Prepare labelled documents for summarization.
#
# Every file in --data_path is read line by line.  Each line is expected to
# look like "<sentence>\t<label>".  Sentences are grouped per label, tokenized
# and POS-tagged with NLTK, and the result for all files is written as one
# JSON document named prepared_data.json inside --prep_path.

# Column separator of the input files.
SEPARATOR = "\t"
# Characters trimmed from both ends of a field or document name.
STRIP_CHARS = ' \t\n\r'
# Fixed phrase removed from sentence text (presumably page-footer residue in
# the source corpus -- TODO confirm with the data owner).
BOILERPLATE = "sino noindex makedatabase footer start url"
# Name of the JSON file created inside --prep_path.
OUTPUT_FILENAME = 'prepared_data.json'


def parse_args(argv=None):
    """Parse the command-line options.

    Args:
        argv: List of argument strings; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        The argparse namespace with ``data_path`` and ``prep_path``.
    """
    parser = argparse.ArgumentParser()
    # Both options are required: without them the script cannot do anything
    # useful and previously failed later with an opaque TypeError.
    parser.add_argument('--data_path', type=str, required=True,
                        help='Folder containing documents to be summarized')
    parser.add_argument('--prep_path', type=str, required=True,
                        help='Path to store the prepared data in json format')
    return parser.parse_args(argv)


def group_sentences_by_label(lines):
    """Group the sentence text of tab-separated lines by their label.

    The first column is the sentence, the second column is the label; any
    further columns are ignored.  Lines without a second column and lines
    whose sentence is empty after cleaning are skipped.

    Args:
        lines: Iterable of raw text lines (for example an open text file).

    Returns:
        A dict mapping each label to the list of its sentences, in input
        order.
    """
    grouped = {}
    for line_no, line in enumerate(lines, start=1):
        fields = line.split(SEPARATOR)
        if len(fields) < 2:
            # Best effort: report the malformed line and carry on.
            print('Skipping line {} (no label column): {!r}'.format(line_no, line))
            continue
        label = fields[1].strip(STRIP_CHARS)
        # The phrase is removed after trimming and the result is not trimmed
        # again, so surrounding text keeps its inner whitespace.
        text = fields[0].strip(STRIP_CHARS).replace(BOILERPLATE, "")
        if text:
            grouped.setdefault(label, []).append(text)
    return grouped


def annotate_sentences(sentences):
    """Tokenize and POS-tag each sentence with NLTK.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list of ``(sentence, tokens, pos_tags)`` tuples, one per sentence.
    """
    annotated = []
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        annotated.append((sentence, tokens, nltk.pos_tag(tokens)))
    return annotated


def prepare_file(path):
    """Read one labelled document and annotate its sentences.

    Args:
        path: Path of the tab-separated input file.

    Returns:
        A dict mapping each label to its list of annotated sentences.
    """
    # The context manager closes the handle even if reading fails.  The
    # platform default text encoding is used for reading.
    with open(path, 'r') as fp:
        grouped = group_sentences_by_label(fp)
    return {label: annotate_sentences(sentences)
            for label, sentences in grouped.items()}


def main(argv=None):
    """Prepare every document in the data folder and write the JSON output.

    Args:
        argv: Optional argument list; ``None`` uses the process arguments.
    """
    args = parse_args(argv)
    prepared = {}
    # Sorted so that the output is reproducible across runs and platforms.
    for name in sorted(os.listdir(args.data_path)):
        path = os.path.join(args.data_path, name)
        if not os.path.isfile(path):
            # Sub-directories cannot be opened as documents.
            print('Skipping {} (not a regular file)'.format(name))
            continue
        # The document key is everything before the first '.txt' in the name.
        doc_id = name.split('.txt')[0].strip(STRIP_CHARS)
        prepared[doc_id] = prepare_file(path)
        print('Complete {}'.format(name))
    # os.path.join inserts the separator when --prep_path lacks a trailing
    # one; plain concatenation fused directory name and file name.
    out_path = os.path.join(args.prep_path, OUTPUT_FILENAME)
    with open(out_path, 'w') as legal_f:
        # The annotation tuples are serialised as JSON arrays.
        json.dump(prepared, legal_f, indent=4)


if __name__ == '__main__':
    main()