-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathopensubtitles.py
98 lines (81 loc) · 2.96 KB
/
opensubtitles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import re
import glob
import os
import random
import xml.etree.ElementTree as ET
from tqdm import tqdm
import json
laughs = 0
class StreamArray(list):
"""
https://stackoverflow.com/questions/36157634/how-to-incrementally-write-into-a-json-file
Converts a generator into a list object that can be json serialisable
while still retaining the iterative nature of a generator.
IE. It converts it to a list without having to exhaust the generator
and keep it's contents in memory.
"""
def __init__(self, generator):
self.generator = generator
self._len = 1
def __iter__(self):
self._len = 0
for item in self.generator:
yield item
self._len += 1
def __len__(self):
"""
Json parser looks for a this method to confirm whether or not it can
be parsed
"""
return self._len
def normalize_laughs(sentence):
laugh_stems = ["laugh", "chuckl", "giggl", "titter", "scoff"
"tee-hee", "snigger", "snicker", "chortl", "guffaw", "roar"]
regexstr = '(\[|\()\s(\w+\s)*(' + \
"\w*|".join(laugh_stems) + "\w*)(\s\w+\s)*\s(\]|\))"
new_sent, lau = re.subn(regexstr, '<laughter>',
sentence, flags=re.IGNORECASE)
global laughs
laughs += lau
# if lau:
# print(new_sent)
return new_sent, lau
def xml_to_sentences(xml_corpus):
with open(xml_corpus) as f:
parse = ET.parse(f)
sentences = []
total_laughs = 0
for sentence in parse.getroot():
joined_words = " ".join([t.text for t in sentence if t.tag == "w"])
normalized_words, sentence_laughs = normalize_laughs(joined_words)
total_laughs += sentence_laughs
sentences.append(normalized_words)
return sentences, total_laughs
def os_to_json(path, n_speakers=3, shuffle=False):
speakers = list(map(chr, range(65, 65+n_speakers)))
dialogs = []
# total = len(list(glob.iglob(path + '**/*.xml', recursive=True)))
# print(f'{total} xml files in the corpus')
files = glob.iglob(path + '**/*.xml', recursive=True)
if shuffle:
files = list(files)
random.shuffle(files)
i = 0
for filename in tqdm(files, total=446612):
dialog = {}
dialog['id'] = os.path.basename(filename)
dialog['utts'], n_laughs = xml_to_sentences(filename)
dialog['speakers'] = random.choices(speakers, k=len(dialog['utts']))
yield dialog, n_laughs
if __name__ == '__main__':
handle = os_to_json(
'/scratch/DistributionalDiscourse/opensubtitles/OpenSubtitles/xml/en/')
with open('/scratch/DistributionalDiscourse/opensubtitles/opensubtitles.json', 'w') as f:
stream_array = StreamArray(handle)
for chunk in json.JSONEncoder().iterencode(stream_array):
f.write(chunk)
print(f'total laughs: {laughs}')
# Local Variables:
# python-shell-interpreter: "nix-shell"
# python-shell-interpreter-args: "--run python"
# End: