import json
import math
import os
import re
import string
import time
import nltk
import numpy
import threading
import psutil
import random
import pathlib
import pickle
from langdetect import detect
process = psutil.Process(os.getpid())
starting_mem = process.memory_info().rss
def printmem():
    mem = process.memory_info().rss - starting_mem
    print(f" bytes: {mem:,}")
def detect_english(doc):
    # Default language is English (most docs are)
    # All we care about are the 'en' ones
    language = 'en'
    try:
        if len(doc) > 25:
            language = detect(' '.join(doc[:25]))
        elif len(doc) > 0:
            language = detect(' '.join(doc))
    except Exception:
        # If we cannot detect the language, drop the doc
        return False
    return language == 'en'
def inverse_document_frequency(term, corpus):
    # Count how many documents the term appears in, then take log2(N / document frequency)
    num_occurring = 0
    for doc in corpus:
        if term in doc:
            num_occurring += 1
    return math.log2(len(corpus)/num_occurring)
def tfidf(word_id, vocabulary, doc, corpus_length):
    # idf uses the document frequency stored in vocabulary['words'][word_id][1]
    idf = math.log2(corpus_length/vocabulary['words'][word_id][1])
    # term frequency is the word's count normalized by the doc's most frequent word count
    term_frequency = doc[1][word_id] / doc[2]
    return term_frequency * idf
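# Worked example of the formula above (illustrative numbers, not from the corpus):
# with corpus_length = 1000, a word appearing in 10 docs has idf = log2(1000/10) ≈ 6.64;
# if it occurs 3 times in a doc whose most frequent word occurs 12 times,
# tf = 3/12 = 0.25, so tfidf ≈ 0.25 * 6.64 ≈ 1.66.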
timings = {
    'load': 0.0,
    'lowercase': 0.0,
    'punctuation': 0.0,
    'tokenize': 0.0,
    'stopwords': 0.0,
    'stemming': 0.0,
    'vocab': 0.0,
    'openfile': 0.0,
    'detect_english': 0.0,
    'is_number': 0.0,
}
stopwords = set(nltk.corpus.stopwords.words('english'))
def prep_doc(filepath, vocab):
    t1 = time.time()
    with open(filepath) as file:
        timings['openfile'] += time.time()-t1
        t1 = time.time()
        doc = json.load(file)
        timings['load'] += time.time()-t1
    # get text and lowercase it, then combine
    t1 = time.time()
    text = ' '.join([chunk['text'].lower() for chunk in doc['body_text']])
    timings['lowercase'] += time.time()-t1
    # remove punctuation
    t1 = time.time()
    regex = re.compile(f"[{re.escape(string.punctuation)}]")
    text = regex.sub('', text)
    timings['punctuation'] += time.time()-t1
    # tokenize and remove empty words
    t1 = time.time()
    text = text.split(' ')
    text = [word for word in text if word != '']
    timings['tokenize'] += time.time()-t1
    # Check language - if not English, do not include it
    t1 = time.time()
    if not detect_english(text):
        return
    timings['detect_english'] += time.time()-t1
    # remove stopwords
    t1 = time.time()
    text = [word for word in text if word not in stopwords]
    timings['stopwords'] += time.time()-t1
    # remove numbers (digits, hyphens, and unicode minus signs)
    t1 = time.time()
    regex = re.compile("^[0-9-−]+$")
    text = [word for word in text if not regex.match(word)]
    timings['is_number'] += time.time()-t1
    # Lemmatization
    t1 = time.time()
    lemmatizer = nltk.wordnet.WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in text]
    timings['stemming'] += time.time()-t1
    t1 = time.time()
    doc_words = {}
    for word in text:
        word_id = None
        # add or find in vocab
        if word in vocab['index']:
            word_id = vocab['index'][word]
        else:
            # new entry
            word_id = len(vocab['index'])
            vocab['index'][word] = word_id
            vocab['words'][word_id] = [word, 0]
        # create or increment word in this doc's word list
        # later the count will be replaced by tfidf
        if word_id in doc_words:
            doc_words[word_id] += 1
        else:
            doc_words[word_id] = 1
    # update corpus frequencies - each unique word in this doc adds 1
    for word in set(text):
        word_id = vocab['index'][word]
        vocab['words'][word_id][1] = vocab['words'][word_id][1] + 1
    timings['vocab'] += time.time()-t1
    # max_freq is stored so we don't have to recalculate it for the tfidf of every word
    try:
        max_freq = max(doc_words.values())
    except ValueError:
        # doc_words was empty, so there is nothing usable in this doc
        print(f"Problem with doc: {filepath}")
        return None
    return [os.path.basename(filepath), doc_words, max_freq]
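# prep_doc expects a vocab shaped like
#   {'index': {word: word_id}, 'words': {word_id: [word, document_frequency]}}
# and mutates it in place. It returns [filename, {word_id: count}, max_count],
# or None when the document is not English. A minimal call might look like
# (the path is illustrative):
#   vocab = {'index': {}, 'words': {}}
#   doc = prep_doc('some_paper.json', vocab)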
def remove_low_frequency_words(vocab, corpus, min_frequency=2):
    removed_words = set()
    removed_word_ids = set()
    for word_id in vocab['words'].copy():
        word, freq = vocab['words'][word_id]
        if freq < min_frequency:
            removed_words.add(word)
            removed_word_ids.add(word_id)
            del vocab['words'][word_id]
            del vocab['index'][word]
    for doc in corpus:
        words_to_remove = removed_word_ids.intersection(set(doc[1]))
        for word_id in words_to_remove:
            del doc[1][word_id]
    return vocab, corpus, removed_words
def get_n_docs(n, vocabulary):
    dirpath = '../input/CORD-19-research-challenge/document_parses/pdf_json'
    corpus = []
    i = 0
    stats = {
        'last_i': 0,
        'last_time': time.time(),
    }
    run_status_checker = True
    def status_checker():
        while run_status_checker:
            completed_per_sec = (i - stats['last_i']) / (time.time() - stats['last_time'])
            stats['last_i'] = i
            stats['last_time'] = time.time()
            print(f"{i} / {n} ({round(completed_per_sec, 1)}/sec)")
            printmem()
            time.sleep(1)
    threading.Thread(target=status_checker, daemon=True).start()
    for path in os.listdir(dirpath):
        i += 1
        if i > n:
            break
        corpus.append(prep_doc(os.path.join(dirpath, path), vocabulary))
    run_status_checker = False
    # Drop all NoneType docs (occurs if not English)
    corpus = [x for x in corpus if x is not None]
    return corpus, vocabulary
def save_project(vocab, corpus, output_folder_path):
    corpus_sub_folder = os.path.join(output_folder_path, 'corpus')
    pathlib.Path(corpus_sub_folder).mkdir(parents=True, exist_ok=True)
    for doc in corpus:
        filename = doc[0] + ".pickle"
        file_path = os.path.join(corpus_sub_folder, filename)
        with open(file_path, 'wb') as out_file:
            pickle.dump(doc, out_file)
    vocab_file_name = 'vocab.pickle'
    vocab_file_path = os.path.join(output_folder_path, vocab_file_name)
    with open(vocab_file_path, 'wb') as out_file:
        pickle.dump(vocab, out_file)
def load_project(input_folder_path, num_docs=None, random_files=False):
    """ num_docs won't make sense for tfidf, since vocab IDF was based on whole corpus """
    corpus_path = os.path.join(input_folder_path, 'corpus')
    corpus = []
    i = 0
    filenames = os.listdir(corpus_path)
    if random_files and num_docs:
        filenames = random.sample(filenames, num_docs)
    for filename in filenames:
        i += 1
        if num_docs is not None and i > num_docs:
            break
        with open(os.path.join(corpus_path, filename), 'rb') as in_file:
            corpus.append(pickle.load(in_file))
    vocab_file_name = 'vocab.pickle'
    vocab_file_path = os.path.join(input_folder_path, vocab_file_name)
    with open(vocab_file_path, 'rb') as in_file:
        vocab = pickle.load(in_file)
    return vocab, corpus
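# Example call (the folder path and counts are illustrative):
#   vocab, corpus = load_project('./project_output', num_docs=500, random_files=True)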
def doc_tfidf(doc, vocabulary, corpus_length):
    # Replace each word count in the doc with its tfidf score, in place
    doc_words = doc[1]
    for word_id in doc_words:
        doc_words[word_id] = tfidf(word_id, vocabulary, doc, corpus_length)
    return doc
def peek_vocab(vocabulary):
    print('--- First 5 vocabulary entries ---')
    i = 0
    for word_id in vocabulary['words']:
        i += 1
        if i > 5:
            break
        print(vocabulary['words'][word_id])
    print('--- Last 5 vocabulary entries ---')
    i = 0
    for word_id in vocabulary['words']:
        i += 1
        if i < len(vocabulary['words'])-4:
            continue
        print(vocabulary['words'][word_id])
def peek_doc(doc, vocabulary):
    print('--- Peeking at document ---')
    print(f"filename: {doc[0]}")
    print('--- First 5 unique words in doc ---')
    i = 0
    for word_id in doc[1]:
        i += 1
        if i > 5:
            break
        actualword = vocabulary['words'][word_id][0]
        print(f"'{actualword}', {doc[1][word_id]}")
    print('--- Last 5 unique words in doc ---')
    i = 0
    for word_id in doc[1]:
        i += 1
        if i < len(doc[1])-4:
            continue
        actualword = vocabulary['words'][word_id][0]
        print(f"'{actualword}', {doc[1][word_id]}")
def vectorize(doc, vocab_length):
    # Inflate doc to vocabulary length
    doc_vector = []
    for i in range(vocab_length):
        if i in doc[1]:
            # If doc contains word in vocab, save its tfidf
            doc_vector.append(doc[1][i])
        else:
            # Otherwise add a 0 for that word
            doc_vector.append(0)
    # Return vector of vocabulary length
    return doc_vector
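# Illustrative example: with vocab_length = 4, a doc whose word dict (doc[1]) is
# {1: 0.5, 3: 0.2} vectorizes to [0, 0.5, 0, 0.2].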
def get_vectors(n, dimension):
    # These are our hash functions
    vectors = []
    for i in range(n):
        vector = []
        possibilities = [-1, 1]
        # Each sketch is a vector of only +1's and -1's
        for j in range(dimension):
            vector.append(possibilities[random.randint(0,1)])
        vectors.append(vector)
    return vectors
def signature(doc, vectors, a, vocab_length):
    def project_on_(u,v):
        # This is the scalar projection
        # Returns the amount (length) of u that is in the direction of v
        return numpy.dot(u,v) / numpy.linalg.norm(v)
    doc_vector = vectorize(doc, vocab_length)
    signature = []
    for vector in vectors:
        # Get the amount of doc_vector in the direction of sketch
        projection = project_on_(doc_vector, vector)
        # Find which bin the projection lands in based on parameter 'a'
        hash_bin = math.ceil(projection / a)
        signature.append(hash_bin)
    return signature
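# Illustrative example of the binning above (numbers are made up): with bin width
# a = 2.0, a scalar projection of 3.1 lands in bin ceil(3.1 / 2.0) = 2, while a
# projection of -0.5 lands in bin ceil(-0.5 / 2.0) = 0.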
def corpus_signatures_LSH(corpus, sketches, a, r, vocab_length):
    # Combines signature generation and LSH steps
    def LSH(doc_sig, r):
        bands_hashes = []
        # Split the signature into b non-overlapping bands of r values each
        b = len(doc_sig) // r
        for i in range(b):
            band = doc_sig[i*r:(i+1)*r]
            # Use the built-in Python hash function to hash each band
            # We can change this as needed
            bands_hashes.append(hash(tuple(band)))
        return bands_hashes
    corpus_new = []
    for i in range(len(corpus)):
        doc = corpus[i]
        doc_id = doc[0]
        # Get doc signature using vector projections
        doc_sig = signature(doc, sketches, a, vocab_length)
        # Then do LSH on signatures with b bands to get hashes
        doc_LSH = LSH(doc_sig, r)
        corpus_new.append((doc_id, doc_LSH))
    return corpus_new
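# For example, with 16 projection vectors and r = 4, each document signature has
# length 16 and is split into 16 // 4 = 4 bands, so each doc gets 4 band hashes
# and LSH_length in find_candidates below would be 4.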
def find_candidates(corpus, LSH_length):
    candidates = {}
    # Look at all pairs of docs
    for i in range(len(corpus)):
        doc1 = corpus[i]
        doc1_matches = []
        for j in range(len(corpus)):
            if j != i:
                doc2 = corpus[j]
                # Find if doc1 and doc2 have same hash value for any band
                for k in range(LSH_length):
                    if doc1[1][k] == doc2[1][k]:
                        #doc1_matches.append(doc2[0])
                        doc1_matches.append(j)
        #candidates[doc1[0]] = doc1_matches
        candidates[i] = doc1_matches
    return candidates
def group_candidates(candidates):
    # Find groups (clusters) of documents that are all similar to each other
    grouped_candidates = {}
    group_id = 0
    # Once we consider a doc, we do not want to consider it again
    # A document can only belong to one group (cluster)
    # After we have included a document in a cluster, add it to the exclusion_list so it won't be considered for future clusters
    exclusion_list = []
    for doc1 in list(candidates.keys()):
        if doc1 not in exclusion_list:
            docs_similar_to_doc1 = candidates[doc1]
            to_check = [x for x in docs_similar_to_doc1 if x not in exclusion_list]
            doc1_group = [doc1]
            # Check whether doc_i, which is similar to doc1, is also similar to the other docs in the cluster
            for doc_i in to_check:
                for clustered_doc in doc1_group:
                    # If these documents appear in each other's similarity lists, add doc_i to the group
                    if doc_i in candidates[clustered_doc]:
                        doc1_group.append(doc_i)
                        # Remove those documents from the pool
                        exclusion_list.append(doc_i)
            grouped_candidates[group_id] = list(set(doc1_group))
            exclusion_list.append(doc1)
            group_id += 1
    return grouped_candidates
def top_group_words(grouped_candidates, corpus, vocabulary):
    new_group_obj = []
    # A word can only be associated with one cluster - first come, first served
    word_exclusion_list = []
    for group_num in grouped_candidates:
        group_docs = grouped_candidates[group_num]
        group_words = []
        for doc in group_docs:
            # Add the largest tfidf word from each doc that is not already used, up to 10 words
            if len(group_words) < 10:
                added = False
                i = -1
                while not added:
                    largest_tfidf_word = vocabulary['words'][sorted(corpus[doc][1], key=corpus[doc][1].get)[i]][0]
                    if largest_tfidf_word not in word_exclusion_list:
                        group_words.append(largest_tfidf_word)
                        word_exclusion_list.append(largest_tfidf_word)
                        added = True
                    else:
                        i -= 1
        new_group_obj.append((group_docs, group_words))
    return new_group_obj
def get_doc_title_from_filename(filename, docs_dir):
    path = os.path.join(docs_dir, filename)
    with open(path) as in_file:
        doc = json.load(in_file)
    return doc['metadata']['title']
def lookup_word(word_id, vocabulary):
    return vocabulary['words'][word_id][0]
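# Minimal end-to-end usage sketch (not part of the original module). The document
# count, number of projection vectors, bin width 'a', band size 'r', and output
# path below are illustrative assumptions, not values taken from this code.
if __name__ == '__main__':
    vocabulary = {'index': {}, 'words': {}}
    corpus, vocabulary = get_n_docs(100, vocabulary)
    vocabulary, corpus, removed = remove_low_frequency_words(vocabulary, corpus)
    # Replace raw counts with tfidf scores
    corpus = [doc_tfidf(doc, vocabulary, len(corpus)) for doc in corpus]
    save_project(vocabulary, corpus, './project_output')
    # Word ids are not re-packed after low-frequency removal, so size vectors by the largest id
    vocab_length = max(vocabulary['words']) + 1
    vectors = get_vectors(16, vocab_length)
    signatures = corpus_signatures_LSH(corpus, vectors, 1.0, 4, vocab_length)
    candidates = find_candidates(signatures, 16 // 4)
    groups = group_candidates(candidates)
    for group_docs, group_words in top_group_words(groups, corpus, vocabulary):
        print(group_docs, group_words)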