-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtfidf2.py
126 lines (100 loc) · 4.15 KB
/
tfidf2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""
Computes TFIDF cosine similarity
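Add each document with `addDocument(docname, list_of_words)`, where `list_of_words` is the list of words in that document. Once all documents are added, call `prep()` to compute the TFIDF weights. Then get a dict of `{docname: similarity_score}` pairs by calling `similarities_by_name(queryName)` (the query is a document already in the corpus) or `similarities_by_wordlist(list_of_words)` (the query is a new list of words).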
"""
import math
import numpy as np
class tfidf2:
    '''
    TF-IDF corpus with cosine-similarity queries.

    Call addDocument() once per document, then prep() to turn the stored
    term frequencies into tf-idf weights, then one of the similarities_*
    methods.

    Attributes:
        documents:   {doc_name: {word: weight}}; weights are term frequencies
                     until prep() is called and tf-idf values afterwards.
        corpus_dict: {word: df}, the number of documents containing the word.
        idf:         {word: idf}, filled in by prep().
        num_docs:    number of documents seen by the last prep() call.
        prepStatus:  True once prep() has run on the current set of documents.
    '''

    def __init__(self):
        self.documents = {}
        self.corpus_dict = {}
        self.idf = {}
        self.num_docs = 0
        self.prepStatus = False
        # Raw term frequencies, kept apart from `documents` so that prep()
        # can be re-run without multiplying by idf a second time.
        self._tf = {}

    @staticmethod
    def _term_frequencies(list_of_words):
        '''Return {word: count / len(list_of_words)} for a list of words.'''
        counts = {}
        for w in list_of_words:
            counts[w] = counts.get(w, 0.0) + 1.0
        total = float(len(list_of_words))
        # An empty word list gives an empty dict, so no division by zero.
        return dict((w, c / total) for w, c in counts.items())

    @staticmethod
    def _norm(weights):
        '''Return the Euclidean norm of the values of a {word: weight} dict.'''
        return math.sqrt(sum(v * v for v in weights.values()))

    def _cosine_scores(self, query_dict):
        '''Return {doc_name: cosine similarity of query_dict to that doc}.'''
        query_norm = self._norm(query_dict)  # loop-invariant, computed once
        sims = {}
        for name, doc_dict in self.documents.items():
            denom = query_norm * self._norm(doc_dict)
            if denom == 0.0:
                # An all-zero vector (e.g. every word occurs in every
                # document, so idf == 0) has no direction; report 0.0
                # instead of dividing by zero.
                sims[name] = 0.0
                continue
            dot = sum(weight * doc_dict[w]
                      for w, weight in query_dict.items() if w in doc_dict)
            sims[name] = dot / denom
        return sims

    def addDocument(self, doc_name, list_of_words):
        '''
        Add one document to the corpus (or replace it if the name exists).
        Marks the corpus as unprepared, so prep() must be called again
        before querying.
        :param doc_name: document name, or uuid
        :param list_of_words: word list corresponding to the doc name
        :return: void
        '''
        tf = self._term_frequencies(list_of_words)

        # Replacing a document: take its old words out of the document
        # frequencies first so they are not counted twice.
        previous = self._tf.get(doc_name)
        if previous is not None:
            for w in previous:
                remaining = self.corpus_dict.get(w, 0.0) - 1.0
                if remaining > 0.0:
                    self.corpus_dict[w] = remaining
                else:
                    self.corpus_dict.pop(w, None)

        self._tf[doc_name] = tf
        self.documents[doc_name] = dict(tf)
        # Each distinct word contributes to the document frequency only once.
        for w in tf:
            self.corpus_dict[w] = self.corpus_dict.get(w, 0.0) + 1.0
        # The idf values and tf-idf weights are stale now.
        self.prepStatus = False

    def prep(self):
        '''
        Compute idf for every word and the tf-idf weights of every document.
        Safe to call repeatedly: weights are always rebuilt from the raw
        term frequencies.
        :return: void
        '''
        self.num_docs = len(self.documents)
        total = float(self.num_docs)
        self.idf = dict((w, math.log(total / df))
                        for w, df in self.corpus_dict.items())
        for name, tf in self._tf.items():
            self.documents[name] = dict((w, f * self.idf[w])
                                        for w, f in tf.items())
        self.prepStatus = True

    def similarities_by_name(self, queryName):
        '''
        Cosine tf-idf similarity of an existing document to every document
        in the corpus (itself included).
        :param queryName: name (uuid) of a document already added
        :return: sims: {docname: similarity_score}, or None (after printing a
                 message) if prep() has not been called
        :raises KeyError: if queryName is not in the corpus
        '''
        if not self.prepStatus:
            print("Not Prepared, pls call prep() first")
            return None
        return self._cosine_scores(self.documents[queryName])

    def similarities_by_wordlist(self, list_of_words):
        '''
        Cosine tf-idf similarity of a new list of words to every document in
        the corpus. Words that never occurred in the corpus get weight 0, so
        they do not affect the scores.
        :param list_of_words: a list of words
        :return: sims: {docname: similarity_score}, or None (after printing a
                 message) if prep() has not been called
        '''
        if not self.prepStatus:
            print("Not Prepared, pls call prep() first")
            return None
        tf = self._term_frequencies(list_of_words)
        query_dict = dict((w, f * self.idf.get(w, 0.0))
                          for w, f in tf.items())
        return self._cosine_scores(query_dict)
# usage
# import tfidf2
# table = tfidf2.tfidf2()