-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcounting.py
60 lines (42 loc) · 1.81 KB
/
counting.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import collections
import math
import typing
from documents import TransformedDocument, TransformedDocumentCollection
def count_words(doc: TransformedDocument) -> collections.Counter:
    """Tally how many times each token occurs in a single document."""
    token_counts = collections.Counter()
    token_counts.update(doc.tokens)
    return token_counts
def count_words_in_collection(docs: TransformedDocumentCollection) -> collections.Counter:
    """Aggregate per-token occurrence counts across every document in the collection."""
    combined = collections.Counter()
    for document in docs.get_all_docs():
        # Counter.update with an iterable counts each element's occurrences.
        combined.update(document.tokens)
    return combined
def document_counts(docs: TransformedDocumentCollection) -> collections.Counter:
    """
    Compute number of documents each word occurs in.
    :param docs: TransformedDocumentCollection to run over
    :return: A counter mapping tokens to the number of documents each token occurs in.
    """
    doc_freq = collections.Counter()
    for document in docs.get_all_docs():
        # Deduplicate within a document so each doc contributes at most 1 per token.
        doc_freq.update(set(document.tokens))
    return doc_freq
def term_frequency(count: int, doc_len: int) -> float:
    """Fraction of the document occupied by a term: raw count over document length."""
    frequency = count / doc_len
    return frequency
def inverse_doc_frequency(doc_count: int, collection_size: int) -> float:
    """Natural-log inverse document frequency: log(collection_size / doc_count)."""
    ratio = collection_size / doc_count
    return math.log(ratio)
def tf_idf(tf: float, idf: float) -> float:
    """Combine a term-frequency and inverse-document-frequency score into one weight."""
    return idf * tf
def doc_tf_idf_scores(doc: TransformedDocument, doc_frequencies: collections.Counter) -> typing.Dict[str, float]:
    """
    Compute per-term weights for a single document.

    Each term's weight is its in-document count divided by the number of
    documents it occurs in (a count/df score; note no log-idf is applied here).

    :param doc: document whose tokens are scored.
    :param doc_frequencies: term -> number of documents containing it,
        as produced by document_counts().
    :return: dict mapping each term in the document to its weight.
    """
    out = dict()
    term_frequencies = count_words(doc)
    # Bug fix: iterating a Counter directly yields only its keys (token
    # strings), so `for term, freq in term_frequencies:` tried to unpack
    # each token string and raised ValueError for any token of length != 2.
    # .items() yields the intended (term, count) pairs.
    for term, freq in term_frequencies.items():
        weight = freq / doc_frequencies[term]
        out[term] = weight
    return out
def tf_idf_scores(docs: TransformedDocumentCollection):
    """Compute the per-term weight dict for every document in the collection."""
    # Document frequencies are collection-wide, so compute them once up front.
    doc_frequencies = document_counts(docs)
    return [doc_tf_idf_scores(document, doc_frequencies)
            for document in docs.get_all_docs()]
def query_score(query: typing.List[str], doc_weights: typing.Dict[str, float]) -> float:
    """
    Score a document against a query by summing the document's weights
    for each query term.

    Robustness fix: query terms absent from the document now contribute
    0.0 instead of raising KeyError — real queries routinely contain
    terms a given document lacks. Repeated query terms are summed once
    per occurrence, as before.

    :param query: tokenized query terms.
    :param doc_weights: term -> weight mapping for one document.
    :return: total score of the document for this query.
    """
    return sum(doc_weights.get(term, 0.0) for term in query)