# finder.py
import json
import re
import ast  # needed for safe literal parsing of posting lines (see weigh_query)
from collections import defaultdict
from nltk.stem import PorterStemmer


def stem_words(query):
    """
    query: list of original query terms
    returns: the Porter stem of each query term
    """
    ps = PorterStemmer()
    return [ps.stem(word) for word in query]
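
# A quick illustration (hedged: exact stems depend on the installed NLTK
# version, but these are the standard Porter outputs):
#   stem_words(["running", "cats", "engines"]) -> ["run", "cat", "engin"]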


def weigh_query(query):
    Scores = defaultdict(list)    # Holds per-term scores for every doc found
    Magnitude = defaultdict(int)  # Holds length of documents

    tfidf_dict = "C:/Test/tf_idf_score_dict.txt"
    # Areeta
    # final_text_index = "/Users/AreetaW/Desktop/final_text_index.txt"
    # final_url_index = "/Users/AreetaW/Desktop/final_url_index.txt"
    # dev_directory = "/Users/AreetaW/Desktop/cs/cs-121/assignment3/DEV/"
    # Kaeley
    # final_text_index = "/Users/kaeleylenard/Desktop/final_text_index.txt"
    # final_url_index = "/Users/kaeleylenard/Desktop/final_url_index.txt"
    # dev_directory = "/Users/kaeleylenard/Documents/CS121-Spring2020/SearchEngine/DEV"
    # Cristian
    final_url_index = "C:/Test/branch/final_url_index.txt"
    dev_directory = "C:/Test/DEV/"
    index_positions = "C:/Test/index_positions.txt"
    compiled_index = "C:/Test/compiled_text_index.txt"

    # Load file with idf scores for cosine scoring
    with open(tfidf_dict) as idf_scores:
        tfidf_response = json.loads(idf_scores.read())
    # Load file with byte positions of word lines in compiled_index
    with open(index_positions) as index_file:
        index_responses = json.loads(index_file.read())

    # Seek straight to each query word's line in the compiled text index
    with open(compiled_index) as text_file:
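        # Each word's line is assumed (hedged: inferred from the parsing
        # below) to look like:
        #   word,"(docID, tf, docLen)}{(docID, tf, docLen)"
        # with '}{' separating consecutive posting tuples.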
        for word in query:
            if word in index_responses:
                pos = index_responses[word]
                text_file.seek(pos)
                posts = text_file.readline()
                # Strip the leading 'word,"' prefix and stray quotes, then
                # fuse the tuple groups so the line parses as one literal
                posts = re.sub(re.escape(word) + ',"', '', posts)
                posts = posts.replace('"', '')
                posts = posts.replace('}{', ',')
                posts = ast.literal_eval(posts)  # safer than eval() on raw file data
                term_idf = tfidf_response[word]
                for (docID, docTF, docLength) in posts:
                    if docLength >= 150:  # ignore very short documents
                        Scores[docID].append([word, term_idf * docTF])
                        Magnitude[docID] = docLength

    # Ensure found documents contain all query terms
    query_length_check = [key for key in Scores if len(Scores[key]) < len(query)]
    for key in query_length_check:
        del Scores[key]

    # Combine scores from each individual query term into a final doc score
    for k, v in Scores.items():
        Scores[k] = sum(score for _word, score in v)

    # Normalize by dividing by the length of the document
    for doc in Scores:
        Scores[doc] = Scores[doc] / Magnitude[doc]

    # Rank documents by normalized score, best first
    ranked = [k for k, v in sorted(Scores.items(), key=lambda item: item[1], reverse=True)]
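    # Worked example (hypothetical numbers): for the single-term query ["run"]
    # with idf 1.5, a doc with tf 4 and length 200 scores (1.5 * 4) / 200 = 0.03.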

    final = []
    # Load file mapping docIDs to the JSON file that holds each doc's URL;
    # url_response is assumed to have the shape {'0': {docID: relative json path}}
    with open(final_url_index) as url_file:
        url_response = json.loads(url_file.read())
    for docID in ranked[:5]:  # top 5 documents only
        json_path = dev_directory + url_response['0'][str(docID)]
        with open(json_path) as doc_file:
            json_response = json.loads(doc_file.read())
        final.append(json_response['url'])
    return final


def retrieval_component(query):
    """
    query: list of original query terms
    rare_query: stemmed query terms
    returned_docs: URLs of docs that contain ALL the stemmed terms
    """
    rare_query = stem_words(query)
    returned_docs = weigh_query(rare_query)
    for link in returned_docs:
        print(link)
    return returned_docs
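

# Example session (illustrative only; results depend on the local index files):
#   $ python finder.py
#   Search: running engines
#   ... prints the top-5 matching URLs, one per line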


if __name__ == "__main__":
    user_query = input("Search: ")
    split_query = user_query.split()
    retrieval_component(split_query)