-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathsearcher.py
162 lines (126 loc) · 5.34 KB
/
searcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# -*- coding: UTF-8 -*-
import sys
from search import Search
from content import Content
from collections import Counter
from tokenizer import Tokenizer
import termcolor
# Passed as the first argument to Search() when the engine is constructed;
# presumably the n-gram length used by the index -- confirm against search.py.
NGRAM = 2
# NOTE(review): not referenced anywhere in the visible code; presumably meant
# for score damping when merging results -- confirm before relying on it.
DAMPING_SCORE = 10
# Integer codes 0-7, one per terminal color name.
# NOTE(review): not referenced in the visible code; presumably kept for the
# terminal-coloring helper -- confirm.
BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8) #term color
class Searcher:
    """Query front end over the project's Search engine.

    Runs single-term, OR and AND queries, then prints every matching
    document through ``termcolor.printcolor`` together with the query tokens.

    Attributes:
        engine: the Search instance used to run queries and fetch documents.
        tokenizer: the Tokenizer used to split query text into tokens.
        query_string: tokens collected from the executed statements; they are
            handed to the printer. The list is never cleared, so it grows
            across calls on the same instance.
    """

    def __init__(self):
        self.engine = Search(NGRAM, "./")
        self.tokenizer = Tokenizer("ma")
        self.query_string = []

    @staticmethod
    def _to_unicode(text):
        """Return ``text`` as unicode text, decoding UTF-8 byte strings.

        Byte strings (``str`` on Python 2, ``bytes`` on Python 3) are decoded
        as UTF-8; text that is already decoded is returned unchanged instead
        of raising, which ``unicode(text, "UTF-8")`` would do.
        """
        if isinstance(text, bytes):
            return text.decode("UTF-8")
        return text

    def _remember_query_tokens(self, statement):
        """Add the tokens of ``statement`` to ``self.query_string`` (no duplicates)."""
        for token in self.tokenizer.split(self._to_unicode(statement)):
            if token not in self.query_string:
                self.query_string.append(token)

    def _execute(self, statement, numOfResult):
        """Thin wrapper around the engine's ``zenhan_search``.

        Args:
            statement: query text (UTF-8 bytes or unicode).
            numOfResult: result limit forwarded to the engine; callers in this
                class pass either an int or the string ``"all"``.

        Returns:
            Whatever the engine returns; the rest of this class treats it as a
            list of ``(content_id, score)`` pairs.
        """
        return self.engine.zenhan_search(self._to_unicode(statement), numOfResult)

    def execute_with_singleword(self, statement, numOfResult):
        """Search for one statement without operators and print the hits.

        A "single word" is a statement with no spaces, i.e. no implicit AND
        and no OR keyword.
        """
        search_result = self._execute(statement, numOfResult)
        query_tokens = self.tokenizer.split(self._to_unicode(statement))
        self.print_result(search_result, query_tokens)

    def print_result(self, search_result, query):
        """Fetch and print the content of every hit.

        Args:
            search_result: iterable of ``(content_id, score)`` pairs, e.g.
                ``[(id1, score1), (id2, score2), ...]``; ``None`` or an empty
                result prints nothing.
            query: query tokens forwarded to ``termcolor.printcolor``.
        """
        # [TODO] print matched words with color
        for hit in search_result or ():
            doc = self.engine.content.get(hit[0])
            termcolor.printcolor(doc, query)
            print("")

    @staticmethod
    def _group_or_operands(tokens):
        """Group whitespace-split tokens into the operands of an OR query.

        Tokens between two ``OR`` keywords are concatenated without a
        separator, so ``["a", "b", "OR", "c"]`` becomes ``["ab", "c"]``.

        Returns:
            The list of operands, or ``None`` when any operand is empty
            (leading, doubled or trailing ``OR``).
        """
        operands = []
        pending = []
        for token in tokens:
            if token != "OR":
                pending.append(token)
                continue
            if not pending:
                return None
            operands.append("".join(pending))
            pending = []
        if not pending:
            # Trailing "OR": reject it like a leading or doubled one instead
            # of searching for an empty statement.
            return None
        operands.append("".join(pending))
        return operands

    def execute_with_listword(self, statementList, numOfResult):
        """Run a query that contains spaces, i.e. an AND or an OR query.

        AND and OR cannot be mixed in one query: when the ``OR`` keyword is
        present, the space-separated words of each operand are concatenated;
        otherwise every word is an AND term.

        Args:
            statementList: the whole query as one space-separated string.
            numOfResult: maximum number of results to print.

        Returns:
            ``None`` in every case; a malformed OR query returns early without
            searching or printing anything.
        """
        tokens = statementList.split()
        # Test for the keyword as a whole token: a substring test on the raw
        # string would treat any word containing "OR" as an operator.
        if "OR" in tokens:
            operands = self._group_or_operands(tokens)
            if operands is None:
                return None
            return self._or_operator(operands, numOfResult)
        return self._and_operator(tokens, numOfResult)

    @staticmethod
    def _merge_or_results(result_lists):
        """Merge the per-statement hit lists of an OR query into one Counter.

        Every hit of every list ends up in the result. A hit whose id is not
        in the previous non-empty list keeps its score; a hit whose id is in
        the previous non-empty list gets ``score - max_score`` of its own list.
        Later lists overwrite the value stored by earlier ones.

        Args:
            result_lists: one list of ``(content_id, score)`` pairs per
                statement; ``None`` and empty lists are skipped.

        Returns:
            ``collections.Counter`` mapping content id to merged score.
        """
        merged = Counter()
        previous_ids = set()
        for hits in result_lists:
            if not hits:
                # Nothing found for this statement.
                continue
            # Assumes each list is sorted by descending score, so the first
            # entry carries the maximum -- TODO confirm against the engine.
            max_score = hits[0][1]
            for hit in hits:
                content_id, content_score = hit[0], hit[1]
                if content_id in previous_ids:
                    # NOTE(review): this makes documents matched by two
                    # consecutive operands score <= 0; kept as originally
                    # written -- confirm the intended ranking.
                    merged[content_id] = content_score - max_score
                else:
                    merged[content_id] = content_score
            previous_ids = set(hit[0] for hit in hits)
        return merged

    def _or_operator(self, statementList, numOfResult):
        """Search every OR operand, merge the hits and print the top results.

        Args:
            statementList: already-normalized operands, e.g. the query
                ``"a b OR c"`` arrives here as ``["ab", "c"]``.
            numOfResult: maximum number of merged results to print.
        """
        # Collect the tokens of all operands first so the printer can use them.
        for statement in statementList:
            self._remember_query_tokens(statement)
        result_lists = [self._execute(statement, "all") for statement in statementList]
        merged = self._merge_or_results(result_lists)
        self.print_result(merged.most_common(numOfResult), self.query_string)

    def _and_operator(self, statementList, numOfResult):
        """Search every AND term, sum the scores per document and print.

        The hits of all terms are merged; a document found by several terms
        accumulates the sum of its scores.

        Args:
            statementList: the query terms, e.g. ``"a b"`` arrives here as
                ``["a", "b"]``.
            numOfResult: maximum number of merged results to print.
        """
        # [TODO] set earlier token higher score
        accumulated = Counter()
        for statement in statementList:
            self._remember_query_tokens(statement)
            for hit in self._execute(statement, "all") or ():
                accumulated[hit[0]] += hit[1]
        self.print_result(accumulated.most_common(numOfResult), self.query_string)
if __name__ == "__main__":
    # Command-line entry point:
    #     ./searcher.py statement [more words ...] numOfResult
    # The last argument is the number of results to print; everything between
    # the script name and it forms the query.
    # [TODO] load once, search multiple!
    usage = "usage: ./searcher.py statement numOfResult"
    if len(sys.argv) < 3:
        print(usage)
        sys.exit(1)

    query_args = sys.argv[1:-1]
    try:
        numOfResult = int(sys.argv[-1])
    except ValueError:
        # A non-numeric result count is a usage error, not a traceback.
        print(usage)
        sys.exit(1)

    searcher = Searcher()
    if len(query_args) == 1:
        # Exactly one query argument: plain search without operators.
        searcher.execute_with_singleword(query_args[0], numOfResult)
    else:
        # Several arguments: re-join them so that spaces act as AND and the
        # "OR" keyword acts as OR.
        searcher.execute_with_listword(" ".join(query_args), numOfResult)