-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwsd.py
139 lines (114 loc) · 5.05 KB
/
wsd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import abc
import math
import operator
import csv
from collections import defaultdict
class DecisionList(object):
    """Table of word-sense rules, each mapped to a numeric score.

    A rule is identified by a sense, a collocation token and a collocation
    object index.  Rules live in ``decision_items`` under the string key
    ``"<sense>,<collocation>,<index>"``.
    """

    def __init__(self):
        self.decision_items = defaultdict(float)

    @staticmethod
    def _split_key(key):
        """Split a rule key into ``(sense, collocation, index)`` strings.

        The sense is taken up to the first comma and the index after the
        last one, so a collocation token that itself contains commas
        (e.g. the "," token) is recovered intact.  This assumes the sense
        and the index contain no commas.
        """
        sense, remainder = key.split(",", 1)
        collocation, index = remainder.rsplit(",", 1)
        return sense, collocation, index

    def _rules_by_collocation(self, collocation_object_index):
        """Return ``{collocation: [(sense, score), ...]}`` for one index.

        Built in a single pass so that callers can look tokens up directly
        instead of rescanning every rule for every token.
        """
        wanted_index = str(collocation_object_index)
        rules = {}
        for key, score in self.decision_items.items():
            sense, collocation, index = self._split_key(key)
            if index == wanted_index:
                rules.setdefault(collocation, []).append((sense, score))
        return rules

    def add_sense(self, sense, collocation, collocation_object_index, score):
        """Store ``score`` for the rule, replacing any previous score."""
        key = ",".join([sense, collocation, str(collocation_object_index)])
        self.decision_items[key] = score  # Overwrite if the key already exists

    def save(self, outfile):
        """Write all rules to ``outfile`` as CSV, highest score first.

        Each row is ``sense, collocation, index, score``.
        """
        ranked = sorted(self.decision_items.items(),
                        key=operator.itemgetter(1), reverse=True)
        rows = [list(self._split_key(key)) + [str(score)]
                for key, score in ranked]
        with open(outfile, "w") as f:
            csv.writer(f).writerows(rows)

    def load(self, infile):
        """Add every ``sense, collocation, index, score`` row of a CSV file."""
        with open(infile) as senses_file:
            for row in csv.reader(senses_file):
                if not row:
                    # Blank lines carry no rule; skip them instead of
                    # failing on the four-way unpack below.
                    continue
                sense, collocation, collocation_object_index, score = row
                self.add_sense(sense, collocation, collocation_object_index,
                               float(score))

    def get_sense(self, tokens, collocation_object_index):
        """Return the sense of the best-scoring unambiguous token rule.

        Tokens with no rule, or with rules for more than one sense, are
        skipped.  Only scores strictly greater than 0 can win; returns
        None when no token qualifies.
        """
        rules = self._rules_by_collocation(collocation_object_index)
        sense = None
        score_max = 0
        for token in tokens:
            candidates = rules.get(token, ())
            if len(candidates) != 1:
                continue  # No sense found or ambiguous sense
            candidate_sense, score = candidates[0]
            if score > score_max:
                score_max = score
                sense = candidate_sense
        return sense

    def get_senses_score(self, tokens, collocation_object_index):
        """Return ``[senses, score, ambiguous]`` for the best-scoring token.

        ``senses`` lists every sense that has a rule for the winning token,
        ``score`` is the score of the first of those rules and ``ambiguous``
        is True when there is more than one.  With no winning token the
        result is ``[None, 0, False]``.

        NOTE: returns None as soon as any token has no rule at all, unlike
        ``get_sense`` which skips such tokens.
        """
        rules = self._rules_by_collocation(collocation_object_index)
        score_max = 0
        senses_max = None
        ambiguous = False
        for token in tokens:
            candidates = rules.get(token)
            if not candidates:
                return None  # No sense assigned
            first_score = candidates[0][1]
            if first_score > score_max:
                score_max = first_score
                senses_max = [sense for sense, _ in candidates]
                ambiguous = len(candidates) > 1
        return [senses_max, score_max, ambiguous]
# Building the base through ABCMeta directly makes the metaclass apply on
# both Python 2 and Python 3; a ``__metaclass__`` attribute is ignored by
# Python 3, which would leave the abstract method unenforced there.
class Collocation(abc.ABCMeta("_AbstractBase", (object,), {})):
    """Per-sense frequency counts of collocation tokens.

    Subclasses define which tokens around a target offset make up the
    collocation by implementing ``get_collocation``.
    """

    # Stand-in frequency for a collocation never seen with the other sense,
    # so the frequency ratio stays finite.
    _UNSEEN_FREQUENCY = 1e-1

    def __init__(self, senses, index):
        """Create empty counters for each sense.

        ``index`` is stored and passed to the decision list as the
        collocation object index of every rule this object produces.
        """
        self.frequencies = defaultdict(defaultdict)
        self.index = index
        for sense in senses:
            self.frequencies[sense] = defaultdict(int)

    @abc.abstractmethod
    def get_collocation(self, text, offset):
        """Return the collocation tokens for ``offset`` in ``text``, or None."""
        pass

    def add_collocation(self, text, offset, sense):
        """Count the collocation tokens at ``offset`` towards ``sense``.

        Does nothing when ``get_collocation`` returns None.  ``sense`` must
        be one of the senses given to the constructor: counters created on
        demand have no default value, so an unknown sense raises KeyError.
        """
        tokens = self.get_collocation(text, offset)
        if tokens is None:
            return
        counts = self.frequencies[sense]
        for token in tokens:
            counts[token] += 1

    @staticmethod
    def _abs_log_ratio(frequency_a, frequency_b):
        """Return ``|log(P(a) / P(b))|`` for the two raw frequencies."""
        frequency_a = float(frequency_a)
        frequency_b = float(frequency_b)
        total = frequency_a + frequency_b
        return abs(math.log((frequency_a / total) / (frequency_b / total)))

    def update_decision_list(self, decision_list):
        """Add one scored rule per (sense, collocation) to ``decision_list``.

        Only the first two senses in ``self.frequencies`` are compared;
        fewer than two raises IndexError.  A collocation seen with both
        senses yields a rule for each sense, both with the same score.
        """
        per_sense = list(self.frequencies.items())
        sense_a, counts_a = per_sense[0]
        sense_b, counts_b = per_sense[1]
        unseen = self._UNSEEN_FREQUENCY

        for collocation, frequency_a in counts_a.items():
            # .get() reads without inserting into the other sense's counter.
            frequency_b = counts_b.get(collocation, unseen)
            score = self._abs_log_ratio(frequency_a, frequency_b)
            decision_list.add_sense(sense_a, collocation, self.index, score)

        for collocation, frequency_b in counts_b.items():
            frequency_a = counts_a.get(collocation, unseen)
            score = self._abs_log_ratio(frequency_a, frequency_b)
            decision_list.add_sense(sense_b, collocation, self.index, score)
class BigramLeft(Collocation):
    """Collocation made of the single token just before the target offset."""

    def get_collocation(self, text, offset):
        """Return ``[text.tokens[offset - 1]]``, or None when ``offset`` is 0."""
        has_left_neighbour = offset != 0
        return [text.tokens[offset - 1]] if has_left_neighbour else None
class BigramRight(Collocation):
    """Collocation made of the single token just after the target offset."""

    def get_collocation(self, text, offset):
        """Return ``[text.tokens[offset + 1]]``.

        Returns None when ``offset`` is at or beyond the last token index.
        """
        last_index = len(text.tokens) - 1
        if offset < last_index:
            return [text.tokens[offset + 1]]
        return None
class BigramScope(Collocation):
    """Collocation made of the tokens within a distance band of the target.

    ``scope`` is a ``(nearest, farthest)`` pair of distances; tokens whose
    distance from the target offset lies in that band, on either side, form
    the collocation.
    """

    def __init__(self, senses, index, scope):
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(BigramScope, self).__init__(senses, index)
        self.scope = scope

    def get_collocation(self, text, offset):
        """Return the distinct tokens inside the scope band around ``offset``.

        Each side contributes whatever part of its window lies inside the
        text; a window entirely outside the text contributes nothing.
        Tokens are returned in first-seen order (left window first) and
        the result may be an empty list.
        """
        nearest, farthest = self.scope[0], self.scope[1]
        last_index = len(text.tokens) - 1
        # A range whose start exceeds its stop is empty, so a window that
        # falls off either end needs no sentinel (an index of -1 would
        # silently pick up the last token of the text).
        left_window = range(max(offset - farthest, 0), offset - nearest + 1)
        right_window = range(offset + nearest,
                             min(offset + farthest, last_index) + 1)

        seen = set()
        tokens = []
        for window in (left_window, right_window):
            for i in window:
                token = text.tokens[i]
                if token not in seen:  # Remove duplicates
                    seen.add(token)
                    tokens.append(token)
        return tokens
def print_context(text, offset, offset_margin=20):
    """Print the tokens around ``offset`` as one space-separated line.

    The window is ``text.tokens[offset - offset_margin : offset + offset_margin]``
    clamped to the bounds of the token list, so it starts up to
    ``offset_margin`` tokens before ``offset`` and stops just before
    ``offset + offset_margin``.
    """
    start = max(offset - offset_margin, 0)
    stop = min(offset + offset_margin, len(text.tokens))
    # A single parenthesised argument prints identically on Python 2 and 3.
    print(" ".join(text.tokens[start:stop]))