-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlexicon.py
102 lines (77 loc) · 3.56 KB
/
lexicon.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
'''
Represents a lexicon, which describes all tokens encountered during
training and their frequencies, along with special entries for unknown
tokens.
The lexicon is typically computed at training time.
'''
from utils import *
from conll_utils import ParsedConllFile, ParsedConllSentence, ParsedConllToken
from feature_map import IndexEncodedFeatureMap
class Lexicon(object):
    """
    Holds the feature maps (word forms, POS tags, dependency labels)
    built from every token encountered in the training corpus, plus the
    special out-of-vocabulary entries (<UNKNOWN>, <OUTSIDE>, <ROOT>).

    Typical lifecycle: ``compute()`` + ``write()`` at training time,
    ``read()`` at inference time. Either path ends in
    ``finalizeLexicon()``, which appends the special values and builds
    ``featureMaps``.
    """

    def __init__(self, modelParams):
        self.modelParams = modelParams
        # name -> IndexEncodedFeatureMap; populated by finalizeLexicon()
        self.featureMaps = None
        self.tagMap = IndexEncodedFeatureMap()
        self.labelMap = IndexEncodedFeatureMap()
        self.wordMap = IndexEncodedFeatureMap()

    def compute(self):
        """
        Compute the lexicon from the training data referenced by
        ``modelParams.trainingFile``, then finalize it.
        """
        projectivizeTrainingSet = self.modelParams.cfg \
            ['projectivizeTrainingSet']

        # parameters here must match parameters during corpus feature bag
        # generation (such as projectivization)
        trainingData = ParsedConllFile(keepMalformed=False,
            projectivize=projectivizeTrainingSet, logStats=True)

        # log stats here instead of during bag-of-features generation
        # because lexicon computation always happens during training.
        # Use a context manager so the training file is always closed
        # (the original leaked the file handle).
        with open(self.modelParams.trainingFile, 'r',
                  encoding='utf-8') as trainingFd:
            trainingData.read(trainingFd.read())

        for sentence in trainingData:
            for token in sentence.tokens:
                # for SyntaxNet,
                # normalization ONLY happens in lexicon builder
                # yet numbers end up as <UNKNOWN> during training
                # interesting...
                form = normalizeDigits(token.FORM)
                self.wordMap.incrementTerm(form)
                self.tagMap.incrementTerm(token.XPOSTAG)
                self.labelMap.incrementTerm(token.DEPREL)

        self.finalizeLexicon()

    def read(self):
        """
        Load a previously written lexicon from disk, then finalize it.
        """
        self.tagMap = IndexEncodedFeatureMap().loadFrom(
            self.modelParams.getFilePath('tag-map'))
        self.labelMap = IndexEncodedFeatureMap().loadFrom(
            self.modelParams.getFilePath('label-map'))
        self.wordMap = IndexEncodedFeatureMap().loadFrom(
            self.modelParams.getFilePath('word-map'))

        # special values don't get saved, so we still need to finalize lexicon
        self.finalizeLexicon()

    def write(self):
        """Persist the tag, label, and word maps to disk."""
        self.tagMap.writeTo(self.modelParams.getFilePath('tag-map'))
        self.labelMap.writeTo(self.modelParams.getFilePath('label-map'))
        self.wordMap.writeTo(self.modelParams.getFilePath('word-map'))

    def finalizeLexicon(self):
        """
        After done reading the corpus: freeze base values, append the
        special out-of-vocabulary entries, and build ``featureMaps``.
        """
        self.wordMap.finalizeBaseValues()
        self.tagMap.finalizeBaseValues()
        self.labelMap.finalizeBaseValues()

        # order of special tokens matches SyntaxNet
        self.wordMap.appendSpecialValue("<UNKNOWN>")
        self.tagMap.appendSpecialValue("<UNKNOWN>")
        self.labelMap.appendSpecialValue("<UNKNOWN>")
        self.wordMap.appendSpecialValue("<OUTSIDE>")
        self.tagMap.appendSpecialValue("<OUTSIDE>")
        self.labelMap.appendSpecialValue("<OUTSIDE>")

        # FIXME: is <ROOT> in tag even possible? it seemed to happen in
        # testdata but not in UD_English
        # difference between stack.tag and stack.token.tag?
        #self.tagMap.appendSpecialValue("<ROOT>")
        self.labelMap.appendSpecialValue("<ROOT>")

        self.featureMaps = {'word': self.wordMap, 'tag': self.tagMap,
                            'label': self.labelMap}

    def getFeatureMaps(self):
        """Return the finalized feature maps; asserts finalization ran."""
        # idiomatic None check ('is not None' rather than '!= None')
        assert self.featureMaps is not None, 'feature maps not yet created'
        return self.featureMaps