-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutils.py
188 lines (161 loc) · 5.8 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import tensorflow as tf
from tensorflow.python.ops import control_flow_ops as cf
def normalizeDigits(form):
    '''
    Replace all digits with 9s like SyntaxNet

    Only the ASCII digits '0'-'9' are rewritten; every other character
    (including non-ASCII digit characters) is copied through unchanged.

    Ex.
    normalizeDigits('abc123') returns 'abc999'
    '''
    # Range comparison rather than str.isdigit(): isdigit() also matches
    # non-ASCII digits, which the ordinal check this replaces never touched.
    return ''.join('9' if '0' <= ch <= '9' else ch for ch in form)
def tensorPrintShape(inp, data, comment):
    '''
    Gets array shape of dynamically shaped tensors

    Builds a py_func over data whose callback prints comment followed by
    the .shape of every value it receives, and returns inp with a control
    dependency on that op.

    Ex.
    dense_golden = tensorPrintShape(dense_golden, [dense_golden],
    'dense_golden shape')
    '''
    def _show_shapes(*arrays):
        for arr in arrays:
            print(comment, arr.shape)

    shape_printer = tf.py_func(_show_shapes, data, [])
    return cf.with_dependencies([shape_printer], inp)
def tensorPrint(inp, data, comment):
    '''
    Builds a py_func over data whose callback prints comment followed by
    every value it receives, and returns inp with a control dependency on
    that op.

    Ex.
    dense_golden = tensorPrint(dense_golden, [dense_golden], 'dense_golden data')
    '''
    def _show_values(*arrays):
        for arr in arrays:
            print(comment, arr)

    value_printer = tf.py_func(_show_values, data, [])
    return cf.with_dependencies([value_printer], inp)
# number of times a dump callback has run, keyed by output filename
tensorDumpValsCallCount = {}


def tensorDumpVals(inp, data, fname, desired_iter):
    '''
    Dump the flattened values of data to fname, one '%.8f' formatted value
    per line, and return inp with a control dependency on the dump op.

    print only the desired_iter'th time the function is called (1-based)
    for this particular filename

    The call counter lives in the module-level tensorDumpValsCallCount
    dict, so it is shared by every dump op created with the same fname.

    Ex.
    dense_golden = tensorDumpVals(dense_golden, [dense_golden],
    '/tmp/ash_dense_golden_1', 1)
    '''
    def np_print(*args):
        # The dict is only mutated in place, never rebound, so no `global`
        # declaration is needed here.
        call_count = tensorDumpValsCallCount.get(fname, 0) + 1
        tensorDumpValsCallCount[fname] = call_count
        # only execute for the iteration # desired
        if call_count == desired_iter:
            # `with` closes the file even if formatting or writing raises
            with open(fname, 'w') as fd:
                for x in args:
                    for elem in x.flatten():
                        fd.write('%.8f\n' % elem)

    return cf.with_dependencies([tf.py_func(np_print, data, [])], inp)
def tensorDumpValsAllIter(inp, data, fname):
    '''
    Dump the flattened values of data on every call, one '%.8f' formatted
    value per line, and return inp with a control dependency on the dump op.

    Each call writes a new file named '<fname>_<NNNN>', where NNNN is the
    1-based, zero-padded call count kept for fname in the module-level
    tensorDumpValsCallCount dict.

    Ex.
    dense_golden = tensorDumpValsAllIter(dense_golden, [dense_golden],
    '/tmp/ash_dense_golden')
    '''
    def np_print(*args):
        # The dict is only mutated in place, never rebound, so no `global`
        # declaration is needed here.
        call_count = tensorDumpValsCallCount.get(fname, 0) + 1
        tensorDumpValsCallCount[fname] = call_count
        # `with` closes the file even if formatting or writing raises
        with open('%s_%04d' % (fname, call_count), 'w') as fd:
            for x in args:
                for elem in x.flatten():
                    fd.write('%.8f\n' % elem)

    return cf.with_dependencies([tf.py_func(np_print, data, [])], inp)
'''
See SyntaxNet utils.h
'''
# Inclusive (first, last) ordinal ranges, sorted in ascending order.
kPunctuation = [
    (33, 35), (37, 42), (44, 47), (58, 59),
    (63, 64), (91, 93), (95, 95), (123, 123),
    (125, 125), (161, 161), (171, 171), (183, 183),
    (187, 187), (191, 191), (894, 894), (903, 903),
    (1370, 1375), (1417, 1418), (1470, 1470), (1472, 1472),
    (1475, 1475), (1478, 1478), (1523, 1524), (1548, 1549),
    (1563, 1563), (1566, 1567), (1642, 1645), (1748, 1748),
    (1792, 1805), (2404, 2405), (2416, 2416), (3572, 3572),
    (3663, 3663), (3674, 3675), (3844, 3858), (3898, 3901),
    (3973, 3973), (4048, 4049), (4170, 4175), (4347, 4347),
    (4961, 4968), (5741, 5742), (5787, 5788), (5867, 5869),
    (5941, 5942), (6100, 6102), (6104, 6106), (6144, 6154),
    (6468, 6469), (6622, 6623), (6686, 6687), (8208, 8231),
    (8240, 8259), (8261, 8273), (8275, 8286), (8317, 8318),
    (8333, 8334), (9001, 9002), (9140, 9142), (10088, 10101),
    (10181, 10182), (10214, 10219), (10627, 10648), (10712, 10715),
    (10748, 10749), (11513, 11516), (11518, 11519), (11776, 11799),
    (11804, 11805), (12289, 12291), (12296, 12305), (12308, 12319),
    (12336, 12336), (12349, 12349), (12448, 12448), (12539, 12539),
    (64830, 64831), (65040, 65049), (65072, 65106), (65108, 65121),
    (65123, 65123), (65128, 65128), (65130, 65131), (65281, 65283),
    (65285, 65290), (65292, 65295), (65306, 65307), (65311, 65312),
    (65339, 65341), (65343, 65343), (65371, 65371), (65373, 65373),
    (65375, 65381), (65792, 65793), (66463, 66463), (68176, 68184)
]


def isPunctuation(uni_ord):
    '''
    Determines if the specified unicode ordinal is punctuation or not

    uni_ord must be an int. Returns True when it falls inside one of the
    inclusive kPunctuation ranges and False otherwise, including for
    ordinals beyond the last range.
    '''
    assert isinstance(uni_ord, int)
    # Iterate the table directly: it has no terminating sentinel row, so an
    # index-based scan would run off the end for ordinals past the last range.
    for first, last in kPunctuation:
        if uni_ord < first:
            # ranges are ascending, so no later range can contain uni_ord
            return False
        if uni_ord <= last:
            return True
    return False
def isPunctuationToken(word):
    '''
    Returns true if word consists of punctuation characters.

    Every character of word is checked with isPunctuation; an empty word
    has no failing character and therefore also returns True.
    '''
    return all(isPunctuation(ord(ch)) for ch in word)
def isPunctuationTag(tag):
    '''
    Determine if tag is a punctuation tag.

    A tag qualifies when each of its characters is one of , : . ' `
    An empty tag returns True; the empty-tag guard is deliberately left
    out to match SyntaxNet behavior.
    '''
    punct_chars = (',', ':', '.', '\'', '`')
    return all(ch in punct_chars for ch in tag)
def isPunctuationTagOrParens(tag):
    '''
    Returns true if tag is non-empty and has only punctuation or parens
    symbols.

    The accepted characters are ( ) , : . ' `
    '''
    allowed_chars = ('(', ')', ',', ':', '.', '\'', '`')
    return len(tag) != 0 and all(ch in allowed_chars for ch in tag)
# FIXME: empty tags might show as '_' in CoNLL. '_' behavior is not
# well-defined in the specification.
def shouldScoreToken(word, tag, scoring_strategy):
    '''
    Return whether or not we should score a token based on the current
    scoring strategy

    scoring_strategy is one of:
    'default'       - score when tag is empty or is not a punctuation tag
    'conllx'        - score unless word is a punctuation token
    'ignore_parens' - score unless tag is a punctuation-or-parens tag

    Raises AssertionError for any other scoring_strategy.
    '''
    if scoring_strategy == 'default':
        return len(tag) == 0 or not isPunctuationTag(tag)
    if scoring_strategy == 'conllx':
        return not isPunctuationToken(word)
    if scoring_strategy == 'ignore_parens':
        return not isPunctuationTagOrParens(tag)
    # Raised explicitly rather than through an `assert` statement: asserts
    # are stripped under `python -O`, which would make an unknown strategy
    # silently return None.
    raise AssertionError('unknown scoring strategy: %s' % (scoring_strategy,))