-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxml_util.py
executable file
·400 lines (312 loc) · 12.3 KB
/
xml_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
#!/usr/bin/python2.4
#
# Copyright 2009 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""XML utilities.
Useful for dealing with XML DOM objects (xml.dom.Node, etc.) in the context of
App Engine Datastore (google.appengine.ext.db).
"""
__author__ = '[email protected] (Matt Frantz)'
import re
try:
import google3
from google3.apphosting.ext import db
from google3.apphosting.runtime.apiproxy_errors import DeadlineExceededError
from google3.pyglib import logging
except ImportError, e:
import logging
from google.appengine.ext import db
from google.appengine.runtime import DeadlineExceededError
import iso8601
# Allow independent control of the logging from this module.
try:
  # Dedicated logger keyed by this file's path, so its level can be adjusted
  # without touching other loggers.  With the standard logging module, a level
  # of INFO means the logger.debug() calls in this module are not emitted.
  logger = logging.getLogger(__file__)
  logger.setLevel(logging.INFO)
except AttributeError:
  # The imported "logging" module does not provide the attributes used above
  # (presumably the google3 variant imported at the top of the file -- TODO
  # confirm).  Fall back to calling the module's own logging functions.
  logger = logging
  logger.warn('No independent logging')
class Error(Exception):
  """Root of the exception hierarchy defined in this module."""
class RecoverableError(Error):
  """Base exception for CopyNodes errors that do not abort the copy.

  CopyNodes and CopyNodeLists collect instances of this class in the list they
  return instead of raising them.
  """
class MultipleNodeError(RecoverableError):
  """Reports that more than one XML node matched a single tag name.

  Recorded by CopyNodes when a tag that should map to one attribute value
  appears several times.

  Attributes:
    tag_name: Name of the XML node (str)
    child_nodes: List of two or more XML nodes (List of xml.dom.Node)
  """

  def __init__(self, tag_name, child_nodes):
    """Initializes a MultipleNodeError.

    Args:
      tag_name: Name of the XML node (str)
      child_nodes: List of two or more XML nodes (List of xml.dom.Node)
    """
    # The message lists the str() form of every offending node.
    node_list = ','.join(map(str, child_nodes))
    message = 'Duplicate child nodes "%s": %s' % (tag_name, node_list)
    # Explicit base-class call (not super()) so this also works where
    # exceptions are old-style classes.
    RecoverableError.__init__(self, message)
    self.tag_name = tag_name
    self.child_nodes = child_nodes
class CopyNodeError(RecoverableError):
  """Reports an attribute value that could not be converted or assigned.

  Recorded by CopyNodes and CopyNodeLists.

  Attributes:
    attribute_name: Name of the target attribute (str)
    attribute_value: Text form of the value that provoked the error (str)
    root_cause: Exception that describes the error in detail (Exception)
  """

  def __init__(self, attribute_name, attribute_value, root_cause):
    """Initializes a CopyNodeError.

    Args:
      attribute_name: Name of the target attribute (str)
      attribute_value: Text form of the value that provoked the error (str)
      root_cause: Exception that describes the error in detail (Exception)
    """
    # The message carries both the class name and the text of the underlying
    # exception.
    cause_type = root_cause.__class__.__name__
    message = 'Error copying %r from %r: %s: %s' % (
        attribute_name, attribute_value, cause_type, root_cause)
    RecoverableError.__init__(self, message)
    self.attribute_name = attribute_name
    self.attribute_value = attribute_value
    self.root_cause = root_cause
def NodeToString(xml_node):
  """Serializes a DOM node as XML text.

  Args:
    xml_node: xml.dom.Node object

  Returns:
    String containing the XML produced by the node's toxml() method
  """
  xml_text = xml_node.toxml()
  return xml_text
def GetText(nodes):
  """Concatenates text from text nodes.

  Only nodes whose nodeType is TEXT_NODE contribute.  Every other kind of node
  is skipped and is not descended into.

  Args:
    nodes: List of xml.dom.Node objects

  Returns:
    Concatenation, in order, of the data of all TEXT_NODE nodes, with leading
    and trailing whitespace stripped (string).  Empty string if there is no
    text.
  """
  # Collect the pieces and join once; repeated "text = text + piece" copies
  # the accumulated string on every iteration.
  pieces = [node.data for node in nodes if node.nodeType == node.TEXT_NODE]
  return ''.join(pieces).strip()
def CopyNodes(model, node, names, converter, name_map=None):
  """Copies XML nodes into model attributes of the same respective names.

  For each name, the matching element's text (see GetText) is converted and
  assigned to the model attribute.  Names with no matching element, or whose
  element has no text, leave the model untouched.  Elements are located with
  node.getElementsByTagName, which in standard DOM implementations matches
  descendants at any depth, not only direct children.

  Args:
    model: db.Model object
    node: xml.dom.Node object
    names: List of child node / model attribute names (strings)
    converter: Function which converts the text from the XML into an
        appropriate object for the data model.
    name_map: Dict to translate child node names to model attribute names
        (str:str)

  Returns:
    List of RecoverableError objects, possibly empty, representing multiple
    nodes that are found for any of the names, or if there is a problem
    converting the text into the appropriate value object.

  Raises:
    DeadlineExceededError, AssertionError: Re-raised unchanged if the
        conversion or assignment raises them.
  """
  # Imported here so that this function is self-contained.
  import sys

  errors = []
  for tag_name in names:
    child_nodes = node.getElementsByTagName(tag_name)
    if not child_nodes:
      continue
    # If we have a validating XML parser, we shouldn't have to check this.
    if len(child_nodes) > 1:
      errors.append(MultipleNodeError(tag_name, child_nodes))
      continue
    text = GetText(child_nodes[0].childNodes)
    if not text:
      continue
    if name_map and tag_name in name_map:
      attr_name = name_map[tag_name]
    else:
      attr_name = tag_name
    logger.debug('Setting "%s" to "%s"', attr_name, text)
    try:
      setattr(model, attr_name, converter(text))
    except (DeadlineExceededError, AssertionError):
      # These must never be downgraded to a recoverable error.
      raise
    except Exception:
      # sys.exc_info() instead of "except Exception, e", which is a syntax
      # error on Python 3, or "except ... as e", which needs Python 2.6+.
      errors.append(CopyNodeError(attr_name, text, sys.exc_info()[1]))
  return errors
def CopyNodeLists(model, node, names, converter, name_map=None):
  """Copies multiple XML nodes into model list attributes.

  For each name, every matching element with non-empty text (see GetText) has
  its converted text appended to the model's list attribute.  Elements are
  located with node.getElementsByTagName, which in standard DOM
  implementations matches descendants at any depth, not only direct children.

  Args:
    model: db.Model object
    node: xml.dom.Node object
    names: List of child node / model attribute names (strings)
    converter: Function which converts the text from the XML into an
        appropriate object for the data model.
    name_map: Dict to translate child node names to model attribute names
        (str:str)

  Returns:
    List of RecoverableError objects, possibly empty, representing problems
    converting the text into the appropriate value object.

  Raises:
    DeadlineExceededError, AssertionError: Re-raised unchanged if the
        conversion or append raises them.
  """
  # Imported here so that this function is self-contained.
  import sys

  errors = []
  for tag_name in names:
    # The target attribute depends only on the tag name, so resolve it once
    # per tag rather than once per matching node.
    if name_map and tag_name in name_map:
      attr_name = name_map[tag_name]
    else:
      attr_name = tag_name
    for child_node in node.getElementsByTagName(tag_name):
      text = GetText(child_node.childNodes)
      if not text:
        continue
      logger.debug('Appending "%s" with "%s"', attr_name, text)
      try:
        getattr(model, attr_name).append(converter(text))
      except (DeadlineExceededError, AssertionError):
        # These must never be downgraded to a recoverable error.
        raise
      except Exception:
        # sys.exc_info() instead of "except Exception, e", which is a syntax
        # error on Python 3, or "except ... as e", which needs Python 2.6+.
        errors.append(CopyNodeError(attr_name, text, sys.exc_info()[1]))
  return errors
# Encoding used by ParseString to decode non-unicode strings into unicode
# objects.
# TODO(Matt Frantz): Decide if this is the right encoding.
_DEFAULT_ENCODING = 'utf8'
def ParseString(xml_text):
  """Converts XML text into a unicode object for a db.StringProperty attribute.

  The default encoding for the db module is 'ascii', but we may receive other
  encodings from XML, which could contain non-ASCII characters.  This routine
  decodes str arguments using _DEFAULT_ENCODING ('utf8').

  Args:
    xml_text: From GetText (str or unicode)

  Returns:
    unicode object.  A value that is already unicode (including an instance of
    a unicode subclass) is returned unchanged.

  Raises:
    UnicodeDecodeError: If a str argument is not valid in _DEFAULT_ENCODING.
  """
  # isinstance, not an exact type comparison: an instance of a unicode
  # subclass would otherwise reach unicode(obj, encoding=...), which raises
  # TypeError because unicode objects cannot be decoded.
  if isinstance(xml_text, unicode):
    return xml_text
  return unicode(xml_text, encoding=_DEFAULT_ENCODING)
def CopyStringNodes(model, node, names, name_map=None):
  """Copies XML nodes into string attributes of the same respective names.

  Delegates to CopyNodes with ParseString as the converter.

  Args:
    model: db.Model object
    node: xml.dom.Node object
    names: List of child node / model attribute names (strings) that have
        StringProperty type.
    name_map: Dict to translate child node names to model attribute names
        (str:str)

  Returns:
    List of RecoverableError objects, possibly empty, representing multiple
    nodes that are found for any of the names, or if there is a problem
    converting the text into the appropriate value object.
  """
  return CopyNodes(model, node, names,
                   converter=ParseString, name_map=name_map)
def CopyStringNodeLists(model, node, names, name_map=None):
  """Copies multiple XML nodes into model list attributes as strings.

  Delegates to CopyNodeLists with ParseString as the converter.

  Args:
    model: db.Model object
    node: xml.dom.Node object
    names: List of child node / model attribute names (strings) that have
        StringProperty type.
    name_map: Dict to translate child node names to model attribute names
        (str:str)

  Returns:
    List of RecoverableError objects, possibly empty, representing problems
    converting the text into the appropriate value object.
  """
  return CopyNodeLists(model, node, names,
                       converter=ParseString, name_map=name_map)
def ParseText(xml_text):
  """Converts XML text into a db.Text object.

  The text is first normalized to unicode by ParseString.

  Args:
    xml_text: From GetText (str or unicode)

  Returns:
    db.Text object
  """
  unicode_text = ParseString(xml_text)
  return db.Text(unicode_text)
def CopyTextNodes(model, node, names, name_map=None):
  """Copies XML nodes into model text attributes of the same respective names.

  Delegates to CopyNodes with ParseText as the converter.

  Args:
    model: db.Model object
    node: xml.dom.Node object
    names: List of child node / model attribute names (strings) that have
        TextProperty type.
    name_map: Dict to translate child node names to model attribute names
        (str:str)

  Returns:
    List of RecoverableError objects, possibly empty, representing multiple
    nodes that are found for any of the names, or if there is a problem
    converting the text into the appropriate value object.
  """
  return CopyNodes(model, node, names,
                   converter=ParseText, name_map=name_map)
def CopyTextNodeLists(model, node, names, name_map=None):
  """Copies multiple XML nodes into model list attributes as db.Text values.

  Delegates to CopyNodeLists with ParseText as the converter.

  Args:
    model: db.Model object
    node: xml.dom.Node object
    names: List of child node / model attribute names (strings) that have
        TextProperty type.
    name_map: Dict to translate child node names to model attribute names
        (str:str)

  Returns:
    List of RecoverableError objects, possibly empty, representing problems
    converting the text into the appropriate value object.
  """
  return CopyNodeLists(model, node, names,
                       converter=ParseText, name_map=name_map)
# Hi-resolution timestamps are not officially supported by ISO 8601, but they
# appear in some CAPs, like those in the USGS volcano feed.  This regex will
# extract the conforming portion: group 1 is the "YYYYMMDDTHHMMSS" part that
# precedes the ".mmmZ" millisecond suffix.
# Raw string so that "\d" and "\." reach the regex engine verbatim instead of
# relying on Python passing unknown string escapes through.
HIRES_DATETIME = re.compile(r'^(\d{8}T\d{6})\.\d{3}Z$')
# Parse certain ISO 8601 timestamps that do not include delimiters between the
# components.  See below for TODO.
# Groups: year, month, day, hour, minute, second, then whatever follows the
# seconds (kept verbatim).
ISO_8601_WITHOUT_DELIMITERS = re.compile(
    r'^(\d{4})(\d\d)(\d\d)T(\d\d)(\d\d)(\d\d)(.*)')
def ParseDateTime(xml_text):
  """Converts XML ISO 8601 date/time representation into datetime.

  The text is first handed to iso8601.parse_date as is.  If that fails and the
  text looks like an ISO 8601 timestamp written without delimiters
  (e.g. "YYYYMMDDTHHMMSS..."), the delimiters are inserted and the result is
  parsed again.

  Args:
    xml_text: ISO 8601 representation (string)

  Returns:
    datetime.datetime object

  Raises:
    ValueError: If xml_text is rejected by iso8601.parse_date and does not
        match the delimiter-free form either.  Errors raised by
        iso8601.parse_date for the re-delimited text propagate unchanged.
  """
  # TODO(Matt Frantz): Figure out how to handle non-standard datetime formats.
  # Right now, we assume it is ISO 8601 compliant before trying other formats.
  try:
    return iso8601.parse_date(xml_text)
  except (TypeError, iso8601.ParseError):
    # TODO(Matt Frantz): When the iso8601 module supports all ISO 8601 formats,
    # we should just be able to simply drop the "without delimiters" portion.
    # But to workaround, we need to add delimiters.
    compact = ISO_8601_WITHOUT_DELIMITERS.match(xml_text)
    if not compact:
      raise ValueError('Invalid date-time representation: %r' % xml_text)
    delimited = '%s-%s-%sT%s:%s:%s%s' % compact.groups()
    logger.debug('ISO 8601 without delimiters: %r', delimited)
    return iso8601.parse_date(delimited)
def CopyDateTimeNodes(model, node, names, name_map=None):
  """Copies XML nodes with date-time data into model attributes by name.

  Delegates to CopyNodes with ParseDateTime as the converter.

  Args:
    model: db.Model object
    node: xml.dom.Node object
    names: List of child node / model attribute names (strings) that have
        DateTimeProperty type.
    name_map: Dict to translate child node names to model attribute names
        (str:str)

  Returns:
    List of RecoverableError objects, possibly empty, representing multiple
    nodes that are found for any of the names, or if there is a problem
    converting the text into the appropriate value object.
  """
  return CopyNodes(model, node, names,
                   converter=ParseDateTime, name_map=name_map)
def CopyIntegerNodes(model, node, names, name_map=None):
  """Copies XML nodes with integer data into model attributes by name.

  Delegates to CopyNodes with the built-in int as the converter.

  Args:
    model: db.Model object
    node: xml.dom.Node object
    names: List of child node / model attribute names (strings) that have
        IntegerProperty type.
    name_map: Dict to translate child node names to model attribute names
        (str:str)

  Returns:
    List of RecoverableError objects, possibly empty, representing multiple
    nodes that are found for any of the names, or if there is a problem
    converting the text into the appropriate value object.
  """
  return CopyNodes(model, node, names, converter=int, name_map=name_map)