1
|
# XML parsing
|
2
|
|
3
|
import re
|
4
|
import xml.dom.minidom as minidom
|
5
|
import xml.parsers.expat as expat
|
6
|
|
7
|
import exc
|
8
|
import streams
|
9
|
import strings
|
10
|
|
11
|
def parse_str(str_):
    '''Parses an XML document held in a string.
    @param str_ The text of the XML document
    @return The document's root element
    '''
    document = minidom.parseString(str_)
    return document.documentElement
|
12
|
|
13
|
class ConsecXmlInputStream(streams.TracedStream):
    '''Wraps an input stream, inserting EOF after each consecutive XML document

    The trace callback installed by the constructor watches each line for the
    root element's opening tag and, later, for the matching closing tag. Once
    the closing tag is seen, the next readline() returns '' (EOF) a single time
    without touching the underlying stream, so that a parser reading from this
    stream stops at the end of the current document.
    '''
    def __init__(self, stream):
        '''
        @param stream The underlying input stream, which is passed on to
            streams.TracedStream together with the trace callback
        '''
        self.eof = False # whether the next readline() should fake an EOF
        # Closing tag of the current document's root; None when outside a doc
        self.root_close_tag = None

        def trace(line):
            # NOTE(review): presumably TracedStream calls this with each line
            # it reads from the underlying stream -- confirm in streams
            if self.root_close_tag is None: # before XML document
                # A line starting with '<' but not '<?' (so not the XML
                # declaration) is taken to be the root element's opening tag
                match = re.match(r'^<(?!\?)([^\s>]+)', line)
                if match is None: return # still before the root element
                self.root_close_tag = '</'+match.group(1)+'>'
                # Fall through so that a root element which is opened and
                # closed on the same line also ends the document

            # inside XML document (possibly entered on this very line)
            if line.rstrip().endswith(self.root_close_tag):
                self.root_close_tag = None # now outside XML document
                self.eof = True # next read will produce EOF

        streams.TracedStream.__init__(self, trace, stream)

    def readline(self):
        '''Returns the next line, or '' once after each document's last line.
        @return '' if the end of a document was just reached (the flag is
            cleared and the underlying stream is not read), otherwise whatever
            streams.TracedStream.readline() returns
        '''
        if self.eof:
            self.eof = False
            return '' # don't read from underlying stream
        return streams.TracedStream.readline(self)
|
36
|
|
37
|
def parse_next(stream, on_error=exc.raise_):
    '''Parses the next document in a consecutive sequence of XML documents.
    @param stream The input stream containing the XML documents. It is wrapped
        in a new ConsecXmlInputStream, filtered through strings.strip_ctrl,
        on every call.
    @param on_error Called with the expat.ExpatError when the parser fails for
        any reason other than running out of input. Defaults to exc.raise_.
    @return The next document's root, or None if no more documents available.
        None is also returned after a parser error if on_error() returns
        instead of raising.
    '''
    filtered = streams.FilterStream(strings.strip_ctrl,
        ConsecXmlInputStream(stream))
    try: return minidom.parse(filtered).documentElement
    except expat.ExpatError as e:
        # Save in case another exception raised, overwriting sys.exc_info()
        exc.add_traceback(e)
        if str(e).startswith('no element found:'): return None # no more docs
        # parser error, which corrupts the rest of the current document
        on_error(e)
        # NOTE(review): this consumes the stream passed in by the caller, not
        # the ConsecXmlInputStream wrapper created above -- confirm that
        # streams.consume() stops at the end of the current document
        streams.consume(stream) # advance to next XML document
        # NOTE(review): None is also the "no more documents" value, so
        # docs_iter() stops at this point -- confirm that this is intended
        return None
|
50
|
|
51
|
def docs_iter(stream, *args, **kw_args):
    '''Iterates over the consecutive XML documents in a stream.
    @param stream The input stream, passed to parse_next() on every step
    @param args, kw_args Additional arguments forwarded to parse_next()
    @return An iterator which calls parse_next() for each item and stops the
        first time it returns None
    '''
    def next_doc(): return parse_next(stream, *args, **kw_args)
    return iter(next_doc, None)
|