1 |
1708
|
aaronmk
|
# XML parsing
|
2 |
|
|
|
3 |
1712
|
aaronmk
|
import re
|
4 |
1708
|
aaronmk
|
import xml.dom.minidom as minidom
|
5 |
1712
|
aaronmk
|
import xml.parsers.expat as expat
|
6 |
1708
|
aaronmk
|
|
7 |
1757
|
aaronmk
|
import exc
|
8 |
1712
|
aaronmk
|
import streams
|
9 |
1763
|
aaronmk
|
import strings
|
10 |
1712
|
aaronmk
|
|
11 |
1708
|
aaronmk
|
def parse_str(str_):
    '''Parses an XML document held in a string.
    @param str_ The XML text, handed unchanged to minidom.parseString()
    @return The root element of the parsed document
    '''
    doc = minidom.parseString(str_)
    return doc.documentElement
|
12 |
1712
|
aaronmk
|
|
13 |
|
|
class ConsecXmlInputStream(streams.TracedStream):
    '''Wraps an input stream, inserting EOF after each consecutive XML document
    
    The stream watches the lines passed to its trace callback. The first line
    that starts with an element open tag defines the root element; a later (or
    the same) line that ends with the matching close tag marks the end of the
    document, and the following readline() returns '' instead of reading from
    the underlying stream.
    
    Attributes:
    eof: True when the next readline() should return '' (a synthetic EOF)
    root_close_tag: The close tag of the current document's root element, or
        None when positioned between documents
    
    NOTE(review): assumes streams.TracedStream calls trace() with each line it
    reads from the wrapped stream -- confirm against streams.py.
    '''
    def __init__(self, stream):
        '''@param stream The underlying input stream to wrap'''
        self.eof = False
        self.root_close_tag = None
        
        def trace(line):
            if self.root_close_tag is None: # before XML document
                # Skip processing instructions (<?...) and markup declarations
                # or comments (<!DOCTYPE ...>, <!-- ... -->): neither is the
                # root element, and treating one as the root would produce a
                # close tag that never occurs, so EOF would never be inserted
                match = re.match(r'^<(?![?!])([^\s>]+)', line)
                if not match: return # still before the root element
                self.root_close_tag = '</'+match.group(1)+'>'
            
            # Inside XML document. This is also checked for the line that
            # opened the root, so that a document which fits on a single line
            # is terminated as well.
            if line.rstrip().endswith(self.root_close_tag):
                self.root_close_tag = None # now outside XML document
                self.eof = True # next read will produce EOF
        
        streams.TracedStream.__init__(self, trace, stream)
    
    def readline(self):
        '''Returns the next line, or '' once after the end of each document.
        The synthetic EOF is one-shot: the flag is cleared so that the next call
        reads from the underlying stream again.
        '''
        if self.eof:
            self.eof = False
            return '' # don't read from underlying stream
        return streams.TracedStream.readline(self)
|
36 |
|
|
|
37 |
1757
|
aaronmk
|
def parse_next(stream, on_error=exc.raise_):
    '''Parses the next document in a consecutive sequence of XML documents.
    
    The stream is wrapped in a ConsecXmlInputStream (so the parser sees EOF at
    the end of each document) and filtered through strings.strip_ctrl before
    being handed to minidom.parse().
    
    @param stream The input stream containing the XML documents
    @param on_error Called with the ExpatError when a document fails to parse
        for any reason other than running out of input. The default,
        exc.raise_, presumably re-raises it -- confirm in exc.py.
    @return The next document's root, or None if no more documents available.
        None is also returned when on_error() returns normally after a parse
        error. NOTE(review): docs_iter() uses None as its stop sentinel, so a
        non-raising on_error ends that iteration -- confirm this is intended.
    '''
    try:
        return minidom.parse(streams.FilterStream(strings.strip_ctrl,
            ConsecXmlInputStream(stream))).documentElement
    # `as` form works on Python 2.6+ and 3; the `, e` form is Python 2 only
    except expat.ExpatError as e:
        # Save in case another exception raised, overwriting sys.exc_info()
        exc.add_traceback(e)
        
        # expat reports an empty input this way, i.e. the sequence is exhausted
        if str(e).startswith('no element found:'): return None # no more docs
        
        # parser error, which corrupts the rest of the current document
        on_error(e)
        streams.consume(stream) # advance to next XML document
        return None
|
50 |
1712
|
aaronmk
|
|
51 |
1757
|
aaronmk
|
def docs_iter(stream, *args, **kw_args):
    '''Iterates over the roots of the consecutive XML documents in a stream.
    @param stream The input stream, passed to parse_next() on every step
    @param args, kw_args Extra arguments forwarded to parse_next()
    @return An iterator that calls parse_next() repeatedly and stops the first
        time it returns None
    '''
    def next_doc():
        return parse_next(stream, *args, **kw_args)
    
    return iter(next_doc, None)
|