1 |
1708
|
aaronmk
|
# XML parsing
|
2 |
|
|
|
3 |
1712
|
aaronmk
|
import re
|
4 |
1708
|
aaronmk
|
import xml.dom.minidom as minidom
|
5 |
1712
|
aaronmk
|
import xml.parsers.expat as expat
|
6 |
1708
|
aaronmk
|
|
7 |
1712
|
aaronmk
|
import streams
|
8 |
|
|
|
9 |
1708
|
aaronmk
|
def parse_str(str_):
    '''Parses a string containing one XML document.
    @param str_ The text of the XML document
    @return The root element of the parsed document
    '''
    document = minidom.parseString(str_)
    return document.documentElement
|
10 |
1712
|
aaronmk
|
|
11 |
|
|
class ConsecXmlInputStream(streams.TracedStream):
    '''Wraps an input stream, inserting EOF after each consecutive XML document

    Document boundaries are detected heuristically from the traced text: the
    first line starting with an element open tag names the root element, and
    the first line ending with the matching close tag ends the document.
    '''
    def __init__(self, stream):
        '''@param stream The underlying input stream to wrap'''
        self.eof = False # whether the next readline() should produce EOF
        self.root_close_tag = None # root's close tag while inside a document

        def trace(line):
            # NOTE(review): presumably invoked by streams.TracedStream with
            # each line read from the underlying stream -- confirm
            if self.root_close_tag is None: # before XML document
                # Skip `<?...?>` declarations as well as `<!...>` constructs
                # (DOCTYPE, comments), which are not the root element
                match = re.match(r'^<(?![?!])([^\s>]+)', line)
                if match is None: return # root element not yet seen
                self.root_close_tag = '</'+match.group(1)+'>'
            # inside XML document; also checked on the root's opening line so
            # that a document written on a single line is terminated
            if line.rstrip().endswith(self.root_close_tag):
                self.root_close_tag = None # now outside XML document
                self.eof = True # next read will produce EOF

        streams.TracedStream.__init__(self, trace, stream)

    def readline(self):
        '''Reads the next line, or '' (EOF) once after each document's end.
        @return The next line, as returned by streams.TracedStream.readline()
        '''
        if self.eof:
            self.eof = False
            return '' # don't read from underlying stream
        else: return streams.TracedStream.readline(self)

    def read(self, n):
        '''Forwards all reads to readline(); the requested size n is ignored'''
        return self.readline()
|
36 |
|
|
|
37 |
|
|
def parse_next(stream):
    '''Parses the next document in a consecutive sequence of XML documents.
    @param stream The input stream positioned at the start of the next document
    @return The next document's root, or None if no more documents available
    @throws expat.ExpatError if the document is malformed
    '''
    try: return minidom.parse(ConsecXmlInputStream(stream)).documentElement
    except expat.ExpatError as e:
        # expat reports an input without any element as "no element found",
        # which here means the stream has been exhausted
        if str(e).startswith('no element found:'): return None # no more docs
        else: raise # bare raise keeps the original traceback
|
45 |
|
|
|
46 |
|
|
def docs_iter(stream):
    '''Iterates over the consecutive XML documents in a stream.
    @param stream The input stream containing the documents
    @return An iterator that calls parse_next(stream) for each item and stops
        when it returns None
    '''
    def next_doc(): return parse_next(stream)
    return iter(next_doc, None)
|