1
|
# XML parsing
|
2
|
|
3
|
import re
|
4
|
import xml.dom.minidom as minidom
|
5
|
import xml.parsers.expat as expat
|
6
|
|
7
|
import streams
|
8
|
|
9
|
def parse_str(str_):
    '''Parses an XML document contained in a string.
    @param str_ The XML text to parse
    @return The parsed document's root element
    '''
    document = minidom.parseString(str_)
    return document.documentElement
|
10
|
|
11
|
class ConsecXmlInputStream(streams.TracedStream):
    '''Wraps an input stream, inserting EOF after each consecutive XML document

    Document boundaries are detected heuristically, one line at a time: the
    first line starting with an element tag names the root element, and the
    document ends on the first line whose text (ignoring trailing whitespace)
    ends with the matching close tag.
    '''
    def __init__(self, stream):
        '''
        @param stream The underlying input stream holding the XML documents
        '''
        self.eof = False # whether the next read should return a fake EOF
        self.root_close_tag = None # root's close tag, or None if outside doc

        # NOTE(review): presumably streams.TracedStream calls this with each
        # line it reads from the underlying stream -- confirm against streams
        def trace(line):
            if self.root_close_tag is None: # before XML document
                # Skip the XML declaration and processing instructions (<?...)
                # as well as comments and DOCTYPE declarations (<!...), none
                # of which are the root element
                match = re.match(r'^<(?![?!])([^\s>]+)', line)
                if not match: return # still before XML document
                self.root_close_tag = '</'+match.group(1)+'>'
            # inside XML document; also checked on the opening line itself so
            # that a document written on a single line is terminated
            if line.rstrip().endswith(self.root_close_tag):
                self.root_close_tag = None # now outside XML document
                self.eof = True # next read will produce EOF

        streams.TracedStream.__init__(self, trace, stream)

    def readline(self):
        '''Reads the next line, or a one-time EOF right after a document.
        @return The next line, or '' once after each document's last line
        '''
        if self.eof:
            self.eof = False # only report the inserted EOF once
            return '' # don't read from underlying stream
        else: return streams.TracedStream.readline(self)

    def read(self, n):
        '''Reads one line regardless of the requested size.
        @param n Requested size; ignored so that every read is line-based
        @return The result of readline()
        '''
        return self.readline() # forward all reads to readline()
|
36
|
|
37
|
def parse_next(stream):
    '''Parses the next document in a consecutive sequence of XML documents.
    @param stream The input stream to read the next document from
    @return The next document's root, or None if no more documents available
    @raise expat.ExpatError If parsing fails for any reason other than expat
        finding no element in the remaining input
    '''
    try:
        return minidom.parse(ConsecXmlInputStream(stream)).documentElement
    except expat.ExpatError as e: # "as" form works on Python 2.6+ and 3.x
        # expat reports input without any element as "no element found: ..."
        if str(e).startswith('no element found:'): return None # no more docs
        raise # bare raise keeps the original traceback
|
45
|
|
46
|
def docs_iter(stream):
    '''Iterates over the consecutive XML documents in a stream.
    @param stream The input stream holding the documents
    @return An iterator over each document's root, which stops at the first
        parse_next() result equal to None
    '''
    def next_root(): return parse_next(stream)
    return iter(next_root, None)
|