Revision 1712
Added by Aaron Marcuse-Kubitza over 12 years ago
xml_parse.py | ||
---|---|---|
1 | 1 |
# XML parsing |
2 | 2 |
|
3 |
import re |
|
3 | 4 |
import xml.dom.minidom as minidom |
5 |
import xml.parsers.expat as expat |
|
4 | 6 |
|
7 |
import streams |
|
8 |
|
|
5 | 9 |
def parse_str(str_):
    '''Parses an XML document held in a string.
    @param str_ The text of a complete XML document
    @return The document's root element
    '''
    return minidom.parseString(str_).documentElement
10 |
|
|
11 |
class ConsecXmlInputStream(streams.TracedStream):
    '''Wraps an input stream, inserting EOF after each consecutive XML document

    Document boundaries are detected line by line. The first line that starts
    with an element open tag names the root element; the first line (that one
    or a later one) that ends with the matching close tag ends the document.
    The read after that returns '' without touching the underlying stream, so
    the consumer sees EOF at the document boundary.

    Known limitation: a self-closing root element (e.g. <root/>) is not
    recognized as the end of a document.
    '''
    def __init__(self, stream):
        '''
        @param stream The underlying input stream; it is handed to
            streams.TracedStream together with the tracing callback
        '''
        self.eof = False # True when the next read must report EOF
        self.root_close_tag = None # the root's close tag while inside a doc

        # NOTE(review): presumably TracedStream calls this with every line it
        # reads from the underlying stream -- confirm against streams.py
        def trace(line):
            if self.root_close_tag is None: # before XML document
                # Lines starting with '<?' (XML declaration, processing
                # instruction) or '<!' (DOCTYPE, comment) are not the root
                match = re.match(r'^<(?![?!])([^\s>]+)', line)
                if match is None: return # root element not reached yet
                self.root_close_tag = '</'+match.group(1)+'>'

            # inside XML document; also checked on the opening line so that a
            # document written on a single line is terminated as well
            if line.rstrip().endswith(self.root_close_tag):
                self.root_close_tag = None # now outside XML document
                self.eof = True # next read will produce EOF

        streams.TracedStream.__init__(self, trace, stream)

    def readline(self):
        '''Reads the next line, or reports EOF once after a document ends.
        @return '' if the previous line closed a document (the flag is then
            cleared), otherwise the result of TracedStream.readline()
        '''
        if self.eof:
            self.eof = False
            return '' # don't read from underlying stream
        else: return streams.TracedStream.readline(self)

    def read(self, n):
        '''Forwards all reads to readline(); n is ignored.
        @return The result of readline()
        '''
        return self.readline()
|
36 |
|
|
37 |
def parse_next(stream):
    '''Parses the next document in a consecutive sequence of XML documents.
    @param stream The input stream to read the next document from; it is
        wrapped in a ConsecXmlInputStream so parsing stops at the document's end
    @return The next document's root, or None if no more documents available
    @raise expat.ExpatError For any parse error other than "no element found"
    '''
    try: return minidom.parse(ConsecXmlInputStream(stream)).documentElement
    except expat.ExpatError as e:
        # expat reports input that contains no element as "no element found"
        if str(e).startswith('no element found:'): return None # no more docs
        raise # bare raise keeps the original traceback
|
45 |
|
|
46 |
def docs_iter(stream):
    '''Iterates over the consecutive XML documents in a stream.
    @param stream The input stream, passed to parse_next() for each document
    @return An iterator over the values returned by parse_next(stream), ending
        when parse_next() returns None
    '''
    # Two-argument iter(): call the function repeatedly until it returns None
    return iter(lambda: parse_next(stream), None)
Also available in: Unified diff
xml_parse.py: Added support for parsing consecutive XML documents in a stream