Project

General

Profile

1
# XML parsing
2

    
3
import re
4
import xml.dom.minidom as minidom
5
import xml.parsers.expat as expat
6

    
7
import streams
8

    
9
def parse_str(str_):
    '''Parses a string containing one XML document.
    @param str_ The XML text, handed unchanged to minidom.parseString()
    @return The parsed document's root element
    '''
    document = minidom.parseString(str_)
    return document.documentElement
10

    
11
class ConsecXmlInputStream(streams.TracedStream):
    '''Wraps an input stream, inserting EOF after each consecutive XML document.
    
    Document boundaries are detected textually, one line at a time: a line
    starting with '<name' (but not '<?') opens a document, and a line whose
    stripped text ends with '</name>' closes it. After the closing line, the
    next read returns '' once, so that a parser sees the end of that document.
    
    NOTE(review): presumably streams.TracedStream calls the trace function on
    each line it reads from the underlying stream -- confirm in streams.
    '''
    def __init__(self, stream):
        '''
        @param stream The underlying stream, passed on to streams.TracedStream
        '''
        self.eof = False # when set, the next readline() returns '' once
        self.root_close_tag = None # current root's '</name>', or None if
            # not inside a document
        
        def trace(line):
            if self.root_close_tag is None: # before XML document
                match = re.match(r'^<(?!\?)([^\s>]+)', line)
                if not match: return # not a root open tag (e.g. '<?xml ...?>')
                self.root_close_tag = '</'+match.group(1)+'>'
            # Checked for the opening line as well, so that a document whose
            # root opens and closes on a single line is still terminated
            if line.rstrip().endswith(self.root_close_tag):
                self.root_close_tag = None # now outside XML document
                self.eof = True # next read will produce EOF
        
        streams.TracedStream.__init__(self, trace, stream)
    
    def readline(self):
        '''Returns the next line, or '' once after the end of each document'''
        if self.eof:
            self.eof = False
            return '' # don't read from underlying stream
        else: return streams.TracedStream.readline(self)
    
    def read(self, n):
        '''Returns one line regardless of the requested size n, so that every
        read passes through the document-boundary check in readline()
        '''
        return self.readline() # forward all reads to readline()
36

    
37
def parse_next(stream):
    '''Parses the next document in a consecutive sequence of XML documents.
    @param stream The stream to read from; wrapped in a ConsecXmlInputStream
        so that the parser stops at the end of the next document
    @return The next document's root, or None if no more documents available
    @raise expat.ExpatError On any parse error other than "no element found"
    '''
    try: return minidom.parse(ConsecXmlInputStream(stream)).documentElement
    except expat.ExpatError as e: # "as" form is valid in Python 2.6+ and 3.x
        if str(e).startswith('no element found:'): return None # no more docs
        raise # bare raise preserves the original traceback
45

    
46
def docs_iter(stream):
    '''Iterates over the consecutive XML documents in a stream.
    @param stream The stream handed to parse_next() for each document
    @return An iterator over the documents' roots, which stops the first time
        parse_next() returns None
    '''
    def next_doc():
        return parse_next(stream)
    
    return iter(next_doc, None)
(23-23/25)