Project

General

Profile

1
# XML parsing
2

    
3
import re
4
import xml.dom.minidom as minidom
5
import xml.parsers.expat as expat
6

    
7
import exc
8
import streams
9

    
10
def parse_str(str_):
    '''Parses a string containing a single XML document.
    @param str_ The XML text
    @return The document's root element
    '''
    document = minidom.parseString(str_)
    return document.documentElement
11

    
12
class ConsecXmlInputStream(streams.TracedStream):
    '''Wraps an input stream, inserting EOF after each consecutive XML document.
    
    The root element is taken to be the first line that starts with an element
    open tag. The document ends at the first line (possibly that same line)
    that ends with the matching close tag.
    NOTE(review): assumes streams.TracedStream calls the trace function with
    each line it reads -- confirm against streams.TracedStream.
    '''
    def __init__(self, stream):
        self.eof = False # whether the next readline() should return a fake EOF
        self.root_close_tag = None # awaited close tag; None when outside a doc
        
        def trace(line):
            if self.root_close_tag is None: # before XML document
                # Skip <?...?> declarations as well as <!...> comments and
                # DOCTYPEs: they have no close tag, so treating one as the root
                # would mean the end of the document is never detected
                match = re.match(r'^<(?![?!])([^\s>]+)', line)
                if not match: return
                self.root_close_tag = '</'+match.group(1)+'>'
            # Inside XML document. Also checked on the line that opened the
            # root, so a document that fits on one line is terminated properly.
            if line.rstrip().endswith(self.root_close_tag):
                self.root_close_tag = None # now outside XML document
                self.eof = True # next read will produce EOF
        
        streams.TracedStream.__init__(self, trace, stream)
    
    def readline(self):
        '''Returns the next line, or '' once right after each document ends'''
        if self.eof:
            self.eof = False
            return '' # don't read from underlying stream
        else: return streams.TracedStream.readline(self)
    
    def read(self, n):
        '''Reads one line regardless of n, so the fake EOF is never skipped'''
        return self.readline() # forward all reads to readline()
37

    
38
def parse_next(stream, on_error=exc.raise_):
    '''Parses the next document in a consecutive sequence of XML documents.
    @param stream The input stream to read the next document from. It is wrapped
        in a ConsecXmlInputStream so parsing stops at the end of one document.
    @param on_error Called with the ExpatError for any parse failure other than
        running out of input. Defaults to exc.raise_ (presumably re-raises the
        error -- confirm against the exc module).
    @return The next document's root, or None if no more documents available.
        Also None if on_error() returns normally instead of raising.
    '''
    try: return minidom.parse(ConsecXmlInputStream(stream)).documentElement
    except expat.ExpatError as e: # `as` form works on Python 2.6+ and 3.x
        # Save in case another exception raised, overwriting sys.exc_info()
        exc.add_traceback(e)
        if str(e).startswith('no element found:'): return None # no more docs
        else: on_error(e)
48

    
49
def docs_iter(stream, *args, **kw_args):
    '''Iterates over the roots of the consecutive XML documents in a stream.
    @param stream The input stream, passed to parse_next() for each document
    @param args, kw_args Extra arguments forwarded to every parse_next() call
    @return An iterator that stops the first time parse_next() returns None
    '''
    def next_doc(): return parse_next(stream, *args, **kw_args)
    
    return iter(next_doc, None)
(23-23/25)