Project

General

Profile

1
# XML parsing
2

    
3
import re
4
import xml.dom.minidom as minidom
5
import xml.parsers.expat as expat
6

    
7
import exc
8
import streams
9
import strings
10

    
11
def parse_str(str_):
    '''Parses an XML document held in a string.
    @param str_ The XML text to parse
    @return The parsed document's root element
    '''
    document = minidom.parseString(str_)
    return document.documentElement
12

    
13
class ConsecXmlInputStream(streams.TracedStream):
    '''Wraps an input stream, inserting EOF after each consecutive XML document.
    
    A trace function is registered with streams.TracedStream (presumably it is
    invoked with each line read -- confirm against streams.TracedStream). The
    first line starting with an element open tag determines the root's close
    tag; the first line ending with that close tag ends the document, and the
    next readline() then returns '' instead of reading the underlying stream.
    
    NOTE: a self-closing root (e.g. <a/>) is not recognized as a complete
    document, because the captured tag name would include the trailing '/'.
    '''
    def __init__(self, stream):
        self.eof = False # whether the next readline() should report EOF
        self.root_close_tag = None # root's close tag while inside a document
        
        def trace(line):
            if self.root_close_tag is None: # before XML document
                # Skip <?xml ...?> declarations as well as <!DOCTYPE ...> and
                # <!-- ... --> markup, none of which can be the root element
                match = re.match(r'^<(?![?!])([^\s>]+)', line)
                if not match: return # still before the root element
                self.root_close_tag = '</'+match.group(1)+'>'
            # inside XML document; this check also runs on the opening line, so
            # that a document contained entirely on one line is terminated too
            if line.rstrip().endswith(self.root_close_tag):
                self.root_close_tag = None # now outside XML document
                self.eof = True # next read will produce EOF
        
        streams.TracedStream.__init__(self, trace, stream)
    
    def readline(self):
        '''Reads the next line, or returns '' once at the end of each document.
        @return '' if the previous line closed the root element (the flag is
            then cleared so that reading can resume), otherwise whatever
            streams.TracedStream.readline() returns
        '''
        if self.eof:
            self.eof = False
            return '' # don't read from underlying stream
        else: return streams.TracedStream.readline(self)
36

    
37
def parse_next(stream, on_error=exc.raise_):
    '''Parses the next document in a consecutive sequence of XML documents.
    @param stream The stream containing the consecutive XML documents
    @param on_error Called with the expat.ExpatError when a document can't be
        parsed. Defaults to exc.raise_.
    @return The next document's root, or None if no more documents available.
        None is also returned after a parse error for which on_error() returned
        normally, so callers that stop on None (such as docs_iter()) will stop
        at the first malformed document.
    '''
    try:
        # Each document is read through a ConsecXmlInputStream, which signals
        # EOF at the end of the document, and filtered by strings.strip_ctrl
        return minidom.parse(streams.FilterStream(strings.strip_ctrl,
            ConsecXmlInputStream(stream))).documentElement
    except expat.ExpatError as e: # `as` form is valid on Python 2.6+ and 3
        # Save in case another exception raised, overwriting sys.exc_info()
        exc.add_traceback(e)
        if str(e).startswith('no element found:'): return None # no more docs
        # parser error, which corrupts the rest of the current document
        on_error(e)
        # NOTE(review): this consumes the raw stream rather than the
        # ConsecXmlInputStream wrapper that marks the document's end; if
        # streams.consume() reads until EOF, this may skip all remaining
        # documents rather than just the current one -- confirm
        streams.consume(stream) # advance to next XML document
        return None
50

    
51
def docs_iter(stream, *args, **kw_args):
    '''Iterates over the consecutive XML documents in a stream.
    @param stream The stream passed to parse_next() on every step
    @param args Extra positional arguments forwarded to parse_next()
    @param kw_args Extra keyword arguments forwarded to parse_next()
    @return An iterator over the values returned by parse_next(), which ends
        the first time parse_next() returns None
    '''
    def next_doc(): return parse_next(stream, *args, **kw_args)
    
    return iter(next_doc, None) # None is the sentinel that ends iteration
(37-37/39)