Project

General

Profile

1
#!/usr/bin/env python
2
# A DiGIR client
3

    
4
import os
5
import os.path
6
import sys
7
import urllib
8
import urllib2
9
import xml.dom.minidom as minidom
10

    
11
# Add the sibling ../lib directory to the module search path before the imports
# that follow. The path is anchored on this file's absolute location: a bare
# os.path.dirname(__file__) is '' when the script is started as `python <name>`
# from its own directory, which would turn the entry into "/../lib".
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..',
    'lib'))
12

    
13
import dates
14
import http
15
import opts
16
import streams
17
import util
18
import xml_dom
19
import xpath
20

    
21
# Config
# Seconds; handed to streams.TimeoutInputStream for each response that is read
timeout = 20
# Records asked for per request unless the chunk_size env var overrides it
default_chunk_size = 10000
24

    
25
# Schema location substituted for the [schema] placeholder in the request
schema = 'http://digir.net/schema/conceptual/darwin/full/2003/1.0/darwin2full.xsd'

# DiGIR search request. Bracketed placeholders are filled in by main():
# [source], [url], [resource] and [schema] once per run; [count], [start] and
# [time] once per request. The XML declaration must be the very first thing in
# the string, hence the backslash after the opening quotes.
request_xml_template = '''\
<?xml version="1.0" encoding="UTF-8"?>
<request
    xmlns="http://digir.net/schema/protocol/2003/1.0"
    xmlns:xsd="http://www.w3.org/2001/XMLSchema"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xmlns:digir="http://digir.net/schema/protocol/2003/1.0"
    xmlns:darwin="http://digir.net/schema/conceptual/darwin/2003/1.0"
    xmlns:dwc="http://digir.net/schema/conceptual/darwin/2003/1.0"
    xsi:schemaLocation="http://digir.net/schema/protocol/2003/1.0 
      http://digir.sourceforge.net/schema/protocol/2003/1.0/digir.xsd 
      http://digir.net/schema/conceptual/darwin/2003/1.0 
      http://digir.sourceforge.net/schema/conceptual/darwin/2003/1.0/darwin2.xsd">
    <header>
        <version>1.0</version>
        <sendTime>[time]</sendTime>
        <source>[source]</source>
        <destination resource="[resource]">[url]</destination>
        <type>search</type>
    </header>
    <search>
        <filter>
            <equals>
                <darwin:Kingdom>plantae</darwin:Kingdom>
            </equals>
        </filter>
        <records limit="[count]" start="[start]">
            <structure schemaLocation="[schema]"/>
        </records>
        <count>true</count>
    </search>
</request>
'''
59

    
60
# Opening and closing markers of the diagnostics element; main() passes them to
# streams.CaptureStream and then parses the captured text
diags_start = '<diagnostics>'
diags_end = '</diagnostics>'
62

    
63
class InputError(Exception):
    '''Raised when a response lacks a required diagnostic.'''
64

    
65
def main():
    '''Fetches search results chunk by chunk, copying each response to stdout.
    
    Progress and (optionally) debug messages go to stderr.
    
    Config is read from env vars via opts:
    - url: where each request is sent; also placed in the request's
      <destination> element (required)
    - resource: value of the <destination> resource attribute (required)
    - start: index of the first record to request (default 0)
    - n: total # records to retrieve (default: continue until the response
      reports no more records)
    - chunk_size: # records to ask for per request (default
      default_chunk_size)
    - debug: flag; when set, each request and each diagnostics block is also
      written to stderr
    
    Raises SystemExit with a usage message if url or resource is not set.
    Raises InputError if a response has no RECORD_COUNT diagnostic.
    '''
    # Usage
    env_names = [] # passed to every opts lookup and to opts.env_usage()
    def usage_err():
        raise SystemExit('Usage: '+opts.env_usage(env_names, True)+' '
            +sys.argv[0]+' 2>>log')
    
    # Get config from env vars
    url = opts.get_env_var('url', None, env_names)
    resource = opts.get_env_var('resource', None, env_names)
    start = util.cast(int, opts.get_env_var('start', 0, env_names))
    count = util.cast(int, opts.get_env_var('n', None, env_names))
    chunk_size = util.cast(int, opts.get_env_var('chunk_size',
        default_chunk_size, env_names))
    debug = opts.env_flag('debug', False, env_names)
    if url is None or resource is None: usage_err()
    
    # Logging
    def log(msg, line_ending='\n'): sys.stderr.write(msg+line_ending)
    def debug_log(str_, label=None):
        if debug:
            if label is not None: sys.stderr.write(label+':\n')
            sys.stderr.write(str_+'\n')
    
    # Request XML
    # Use the absolute location of this file so that the helper is found even
    # when __file__ has no directory component
    self_dir = os.path.dirname(os.path.abspath(__file__))
    pipe = os.popen(os.path.join(self_dir, 'local_ip'))
    try: source = pipe.read().strip()
    finally: pipe.close()
    # Fill in the placeholders that stay the same for every request
    this_request_xml_template = (request_xml_template
        .replace('[source]', source)
        .replace('[url]', url)
        .replace('[resource]', resource)
        .replace('[schema]', schema)
        )
    
    # Stats
    total = 0
    def print_status(line_ending='\n'):
        log('Processed '+str(total)+' record(s)', line_ending)
    match_ct = None
    
    # Retrieve data
    while count is None or total < count:
        # Adjust chunk size if last chunk
        this_count = chunk_size
        if count is not None: this_count = min(this_count, count - total)
        
        # Request XML
        send_time = dates.strftime('%Y-%m-%d %H:%M:%S %Z', dates.now())
        request_xml = (this_request_xml_template
            .replace('[count]', str(this_count))
            .replace('[start]', str(start))
            .replace('[time]', send_time)
            )
        debug_log(request_xml, 'request')
        
        # Send request
        this_url = url+'?'+urllib.urlencode({'request': request_xml})
        stream = streams.CaptureStream(streams.TimeoutInputStream(
            urllib2.urlopen(this_url), timeout), diags_start, diags_end)
        
        # Retrieve response
        try:
            streams.copy(stream, sys.stdout)
            # Make sure output ends in a newline so that consecutive XML
            # documents are on different lines
            sys.stdout.write('\n')
        finally: stream.close() # release the connection even if copying fails
        
        # Parse diagnostics
        diags_str = stream.match
        debug_log(diags_str, 'diagnostics')
        diags = xml_dom.parse_str(diags_str)
        def get_diag(name):
            return xpath.get_value(diags, 'diagnostic[@code='+name+']')
        
        # Process match count
        this_match_ct = util.cast(int, get_diag('MATCH_COUNT'))
        if this_match_ct != match_ct: # first or updated match count
            match_ct = this_match_ct
            log('Found '+str(match_ct)+' record(s)')
        
        # Process record count
        this_ct = util.cast(int, get_diag('RECORD_COUNT'))
        if this_ct is None: raise InputError('Missing RECORD_COUNT diagnostic')
        total += this_ct
        start += this_ct # advance start to fetch next set
        print_status('\r') # CR at end so next print overwrites msg
        
        # Decide if done
        if this_ct == 0 or get_diag('END_OF_RECORDS') == 'true': break
    
    print_status()
158

    
159
# Entry point. The call is unconditional, so it runs whenever this file is
# loaded, whether executed directly or imported.
main()
(7-7/40)