Project

General

Profile

1
#!/usr/bin/env python
2
# A DiGIR client
3

    
4
import os
5
import os.path
6
import sys
7
import urllib
8
import urllib2
9
import xml.dom.minidom as minidom
10

    
11
sys.path.append(os.path.dirname(__file__)+"/../lib")
12

    
13
import dates
14
import http
15
import opts
16
import profiling
17
import streams
18
import util
19
import xml_parse
20
import xpath
21

    
22
# Config
23
timeout = 20 # sec
24
default_chunk_size = 10000 # records
25

    
26
schema = 'http://digir.net/schema/conceptual/darwin/full/2003/1.0/darwin2full.xsd'
27
request_xml_template = '''\
28
<?xml version="1.0" encoding="UTF-8"?>
29
<request
30
    xmlns="http://digir.net/schema/protocol/2003/1.0"
31
    xmlns:xsd="http://www.w3.org/2001/XMLSchema"
32
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
33
    xmlns:digir="http://digir.net/schema/protocol/2003/1.0"
34
    xmlns:darwin="http://digir.net/schema/conceptual/darwin/2003/1.0"
35
    xmlns:dwc="http://digir.net/schema/conceptual/darwin/2003/1.0"
36
    xsi:schemaLocation="http://digir.net/schema/protocol/2003/1.0 
37
      http://digir.sourceforge.net/schema/protocol/2003/1.0/digir.xsd 
38
      http://digir.net/schema/conceptual/darwin/2003/1.0 
39
      http://digir.sourceforge.net/schema/conceptual/darwin/2003/1.0/darwin2.xsd">
40
    <header>
41
        <version>1.0</version>
42
        <sendTime>[time]</sendTime>
43
        <source>[source]</source>
44
        <destination resource="[resource]">[url]</destination>
45
        <type>search</type>
46
    </header>
47
    <search>
48
        <filter>
49
            <equals>
50
                <darwin:Kingdom>plantae</darwin:Kingdom>
51
            </equals>
52
        </filter>
53
        <records limit="[count]" start="[start]">
54
            <structure schemaLocation="[schema]"/>
55
        </records>
56
        <count>true</count>
57
    </search>
58
</request>
59
'''
60

    
61
diags_start = '<diagnostics>'
62
diags_end = '</diagnostics>'
63

    
64
class InputError(Exception): pass
65

    
66
def main():
67
    # Usage
68
    env_names = []
69
    def usage_err():
70
        raise SystemExit('Usage: '+opts.env_usage(env_names, True)+' '
71
            +sys.argv[0]+' 2>>log')
72
    
73
    # Get config from env vars
74
    url = opts.get_env_var('url', None, env_names)
75
    resource = opts.get_env_var('resource', None, env_names)
76
    start = util.cast(int, opts.get_env_var('start', 0, env_names))
77
    count = util.cast(int, opts.get_env_var('n', None, env_names))
78
    chunk_size = util.cast(int, opts.get_env_var('chunk_size',
79
        default_chunk_size, env_names))
80
    debug = opts.env_flag('debug', False, env_names)
81
    if url == None or resource == None: usage_err()
82
    
83
    # Logging
84
    def clear_line(): sys.stderr.write('\n')
85
    log_indent = 0
86
    def log(msg, line_ending='\n'): sys.stderr.write(msg+line_ending)
87
    def debug_log(str_, label=None):
88
        if debug:
89
            if label != None: sys.stderr.write(label+':\n')
90
            sys.stderr.write(str_+'\n')
91
    
92
    # Request XML
93
    self_dir = os.path.dirname(__file__)
94
    source = os.popen(self_dir+"/local_ip").read().strip()
95
    this_request_xml_template = (request_xml_template
96
        .replace('[source]', source)
97
        .replace('[url]', url)
98
        .replace('[resource]', resource)
99
        .replace('[schema]', schema)
100
        )
101
    
102
    # Stats
103
    total = 0
104
    def print_status(line_ending='\n'):
105
        log('Processed '+str(total)+' record(s)', line_ending)
106
    match_ct = None
107
    
108
    profiler = profiling.ItersProfiler(start_now=True, iter_text='record')
109
    
110
    # Retrieve data
111
    while count == None or total < count:
112
        # Adjust chunk size if last chunk
113
        this_count = chunk_size
114
        if count != None: this_count = min(this_count, count - total)
115
        
116
        # Request XML
117
        time = dates.strftime('%Y-%m-%d %H:%M:%S %Z', dates.now())
118
        request_xml = (this_request_xml_template
119
            .replace('[count]', str(this_count))
120
            .replace('[start]', str(start))
121
            .replace('[time]', time)
122
            )
123
        debug_log(request_xml, 'request')
124
        
125
        # Send request
126
        this_url = url+'?'+urllib.urlencode({'request': request_xml})
127
        stream = streams.CaptureStream(streams.TimeoutInputStream(
128
            urllib2.urlopen(this_url), timeout), diags_start, diags_end)
129
        
130
        # Retrieve response
131
        streams.copy(stream, sys.stdout)
132
        # Make sure output ends in a newline so that consecutive XML documents
133
        # are on different lines
134
        sys.stdout.write('\n')
135
        stream.close()
136
        
137
        # Parse diagnostics
138
        diags_str = stream.matches[0]
139
        debug_log(diags_str, 'diagnostics')
140
        diags = xml_parse.parse_str(diags_str)
141
        def get_diag(name):
142
            return xpath.get_value(diags, 'diagnostic[@code='+name+']')
143
        
144
        # Process match count
145
        this_match_ct = util.cast(int, get_diag('MATCH_COUNT'))
146
        if this_match_ct != match_ct: # first or updated match count
147
            match_ct = this_match_ct
148
            log('Found '+str(match_ct)+' record(s)')
149
        
150
        # Process record count
151
        this_ct = util.cast(int, get_diag('RECORD_COUNT'))
152
        if this_ct == None: raise InputError('Missing RECORD_COUNT diagnostic')
153
        total += this_ct
154
        start += this_ct # advance start to fetch next set
155
        print_status('\r') # CR at end so next print overwrites msg
156
        
157
        # Decide if done
158
        if this_ct == 0 or get_diag('END_OF_RECORDS') == 'true': break
159
    
160
    print_status()
161
    profiler.stop(total)
162
    log(profiler.msg())
163

    
164
main()
(15-15/82)