Project

General

Profile

« Previous | Next » 

Revision 274

Renamed util to bin

View differences:

util/env_password
1
#!/bin/bash
2
# Sets a password environment variable
3

  
4
test "$#" -ge 1 || \
5
{ echo "Usage: . $0 env_var_name [msg] (note the initial \".\")" >&2; exit 2;}
6

  
7
msg="$2"
8
test -n "$msg" || msg="$1"
9

  
10
if test -z "${!1+t}"; then # env var with name $1 is unset
11
    read -s -p "Enter $msg password: "; echo
12
    export "$1"="$REPLY"
13
fi
14 0

  
util/ch_root
1
#!/usr/bin/env python
2
# Transforms a map spreadsheet to use a different root
3

  
4
import csv
5
import os.path
6
import re
7
import sys
8

  
9
sys.path.append(os.path.dirname(__file__)+"/../lib")
10

  
11
import opts
12

  
13
def all_not_none(list_):
14
    return reduce(lambda a, b: a and b, map(lambda e: e != None, list_))
15

  
16
def main():
17
    # Get config from env vars
18
    config_names = ['in_root', 'out_root']
19
    env_names = []
20
    configs = map(lambda col: opts.get_env_vars(config_names, col, env_names),
21
        ['left', 'right'])
22
    if not all_not_none(configs): raise SystemExit(
23
        'Usage: '+opts.env_usage(env_names)+' '+sys.argv[0]+' <in_map >out_map')
24
    
25
    # Transform map
26
    reader = csv.reader(sys.stdin)
27
    writer = csv.writer(sys.stdout)
28
    cols = reader.next()
29
    for i in xrange(len(configs)):
30
        config = configs[i]
31
        label, sep, config['root'] = cols[i].partition(':')
32
        cols[i] = label+sep+config['out_root']
33
        config['in_root_re'] = r'^'+re.escape(config['in_root'])+r'\b'
34
    writer.writerow(cols)
35
    for row in reader:
36
        for i in xrange(len(configs)): row[i] = configs[i]['root']+row[i]
37
        def sub(i):
38
            row[i], n = re.subn(configs[i]['in_root_re'], r'', row[i])
39
            return n > 0
40
        if sub(0):
41
            if not sub(1): raise SystemExit('Map error: Root "'
42
                +configs[1]['in_root']+'" is not contained in output mapping: '
43
                +row[1])
44
            writer.writerow(row)
45

  
46
main()
47 0

  
util/map
1
#!/usr/bin/env python
2
# Maps one datasource to another, using a map spreadsheet if needed
3
# For outputting an XML file to a PostgreSQL database, use the general format of
4
# http://vegbank.org/vegdocs/xml/vegbank_example_ver1.0.2.xml
5

  
6
import os.path
7
import sys
8
import xml.dom.minidom
9

  
10
sys.path.append(os.path.dirname(__file__)+"/../lib")
11

  
12
import opts
13
from Parser import SyntaxException
14
import sql
15
import xml_dom
16
import xml_func
17

  
18
def metadata_value(name):
19
    if type(name) == str and name.startswith(':'): return name[1:]
20
    else: return None
21

  
22
def main():
23
    env_names = []
24
    def usage_err():
25
        raise SystemExit('Usage: '+opts.env_usage(env_names, True)
26
            +' [commit=1] '+sys.argv[0]+' [map_path] [<input] [>output]')
27
    limit = opts.get_env_var('n', None, env_names)
28
    if limit != None: limit = int(limit)
29
    commit = opts.env_flag('commit')
30
    
31
    # Get db config from env vars
32
    db_config_names = ['engine', 'host', 'user', 'password', 'database']
33
    def get_db_config(prefix):
34
        return opts.get_env_vars(db_config_names, prefix, env_names)
35
    in_db_config = get_db_config('in')
36
    out_db_config = get_db_config('out')
37
    in_is_db = 'engine' in in_db_config
38
    out_is_db = 'engine' in out_db_config
39
    
40
    # Parse args
41
    map_path = None
42
    try: _prog_name, map_path = sys.argv
43
    except ValueError:
44
        if in_is_db: usage_err()
45
    
46
    # Load map header
47
    in_is_xpaths = True
48
    if map_path != None:
49
        import copy
50
        import csv
51
        
52
        import xpath
53
        
54
        metadata = []
55
        mappings = []
56
        stream = open(map_path, 'rb')
57
        reader = csv.reader(stream)
58
        in_label, out_label = reader.next()[:2]
59
        def split_col_name(name):
60
            name, sep, root = name.partition(':')
61
            return name, sep != '', root
62
        in_label, in_is_xpaths, in_root = split_col_name(in_label)
63
        out_label, out_is_xpaths, out_root = split_col_name(out_label)
64
        assert out_is_xpaths # CSV output not supported yet
65
        has_types = out_root.startswith('/*s/') # outer elements are types
66
        for row in reader:
67
            in_, out = row[:2]
68
            if out != '':
69
                if out_is_xpaths: out = out_root+out
70
                mappings.append((in_, out))
71
        stream.close()
72
    in_is_xml = in_is_xpaths and not in_is_db
73
    
74
    # Input datasource to XML tree, mapping if needed
75
    if in_is_xml:
76
        doc0 = xml.dom.minidom.parse(sys.stdin)
77
    if map_path != None:
78
        doc1 = xml_dom.create_doc(out_label)
79
        root = doc1.documentElement
80
        if in_is_db:
81
            assert in_is_xpaths
82
            
83
            import db_xml
84
            
85
            in_root_xml = xpath.path2xml(in_root)
86
            for i, mapping in enumerate(mappings):
87
                in_, out = mapping
88
                if metadata_value(in_) == None:
89
                    mappings[i] = (xpath.path2xml(in_root+'/'+in_), out)
90
            
91
            in_db = sql.connect(in_db_config)
92
            in_pkeys = {}
93
            for row_idx, row in enumerate(sql.rows(db_xml.get(in_db,
94
                in_root_xml, in_pkeys, limit))):
95
                row_id = str(row_idx)
96
                pkey, = row
97
                for in_, out in mappings:
98
                    value = metadata_value(in_)
99
                    if value == None:
100
                        in_ = in_.cloneNode(True) # don't modify orig value!
101
                        xml_dom.set_id(xpath.get(in_, in_root), pkey)
102
                        value = sql.value_or_none(db_xml.get(in_db, in_,
103
                            in_pkeys))
104
                    if value != None:
105
                        xpath.put_obj(root, out, row_id, has_types, str(value))
106
            in_db.close()
107
        elif in_is_xml:
108
            row = xpath.get(doc0.documentElement, in_root)
109
            for row_idx, row in enumerate(xml_dom.NodeElemIter(row.parentNode)):
110
                if not (limit == None or row_idx < limit): break
111
                row_id = str(row_idx)
112
                for in_, out in mappings:
113
                    value = metadata_value(in_)
114
                    if value == None:
115
                        node = xpath.get(row, in_)
116
                        if node != None: value = xml_dom.value(node)
117
                    if value != None:
118
                        xpath.put_obj(root, out, row_id, has_types, value)
119
        else: # input is CSV
120
            map_ = dict(mappings)
121
            reader = csv.reader(sys.stdin)
122
            cols = reader.next()
123
            col_idxs = dict([(value, idx) for idx, value in enumerate(cols)])
124
            for i, mapping in enumerate(mappings):
125
                in_, out = mapping
126
                if metadata_value(in_) == None:
127
                    try: mappings[i] = (col_idxs[in_], out)
128
                    except KeyError: pass
129
            
130
            for row_idx, row in enumerate(reader):
131
                if not (limit == None or row_idx < limit): break
132
                row_id = str(row_idx)
133
                for in_, out in mappings:
134
                    value = metadata_value(in_)
135
                    if value == None:
136
                        value = row[in_]
137
                        if value == '': value = None
138
                    if value != None:
139
                        xpath.put_obj(root, out, row_id, has_types, value)
140
        xml_func.process(root)
141
    else: doc1 = doc0
142
    
143
    # Output XML tree
144
    if out_is_db:
145
        from psycopg2.extensions import ISOLATION_LEVEL_SERIALIZABLE
146
        import db_xml
147
        
148
        out_db = sql.connect(out_db_config)
149
        out_db.set_isolation_level(ISOLATION_LEVEL_SERIALIZABLE)
150
        try:
151
            row_ct_ref = [0]
152
            db_xml.xml2db(out_db, doc1.documentElement, row_ct_ref)
153
            print 'Inserted '+str(row_ct_ref[0])+' rows'
154
            if commit: out_db.commit()
155
        finally:
156
            out_db.rollback()
157
            out_db.close()
158
    else: xml_dom.writexml(sys.stdout, doc1) # output is XML
159

  
160
try: main()
161
except SyntaxException, ex: raise SystemExit(str(ex))
162 0

  
util/vegbien_dest
1
#!/bin/sh
2
# Requires var bien_password
3

  
4
export out_engine=PostgreSQL out_host=localhost out_user=bien \
5
out_password="$bien_password" out_database=vegbien
6 0

  
util/psql_vegbien
1
#!/bin/sh
2
# Accesses vegbank db
3

  
4
selfDir="$(dirname -- "$0")"
5

  
6
bien_password="$(cat "$selfDir/bien_password")"
7
. "$selfDir/vegbien_dest"
8
export PGHOST="$out_host" PGUSER="$out_user" PGPASSWORD="$out_password"
9

  
10
psql --dbname="$out_database" "$@"
11 0

  
util/join
1
#!/usr/bin/env python
2
# Inner-joins two map spreadsheets A->B and B->C to A->C
3

  
4
import csv
5
import sys
6

  
7
def main():
8
    try: _prog_name, map_1_path = sys.argv
9
    except ValueError:
10
        raise SystemExit('Usage: '+sys.argv[0]+' <map_0 map_1 [| '+sys.argv[0]
11
            +' map_2]... >joined_map')
12
    
13
    # Get map 1
14
    map_1 = {}
15
    stream = open(map_1_path, 'rb')
16
    reader = csv.reader(stream)
17
    map_1_in, map_1_out = reader.next()[:2]
18
    for row in reader:
19
        if row[1] != '': map_1[row[0]] = row[1]
20
    stream.close()
21
    
22
    # Join map 1 to map 0
23
    reader = csv.reader(sys.stdin)
24
    writer = csv.writer(sys.stdout)
25
    cols = reader.next()
26
    if not cols[1] == map_1_in: raise SystemExit('Map error: '
27
        'Map 0 output column name doesn\'t match map 1 input column name')
28
    cols[1] = map_1_out
29
    writer.writerow(cols)
30
    for row in reader:
31
        try: row[1] = map_1[row[1]]
32
        except KeyError: continue # skip row
33
        writer.writerow(row)
34

  
35
main()
36 0

  
util/simplify_xpath
1
#!/usr/bin/env python
2
# Removes duplication from XPath expressions
3
# Filters one XPath per line from stdin to stdout
4

  
5
import re
6
import sys
7

  
8
def main():
9
    while True:
10
        line = sys.stdin.readline()
11
        if line == '': break
12
        # Forward * abbrs
13
        line = re.sub(r'(?i)(\w+)(?=\w*(?:->/[^/]*)?/\1\b)', r'*', line)
14
        # Backward * abbrs
15
        line = re.sub(r'(?i)((\w+)->/[^/]*/[^/]*\[)\2', r'\1*', line)
16
        sys.stdout.write(line)
17

  
18
main()
19 0

  
util/review
1
#!/usr/bin/env python
2
# Converts a map spreadsheet to human-readable (but machine unusable) form
3
# Usage: self <in_map >out_ma
4

  
5
import csv
6
import re
7
import sys
8

  
9
def sub_nested(regex, repl, str_):
10
    while True:
11
        str_, n = re.subn(regex, repl, str_)
12
        if n == 0: return str_
13

  
14
def cleanup(xpath):
15
    truncated = False
16
    
17
    # Remove attrs
18
    xpath = sub_nested(r':\[[^\[\]]*?\]', r'', xpath)
19
    
20
    # Remove lookahead assertions
21
    xpath = sub_nested(r'\((/[^\)]*?)\)(?=/)', r'\1', xpath)
22
    
23
    # Remove pointers
24
    xpath, n = re.subn(r'^.*->', r'', xpath)
25
    if n > 0: truncated = True
26
    
27
    # Remove part of path before first key list, XML function, or path end
28
    # Leave enough to include the table of a user-defined value
29
    xpath, n = re.subn(r'^(?:/(?!_)[\w*]+)*(?=(?:/(?!_)[\w*]+){2}(?:\[|/_|$))',
30
        r'', xpath)
31
    # Prepend / to show truncation
32
    if n > 0: truncated = True
33
    
34
    # Remove backward (child-to-parent) pointer's target ID attr
35
    xpath = re.sub(r'\[[\w*]+\]|(?<=\[)[\w*]+,', r'', xpath)
36
    
37
    # Remove negative keys
38
    xpath = re.sub(r',?!(?:[\w*]+/)*@?[\w*]+', r'', xpath)
39
    
40
    # Remove path before key
41
    xpath = re.sub(r'(?:[\w*]+/)*(@?[\w*]+)(?==)', r'\1', xpath)
42
    
43
    # Prepend / to show truncation
44
    if truncated: xpath = '/'+xpath
45
    
46
    return xpath
47

  
48
def main():
49
    # Convert map
50
    reader = csv.reader(sys.stdin)
51
    writer = csv.writer(sys.stdout)
52
    writer.writerow(reader.next())
53
    for row in reader:
54
        for i in xrange(2): row[i] = cleanup(row[i])
55
        writer.writerow(row)
56

  
57
main()
58 0

  
util/fix_permissions
1
#!/bin/sh
2
# Sets correct permissions on shared bien files
3

  
4
selfDir="$(dirname -- "$0")" # dir of symlink $0, not this script itself
5
cd "$selfDir"
6

  
7
opts='--quiet --changes'
8
chgrp --recursive --no-dereference $opts bien .
9
find . -type d -exec chmod $opts g+s "{}" \; # only dirs should be setgid
10
for perm in r w x; do # extend user perms to group
11
    find . -not -type l -perm -u=$perm -exec chmod $opts g+$perm "{}" \;
12
        # don't chmod symlinks
13
done
14 0

  
util/repl
1
#!/usr/bin/env python
2
# Modifies a map spreadsheet A->B or any file using a replacements spreadsheet
3
# A->C or B->C
4

  
5
import csv
6
import re
7
import sys
8

  
9
def main():
10
    try: _prog_name, repl_path = sys.argv[:2]
11
    except ValueError:
12
        raise SystemExit('Usage: '+sys.argv[0]+' <map repl [col_num] [| '+
13
            sys.argv[0]+' repl_1 [col_num_1]]... >new_map')
14
    col_num = None
15
    try: col_num = sys.argv[2]
16
    except IndexError: pass
17
    if col_num != None: col_num = int(col_num) # 0-based
18
    
19
    # Get replacements
20
    repls = []
21
    stream = open(repl_path, 'rb')
22
    reader = csv.reader(stream)
23
    repl_in, repl_out = reader.next()[:2]
24
    for row in reader:
25
        in_, out = row[:2]
26
        if in_ != '':
27
            if re.match(r'^\w+$', in_): in_ = r'\b'+in_+r'(?=(?:(?i)_id)?\b)'
28
                # match whole word
29
            repls.append((in_, out))
30
    stream.close()
31
    def repl_all(str_):
32
        for repl, with_ in repls: str_ = re.sub(repl, with_, str_)
33
        return str_
34
    
35
    # Modify map or file
36
    if col_num != None:
37
        reader = csv.reader(sys.stdin)
38
        writer = csv.writer(sys.stdout)
39
        cols = reader.next()
40
        label, sep, root = cols[col_num].partition(':')
41
        if label != repl_in: raise SystemExit('Map error: Map column '+
42
            str(col_num)+' label "'+label+'" doesn\'t match replacements input '
43
            'column label "'+repl_in+'"')
44
        cols[col_num] = repl_out+sep+repl_all(root)
45
        writer.writerow(cols)
46
        for row in reader:
47
            row[col_num] = repl_all(row[col_num])
48
            writer.writerow(row)
49
    else:
50
        while True:
51
            line = sys.stdin.readline()
52
            if line == '': break
53
            sys.stdout.write(repl_all(line))
54

  
55
main()
56 0

  
util/join_sort
1
#!/bin/bash
2
# Inner-joins two map spreadsheets and sorts the output on the output column
3

  
4
selfDir="$(dirname -- "$0")"
5

  
6
"$selfDir/join" "$@"|"$selfDir/sort" 1 0
7 0

  
util/sort
1
#!/usr/bin/env python
2
# Sorts a spreadsheet based on a column
3

  
4
import csv
5
import sys
6

  
7
def by_idx(list_, idxes): return [list_[i] for i in idxes]
8

  
9
def main():
10
    col_nums = sys.argv[1:]
11
    if col_nums == []:
12
        raise SystemExit('Usage: '+sys.argv[0]+' col_num... <map >sorted_map')
13
    col_nums = map(int, col_nums) # 0-based
14
    
15
    # Sort map
16
    reader = csv.reader(sys.stdin)
17
    cols = reader.next()
18
    def get_sort_by(row): return by_idx(row, col_nums)
19
    rows = list(reader)
20
    rows.sort(lambda *rows: cmp(*map(get_sort_by, rows)))
21
    writer = csv.writer(sys.stdout)
22
    writer.writerow(cols)
23
    map(writer.writerow, rows)
24

  
25
main()
26 0

  
util/union
1
#!/usr/bin/env python
2
# Combines two map spreadsheets A0->B and A1->C to A->B, with A0 overwriting A1
3

  
4
import csv
5
import sys
6

  
7
def main():
8
    try: _prog_name, map_1_path = sys.argv
9
    except ValueError:
10
        raise SystemExit('Usage: '+sys.argv[0]+' <map_0 map_1 [| '+sys.argv[0]
11
            +' map_2]... >union_map')
12
    
13
    map_ = {}
14
    def add_map(reader):
15
        for row in reader:
16
            if row[1] != '': map_[row[0]] = row[1:]
17
    
18
    # Get map 1
19
    stream = open(map_1_path, 'rb')
20
    reader = csv.reader(stream)
21
    map_1_cols = reader.next()
22
    add_map(reader)
23
    stream.close()
24
    
25
    # Add map 0 to map 1, overwriting existing entries
26
    reader = csv.reader(sys.stdin)
27
    map_0_cols = reader.next()
28
    if not map_0_cols[0] == map_1_cols[0]: raise SystemExit('Map error: '
29
        'Map 1 column 0 name doesn\'t match map 0 column 0 name')
30
    add_map(reader)
31
    
32
    # Write combined map
33
    writer = csv.writer(sys.stdout)
34
    writer.writerow(map_0_cols)
35
    for in_, rest in map_.iteritems(): writer.writerow([in_]+rest)
36

  
37
main()
38 0

  
util/join_passthru
1
#!/bin/sh
2
# Joins two map spreadsheets A->B0 and B0->B1 to A->B1, B1 a modification of B0
3

  
4
selfDir="$(dirname -- "$0")"
5

  
6
test "$#" -eq 2 || { echo "Usage: $0 map_0 map_1 >joined_map" >&2; exit 2;}
7

  
8
"$selfDir/join" <"$1" "$2"|"$selfDir/union" "$1"
9 0

  
test/map
12 12
test -n "$n" || export n="$tests_n" testMode=1
13 13

  
14 14
test "$(hostname)" = nimoy && isNimoy=t || isNimoy=
15
test -n "$isNimoy" && . ../util/env_password mysql_password "your MySQL"
15
test -n "$isNimoy" && . ../bin/env_password mysql_password "your MySQL"
16 16

  
17 17
bien_password="$(cat ../config/bien_password)"
18 18

  
......
74 74
    true # ignore last command's exit status
75 75
}
76 76

  
77
vegbienDest=../util/vegbien_dest
77
vegbienDest=../bin/vegbien_dest
78 78

  
79 79
function toDb()
80 80
{
inputs/input.Makefile
4 4
vegbienMaps := $(subst .VegX.,.VegBIEN.,$(vegxMaps))
5 5

  
6 6
root := ../..
7
util := $(root)/util
7
bin := $(root)/bin
8 8
mappings := $(root)/mappings
9 9
map := $(root)/map
10 10
map2vegbien := env out_database=vegbien $(map)
......
30 30
#####
31 31

  
32 32
map.VegBIEN.%.csv: map.VegX.%.csv
33
	$(util)/join_sort <$< $(mappings)/VegX-VegBIEN.$(*F).csv >$@
33
	$(bin)/join_sort <$< $(mappings)/VegX-VegBIEN.$(*F).csv >$@
34 34
.PRECIOUS: map.VegBIEN.%.csv
35 35

  
36 36
#####
bin/fix_permissions
1
#!/bin/sh
2
# Sets correct permissions on shared bien files
3

  
4
selfDir="$(dirname -- "$0")" # dir of symlink $0, not this script itself
5
cd "$selfDir"
6

  
7
opts='--quiet --changes'
8
chgrp --recursive --no-dereference $opts bien .
9
find . -type d -exec chmod $opts g+s "{}" \; # only dirs should be setgid
10
for perm in r w x; do # extend user perms to group
11
    find . -not -type l -perm -u=$perm -exec chmod $opts g+$perm "{}" \;
12
        # don't chmod symlinks
13
done
0 14

  
bin/vegbien_dest
1
#!/bin/sh
2
# Requires var bien_password
3

  
4
export out_engine=PostgreSQL out_host=localhost out_user=bien \
5
out_password="$bien_password" out_database=vegbien
0 6

  
bin/join_sort
1
#!/bin/bash
2
# Inner-joins two map spreadsheets and sorts the output on the output column
3

  
4
selfDir="$(dirname -- "$0")"
5

  
6
"$selfDir/join" "$@"|"$selfDir/sort" 1 0
0 7

  
bin/env_password
1
#!/bin/bash
2
# Sets a password environment variable
3

  
4
test "$#" -ge 1 || \
5
{ echo "Usage: . $0 env_var_name [msg] (note the initial \".\")" >&2; exit 2;}
6

  
7
msg="$2"
8
test -n "$msg" || msg="$1"
9

  
10
if test -z "${!1+t}"; then # env var with name $1 is unset
11
    read -s -p "Enter $msg password: "; echo
12
    export "$1"="$REPLY"
13
fi
0 14

  
bin/map
1
#!/usr/bin/env python
2
# Maps one datasource to another, using a map spreadsheet if needed
3
# For outputting an XML file to a PostgreSQL database, use the general format of
4
# http://vegbank.org/vegdocs/xml/vegbank_example_ver1.0.2.xml
5

  
6
import os.path
7
import sys
8
import xml.dom.minidom
9

  
10
sys.path.append(os.path.dirname(__file__)+"/../lib")
11

  
12
import opts
13
from Parser import SyntaxException
14
import sql
15
import xml_dom
16
import xml_func
17

  
18
def metadata_value(name):
19
    if type(name) == str and name.startswith(':'): return name[1:]
20
    else: return None
21

  
22
def main():
23
    env_names = []
24
    def usage_err():
25
        raise SystemExit('Usage: '+opts.env_usage(env_names, True)
26
            +' [commit=1] '+sys.argv[0]+' [map_path] [<input] [>output]')
27
    limit = opts.get_env_var('n', None, env_names)
28
    if limit != None: limit = int(limit)
29
    commit = opts.env_flag('commit')
30
    
31
    # Get db config from env vars
32
    db_config_names = ['engine', 'host', 'user', 'password', 'database']
33
    def get_db_config(prefix):
34
        return opts.get_env_vars(db_config_names, prefix, env_names)
35
    in_db_config = get_db_config('in')
36
    out_db_config = get_db_config('out')
37
    in_is_db = 'engine' in in_db_config
38
    out_is_db = 'engine' in out_db_config
39
    
40
    # Parse args
41
    map_path = None
42
    try: _prog_name, map_path = sys.argv
43
    except ValueError:
44
        if in_is_db: usage_err()
45
    
46
    # Load map header
47
    in_is_xpaths = True
48
    if map_path != None:
49
        import copy
50
        import csv
51
        
52
        import xpath
53
        
54
        metadata = []
55
        mappings = []
56
        stream = open(map_path, 'rb')
57
        reader = csv.reader(stream)
58
        in_label, out_label = reader.next()[:2]
59
        def split_col_name(name):
60
            name, sep, root = name.partition(':')
61
            return name, sep != '', root
62
        in_label, in_is_xpaths, in_root = split_col_name(in_label)
63
        out_label, out_is_xpaths, out_root = split_col_name(out_label)
64
        assert out_is_xpaths # CSV output not supported yet
65
        has_types = out_root.startswith('/*s/') # outer elements are types
66
        for row in reader:
67
            in_, out = row[:2]
68
            if out != '':
69
                if out_is_xpaths: out = out_root+out
70
                mappings.append((in_, out))
71
        stream.close()
72
    in_is_xml = in_is_xpaths and not in_is_db
73
    
74
    # Input datasource to XML tree, mapping if needed
75
    if in_is_xml:
76
        doc0 = xml.dom.minidom.parse(sys.stdin)
77
    if map_path != None:
78
        doc1 = xml_dom.create_doc(out_label)
79
        root = doc1.documentElement
80
        if in_is_db:
81
            assert in_is_xpaths
82
            
83
            import db_xml
84
            
85
            in_root_xml = xpath.path2xml(in_root)
86
            for i, mapping in enumerate(mappings):
87
                in_, out = mapping
88
                if metadata_value(in_) == None:
89
                    mappings[i] = (xpath.path2xml(in_root+'/'+in_), out)
90
            
91
            in_db = sql.connect(in_db_config)
92
            in_pkeys = {}
93
            for row_idx, row in enumerate(sql.rows(db_xml.get(in_db,
94
                in_root_xml, in_pkeys, limit))):
95
                row_id = str(row_idx)
96
                pkey, = row
97
                for in_, out in mappings:
98
                    value = metadata_value(in_)
99
                    if value == None:
100
                        in_ = in_.cloneNode(True) # don't modify orig value!
101
                        xml_dom.set_id(xpath.get(in_, in_root), pkey)
102
                        value = sql.value_or_none(db_xml.get(in_db, in_,
103
                            in_pkeys))
104
                    if value != None:
105
                        xpath.put_obj(root, out, row_id, has_types, str(value))
106
            in_db.close()
107
        elif in_is_xml:
108
            row = xpath.get(doc0.documentElement, in_root)
109
            for row_idx, row in enumerate(xml_dom.NodeElemIter(row.parentNode)):
110
                if not (limit == None or row_idx < limit): break
111
                row_id = str(row_idx)
112
                for in_, out in mappings:
113
                    value = metadata_value(in_)
114
                    if value == None:
115
                        node = xpath.get(row, in_)
116
                        if node != None: value = xml_dom.value(node)
117
                    if value != None:
118
                        xpath.put_obj(root, out, row_id, has_types, value)
119
        else: # input is CSV
120
            map_ = dict(mappings)
121
            reader = csv.reader(sys.stdin)
122
            cols = reader.next()
123
            col_idxs = dict([(value, idx) for idx, value in enumerate(cols)])
124
            for i, mapping in enumerate(mappings):
125
                in_, out = mapping
126
                if metadata_value(in_) == None:
127
                    try: mappings[i] = (col_idxs[in_], out)
128
                    except KeyError: pass
129
            
130
            for row_idx, row in enumerate(reader):
131
                if not (limit == None or row_idx < limit): break
132
                row_id = str(row_idx)
133
                for in_, out in mappings:
134
                    value = metadata_value(in_)
135
                    if value == None:
136
                        value = row[in_]
137
                        if value == '': value = None
138
                    if value != None:
139
                        xpath.put_obj(root, out, row_id, has_types, value)
140
        xml_func.process(root)
141
    else: doc1 = doc0
142
    
143
    # Output XML tree
144
    if out_is_db:
145
        from psycopg2.extensions import ISOLATION_LEVEL_SERIALIZABLE
146
        import db_xml
147
        
148
        out_db = sql.connect(out_db_config)
149
        out_db.set_isolation_level(ISOLATION_LEVEL_SERIALIZABLE)
150
        try:
151
            row_ct_ref = [0]
152
            db_xml.xml2db(out_db, doc1.documentElement, row_ct_ref)
153
            print 'Inserted '+str(row_ct_ref[0])+' rows'
154
            if commit: out_db.commit()
155
        finally:
156
            out_db.rollback()
157
            out_db.close()
158
    else: xml_dom.writexml(sys.stdout, doc1) # output is XML
159

  
160
try: main()
161
except SyntaxException, ex: raise SystemExit(str(ex))
0 162

  
bin/psql_vegbien
1
#!/bin/sh
2
# Accesses vegbank db
3

  
4
selfDir="$(dirname -- "$0")"
5

  
6
bien_password="$(cat "$selfDir/bien_password")"
7
. "$selfDir/vegbien_dest"
8
export PGHOST="$out_host" PGUSER="$out_user" PGPASSWORD="$out_password"
9

  
10
psql --dbname="$out_database" "$@"
0 11

  
bin/repl
1
#!/usr/bin/env python
2
# Modifies a map spreadsheet A->B or any file using a replacements spreadsheet
3
# A->C or B->C
4

  
5
import csv
6
import re
7
import sys
8

  
9
def main():
10
    try: _prog_name, repl_path = sys.argv[:2]
11
    except ValueError:
12
        raise SystemExit('Usage: '+sys.argv[0]+' <map repl [col_num] [| '+
13
            sys.argv[0]+' repl_1 [col_num_1]]... >new_map')
14
    col_num = None
15
    try: col_num = sys.argv[2]
16
    except IndexError: pass
17
    if col_num != None: col_num = int(col_num) # 0-based
18
    
19
    # Get replacements
20
    repls = []
21
    stream = open(repl_path, 'rb')
22
    reader = csv.reader(stream)
23
    repl_in, repl_out = reader.next()[:2]
24
    for row in reader:
25
        in_, out = row[:2]
26
        if in_ != '':
27
            if re.match(r'^\w+$', in_): in_ = r'\b'+in_+r'(?=(?:(?i)_id)?\b)'
28
                # match whole word
29
            repls.append((in_, out))
30
    stream.close()
31
    def repl_all(str_):
32
        for repl, with_ in repls: str_ = re.sub(repl, with_, str_)
33
        return str_
34
    
35
    # Modify map or file
36
    if col_num != None:
37
        reader = csv.reader(sys.stdin)
38
        writer = csv.writer(sys.stdout)
39
        cols = reader.next()
40
        label, sep, root = cols[col_num].partition(':')
41
        if label != repl_in: raise SystemExit('Map error: Map column '+
42
            str(col_num)+' label "'+label+'" doesn\'t match replacements input '
43
            'column label "'+repl_in+'"')
44
        cols[col_num] = repl_out+sep+repl_all(root)
45
        writer.writerow(cols)
46
        for row in reader:
47
            row[col_num] = repl_all(row[col_num])
48
            writer.writerow(row)
49
    else:
50
        while True:
51
            line = sys.stdin.readline()
52
            if line == '': break
53
            sys.stdout.write(repl_all(line))
54

  
55
main()
0 56

  
bin/review
1
#!/usr/bin/env python
2
# Converts a map spreadsheet to human-readable (but machine unusable) form
3
# Usage: self <in_map >out_ma
4

  
5
import csv
6
import re
7
import sys
8

  
9
def sub_nested(regex, repl, str_):
10
    while True:
11
        str_, n = re.subn(regex, repl, str_)
12
        if n == 0: return str_
13

  
14
def cleanup(xpath):
15
    truncated = False
16
    
17
    # Remove attrs
18
    xpath = sub_nested(r':\[[^\[\]]*?\]', r'', xpath)
19
    
20
    # Remove lookahead assertions
21
    xpath = sub_nested(r'\((/[^\)]*?)\)(?=/)', r'\1', xpath)
22
    
23
    # Remove pointers
24
    xpath, n = re.subn(r'^.*->', r'', xpath)
25
    if n > 0: truncated = True
26
    
27
    # Remove part of path before first key list, XML function, or path end
28
    # Leave enough to include the table of a user-defined value
29
    xpath, n = re.subn(r'^(?:/(?!_)[\w*]+)*(?=(?:/(?!_)[\w*]+){2}(?:\[|/_|$))',
30
        r'', xpath)
31
    # Prepend / to show truncation
32
    if n > 0: truncated = True
33
    
34
    # Remove backward (child-to-parent) pointer's target ID attr
35
    xpath = re.sub(r'\[[\w*]+\]|(?<=\[)[\w*]+,', r'', xpath)
36
    
37
    # Remove negative keys
38
    xpath = re.sub(r',?!(?:[\w*]+/)*@?[\w*]+', r'', xpath)
39
    
40
    # Remove path before key
41
    xpath = re.sub(r'(?:[\w*]+/)*(@?[\w*]+)(?==)', r'\1', xpath)
42
    
43
    # Prepend / to show truncation
44
    if truncated: xpath = '/'+xpath
45
    
46
    return xpath
47

  
48
def main():
49
    # Convert map
50
    reader = csv.reader(sys.stdin)
51
    writer = csv.writer(sys.stdout)
52
    writer.writerow(reader.next())
53
    for row in reader:
54
        for i in xrange(2): row[i] = cleanup(row[i])
55
        writer.writerow(row)
56

  
57
main()
0 58

  
bin/sort
1
#!/usr/bin/env python
2
# Sorts a spreadsheet based on a column
3

  
4
import csv
5
import sys
6

  
7
def by_idx(list_, idxes): return [list_[i] for i in idxes]
8

  
9
def main():
10
    col_nums = sys.argv[1:]
11
    if col_nums == []:
12
        raise SystemExit('Usage: '+sys.argv[0]+' col_num... <map >sorted_map')
13
    col_nums = map(int, col_nums) # 0-based
14
    
15
    # Sort map
16
    reader = csv.reader(sys.stdin)
17
    cols = reader.next()
18
    def get_sort_by(row): return by_idx(row, col_nums)
19
    rows = list(reader)
20
    rows.sort(lambda *rows: cmp(*map(get_sort_by, rows)))
21
    writer = csv.writer(sys.stdout)
22
    writer.writerow(cols)
23
    map(writer.writerow, rows)
24

  
25
main()
0 26

  
bin/join_passthru
1
#!/bin/sh
2
# Joins two map spreadsheets A->B0 and B0->B1 to A->B1, B1 a modification of B0
3

  
4
selfDir="$(dirname -- "$0")"
5

  
6
test "$#" -eq 2 || { echo "Usage: $0 map_0 map_1 >joined_map" >&2; exit 2;}
7

  
8
"$selfDir/join" <"$1" "$2"|"$selfDir/union" "$1"
0 9

  
bin/union
1
#!/usr/bin/env python
2
# Combines two map spreadsheets A0->B and A1->C to A->B, with A0 overwriting A1
3

  
4
import csv
5
import sys
6

  
7
def main():
8
    try: _prog_name, map_1_path = sys.argv
9
    except ValueError:
10
        raise SystemExit('Usage: '+sys.argv[0]+' <map_0 map_1 [| '+sys.argv[0]
11
            +' map_2]... >union_map')
12
    
13
    map_ = {}
14
    def add_map(reader):
15
        for row in reader:
16
            if row[1] != '': map_[row[0]] = row[1:]
17
    
18
    # Get map 1
19
    stream = open(map_1_path, 'rb')
20
    reader = csv.reader(stream)
21
    map_1_cols = reader.next()
22
    add_map(reader)
23
    stream.close()
24
    
25
    # Add map 0 to map 1, overwriting existing entries
26
    reader = csv.reader(sys.stdin)
27
    map_0_cols = reader.next()
28
    if not map_0_cols[0] == map_1_cols[0]: raise SystemExit('Map error: '
29
        'Map 1 column 0 name doesn\'t match map 0 column 0 name')
30
    add_map(reader)
31
    
32
    # Write combined map
33
    writer = csv.writer(sys.stdout)
34
    writer.writerow(map_0_cols)
35
    for in_, rest in map_.iteritems(): writer.writerow([in_]+rest)
36

  
37
main()
0 38

  
bin/join
1
#!/usr/bin/env python
2
# Inner-joins two map spreadsheets A->B and B->C to A->C
3

  
4
import csv
5
import sys
6

  
7
def main():
8
    try: _prog_name, map_1_path = sys.argv
9
    except ValueError:
10
        raise SystemExit('Usage: '+sys.argv[0]+' <map_0 map_1 [| '+sys.argv[0]
11
            +' map_2]... >joined_map')
12
    
13
    # Get map 1
14
    map_1 = {}
15
    stream = open(map_1_path, 'rb')
16
    reader = csv.reader(stream)
17
    map_1_in, map_1_out = reader.next()[:2]
18
    for row in reader:
19
        if row[1] != '': map_1[row[0]] = row[1]
20
    stream.close()
21
    
22
    # Join map 1 to map 0
23
    reader = csv.reader(sys.stdin)
24
    writer = csv.writer(sys.stdout)
25
    cols = reader.next()
26
    if not cols[1] == map_1_in: raise SystemExit('Map error: '
27
        'Map 0 output column name doesn\'t match map 1 input column name')
28
    cols[1] = map_1_out
29
    writer.writerow(cols)
30
    for row in reader:
31
        try: row[1] = map_1[row[1]]
32
        except KeyError: continue # skip row
33
        writer.writerow(row)
34

  
35
main()
0 36

  
bin/ch_root
1
#!/usr/bin/env python
2
# Transforms a map spreadsheet to use a different root
3

  
4
import csv
5
import os.path
6
import re
7
import sys
8

  
9
sys.path.append(os.path.dirname(__file__)+"/../lib")
10

  
11
import opts
12

  
13
def all_not_none(list_):
14
    return reduce(lambda a, b: a and b, map(lambda e: e != None, list_))
15

  
16
def main():
17
    # Get config from env vars
18
    config_names = ['in_root', 'out_root']
19
    env_names = []
20
    configs = map(lambda col: opts.get_env_vars(config_names, col, env_names),
21
        ['left', 'right'])
22
    if not all_not_none(configs): raise SystemExit(
23
        'Usage: '+opts.env_usage(env_names)+' '+sys.argv[0]+' <in_map >out_map')
24
    
25
    # Transform map
26
    reader = csv.reader(sys.stdin)
27
    writer = csv.writer(sys.stdout)
28
    cols = reader.next()
29
    for i in xrange(len(configs)):
30
        config = configs[i]
31
        label, sep, config['root'] = cols[i].partition(':')
32
        cols[i] = label+sep+config['out_root']
33
        config['in_root_re'] = r'^'+re.escape(config['in_root'])+r'\b'
34
    writer.writerow(cols)
35
    for row in reader:
36
        for i in xrange(len(configs)): row[i] = configs[i]['root']+row[i]
37
        def sub(i):
38
            row[i], n = re.subn(configs[i]['in_root_re'], r'', row[i])
39
            return n > 0
40
        if sub(0):
41
            if not sub(1): raise SystemExit('Map error: Root "'
42
                +configs[1]['in_root']+'" is not contained in output mapping: '
43
                +row[1])
44
            writer.writerow(row)
45

  
46
main()
0 47

  
bin/simplify_xpath
1
#!/usr/bin/env python
2
# Removes duplication from XPath expressions
3
# Filters one XPath per line from stdin to stdout
4

  
5
import re
6
import sys
7

  
8
def main():
9
    while True:
10
        line = sys.stdin.readline()
11
        if line == '': break
12
        # Forward * abbrs
13
        line = re.sub(r'(?i)(\w+)(?=\w*(?:->/[^/]*)?/\1\b)', r'*', line)
14
        # Backward * abbrs
15
        line = re.sub(r'(?i)((\w+)->/[^/]*/[^/]*\[)\2', r'\1*', line)
16
        sys.stdout.write(line)
17

  
18
main()
0 19

  
Makefile
69 69

  
70 70
psqlOpts := --set ON_ERROR_STOP=1 --quiet
71 71
psqlAsAdmin := sudo -u postgres psql $(psqlOpts)
72
psqlAsBien := ./util/psql_vegbien $(psqlOpts)
72
psqlAsBien := ./bin/psql_vegbien $(psqlOpts)
73 73
bienPassword := $(shell cat config/bien_password)
74 74

  
75 75
postgres_user: _not_file
map
2 2
# Runs map with BIEN defaults
3 3

  
4 4
selfDir="$(dirname -- "$0")"
5
utilDir="$selfDir/util"
5
binDir="$selfDir/bin"
6 6

  
7 7
shopt -s nullglob
8 8

  
......
14 14
    if test -z "$in_user"; then
15 15
        if test _"$(hostname)" = _nimoy -a _"$in_engine" = _MySQL; then
16 16
            test -n "${mysql_password+t}" || \
17
                . "$utilDir/env_password" mysql_password "your MySQL"
17
                . "$binDir/env_password" mysql_password "your MySQL"
18 18
            export in_user="$USER" in_password="$mysql_password"
19 19
        else
20 20
            export in_user=bien in_password="$bien_password"
21 21
        fi
22 22
    fi
23 23
fi
24
test _"$out_database" = _vegbien && . "$utilDir/vegbien_dest"
24
test _"$out_database" = _vegbien && . "$binDir/vegbien_dest"
25 25

  
26
"$utilDir/map" "$@"
26
"$binDir/map" "$@"
mappings/join
9 9
for map in *-$1.*.csv; do
10 10
    type="${map%.*}" # remove extension
11 11
    type="${type#*.}" # remove up to last "."
12
    ../util/join <"$map" "$1-$2.$type.csv"|\
13
    ../util/sort 1 0 >"${map/$1/$2}" # sort on output col
12
    ../bin/join <"$map" "$1-$2.$type.csv"|\
13
    ../bin/sort 1 0 >"${map/$1/$2}" # sort on output col
14 14
done
mappings/review
9 9
mkdir -p for_review
10 10

  
11 11
for in in *.{organisms,plots}.csv; do
12
    ../util/review <"$in" >"for_review/$in"
12
    ../bin/review <"$in" >"for_review/$in"
13 13
done
mappings/Makefile
30 30
misc: $(misc)
31 31
.PRECIOUS: $(misc)
32 32

  
33
repl = ../util/repl
34
sort = ../util/sort 1 0
35
chRoot = ../util/ch_root <$< >$@
33
repl = ../bin/repl
34
sort = ../bin/sort 1 0
35
chRoot = ../bin/ch_root <$< >$@
36 36
empty = $(SED) -n \
37 37
's/^CREATE TABLE ([0-9A-Za-z_]+) \($$/TRUNCATE \1 CASCADE;/p' <$< >$@
38 38

  

Also available in: Unified diff