Project

General

Profile

1 73 aaronmk
# String manipulation
2
3 1295 aaronmk
import codecs
4 861 aaronmk
import re
5
6 1232 aaronmk
import util
7
8 4028 aaronmk
##### Sentinel values
9
10
class NonInternedStr(str):
    '''A str subclass whose instances are always separate objects.

    Two instances never satisfy `is`, even when their contents are equal, so
    an instance can act as a marker that is recognized by identity.
    '''
13
14
# Sentinel: an empty-valued NonInternedStr instance. Presumably meant to be
# recognized by identity (`is`) rather than by value -- confirm against callers.
none_str = NonInternedStr()
15 4032 aaronmk
# Sentinel made up solely of whitespace characters (CR, LF, tab, vertical tab)
isspace_none_str = '\r\n\t\v' # clone-safe, but must be compared with ==
16 4028 aaronmk
17 1622 aaronmk
##### Parsing
18
19 4115 aaronmk
def raw_extra_len(str_):
    '''Returns how many more bytes than characters a string occupies.

    The byte length is measured on the result of to_raw_str(). The two lengths
    differ for Unicode strings containing multi-byte characters.
    '''
    byte_len = len(to_raw_str(str_))
    char_len = len(str_)
    return byte_len-char_len
24
25 3749 aaronmk
def concat(str0, str1, max_len):
    '''Concatenates two strings, ensuring that the combined byte length is no
    greater than the provided length limit.

    Only str0 is ever truncated; str1 is always appended in full. If str1 alone
    uses up the whole limit, str0 is dropped entirely and str1 is returned.

    Note that if the combined length is longer than max_len, the truncated
    length may sometimes be shorter than max_len if there are multi-byte
    characters after str0's truncation point. Thus, you *cannot determine if the
    string was truncated* by checking if the new length equals max_len.

    @param str0 Leading string; truncated at its end if needed
    @param str1 Trailing string; never truncated
    @param max_len Maximum byte length of the result
    @return The concatenated string
    '''
    # Use raw_extra_len() because Unicode characters can be multi-byte, and
    # length limits often apply to the byte length, not the character length.
    max_len -= raw_extra_len(str0)+raw_extra_len(str1)
    # Clamp at 0: a negative slice bound would count from the *end* of str0 and
    # keep most of it instead of dropping it.
    keep_len = max(max_len-len(str1), 0)
    return str0[:keep_len]+str1
37 2932 aaronmk
38 1622 aaronmk
def split(sep, str_):
    '''Like str_.split(sep), except that an empty string yields [] instead of
    a list containing one empty string.'''
    parts = []
    if str_ != '': parts = str_.split(sep)
    return parts
42
43 2221 aaronmk
def remove_prefix(prefix, str_, removed_ref=None):
    '''Strips prefix from the start of str_ if it is present.

    @param prefix The prefix to look for
    @param str_ The string to strip
    @param removed_ref Optional one-element list used as an out-parameter:
        element 0 is set to whether the prefix was found
    @return str_ without the prefix, or str_ unchanged if it was not found
    '''
    # Identity check: None is a singleton, and `==` could be intercepted by an
    # argument's custom __eq__
    if removed_ref is None: removed_ref = [False]

    found = str_.startswith(prefix)
    removed_ref[0] = found
    if not found: return str_
    return str_[len(prefix):]
49
50 1680 aaronmk
def remove_prefixes(prefixes, str_):
    '''Strips each of the given prefixes from str_, in order.

    Every prefix is tested once against the string as left by the previous
    prefixes, so the order of prefixes matters.
    '''
    remaining = str_
    for prefix in prefixes:
        if remaining.startswith(prefix):
            remaining = remaining[len(prefix):]
    return remaining
53
54
def with_prefixes(prefixes, str_):
    '''Returns a generator yielding str_ prepended with each prefix in turn.'''
    prefixed = (prefix+str_ for prefix in prefixes)
    return prefixed
55
56 2221 aaronmk
def remove_suffix(suffix, str_, removed_ref=None):
    '''Strips suffix from the end of str_ if it is present.

    @param suffix The suffix to look for
    @param str_ The string to strip
    @param removed_ref Optional one-element list used as an out-parameter:
        element 0 is set to whether the suffix was found
    @return str_ without the suffix, or str_ unchanged if it was not found
    '''
    # Identity check: None is a singleton, and `==` could be intercepted by an
    # argument's custom __eq__
    if removed_ref is None: removed_ref = [False]

    found = str_.endswith(suffix)
    removed_ref[0] = found
    if not found: return str_
    # Compute the end index explicitly instead of slicing with -len(suffix):
    # for an empty suffix that would be str_[:-0] == str_[:0] == '', which
    # discards the whole string.
    return str_[:len(str_)-len(suffix)]
62
63 1959 aaronmk
def contains_any(haystack, needles):
    '''Whether at least one of the needles occurs somewhere in haystack.'''
    return any(haystack.find(needle) >= 0 for needle in needles)
67
68 1622 aaronmk
def overlaps(str0, str1):
    '''Whether either string occurs as a substring of the other.'''
    if str0.find(str1) >= 0: return True
    return str1.find(str0) >= 0
69
70 2761 aaronmk
##### Escaping
71
72
def esc_for_mogrify(query):
    '''Doubles every percent sign in a query, to be applied right before the
    query is handed to a mogrifying function.'''
    escaped = query.replace('%', '%%')
    return escaped
75
76 1401 aaronmk
##### Unicode
77
78 2882 aaronmk
def to_raw_str(str_):
    '''Encodes unicode objects as UTF-8 byte strings. Any other value is
    returned unchanged.'''
    if not isinstance(str_, unicode): return str_
    return str_.encode('utf_8')
81
82 1295 aaronmk
# Stream reader for the UTF-8 codec, as looked up by codecs.getreader()
unicode_reader = codecs.getreader('utf_8')
83
84 73 aaronmk
def to_unicode(str_):
    '''Decodes a byte string to a unicode object.

    UTF-8 is tried first, then Latin-1. unicode objects are returned unchanged.

    @param str_ The byte string (or unicode object) to convert
    @return The unicode object
    @raise AssertionError if none of the encodings could decode str_. This is
        not expected to happen, because Latin-1 assigns a character to every
        byte value.
    '''
    if isinstance(str_, unicode): return str_
    encodings = ['utf_8', 'latin_1']
    for encoding in encodings:
        try: return unicode(str_, encoding)
        except UnicodeDecodeError: pass # fall through to the next encoding
    # Name the last encoding explicitly rather than via the leaked loop variable
    raise AssertionError(encodings[-1]+' is not a catch-all encoding')
91 340 aaronmk
92 3748 aaronmk
def ustr(value):
    '''Unicode counterpart of the built-in str(): converts value to a string
    and returns it as a unicode object.'''
    if util.is_str(value): return to_unicode(value) # already a string
    if hasattr(value, '__str__'): text = value.__str__()
    else: text = str(value)
    return to_unicode(text)
98 1232 aaronmk
99 3747 aaronmk
def urepr(value):
    '''Unicode counterpart of the built-in repr(): returns value's
    representation as a unicode object.'''
    text = value.__repr__() if hasattr(value, '__repr__') else repr(value)
    return to_unicode(text)
104
105 2502 aaronmk
def repr_no_u(value):
    '''Like built-in repr() but removes the "u" in `u'...'` and `u"..."`

    @param value The value to represent
    @return urepr(value) without a leading unicode-literal "u" prefix
    '''
    # Accept either quote character after the "u": the representation of a
    # unicode string that contains an apostrophe is delimited by double quotes.
    return re.sub(r"^u(?=['\"])", r'', urepr(value))
108 2502 aaronmk
109 1401 aaronmk
##### Line endings
110
111 1622 aaronmk
def extract_line_ending(line):
    '''Splits a line into its contents and its line ending.

    At most one trailing newline, and then at most one trailing carriage
    return, are treated as the ending.
    @return tuple (contents, ending)
    '''
    contents = line
    for terminator in ('\n', '\r'): # the order matters: "\r\n" ends with "\n"
        if contents.endswith(terminator):
            contents = contents[:-len(terminator)]
    ending = line[len(contents):]
    return (contents, ending)
115 714 aaronmk
116 1622 aaronmk
def remove_line_ending(line):
    '''Returns the line without its line ending, as determined by
    extract_line_ending().'''
    contents, _ending = extract_line_ending(line)
    return contents
117
118
def ensure_newl(str_):
    '''Replaces the string's line ending, if any, with a single newline, so that
    the result always ends in one.'''
    without_ending = remove_line_ending(str_)
    return without_ending+'\n'
119
120 856 aaronmk
def is_multiline(str_):
    '''Whether the string has a newline anywhere before its last character.'''
    first_newl = str_.find('\n')
    last_idx = len(str_)-1
    # find() gives -1 if there is no newline, and never more than last_idx
    return 0 <= first_newl < last_idx
123
124 860 aaronmk
def remove_extra_newl(str_):
    '''Strips the trailing newlines of a single-line string. Multiline strings
    (see is_multiline()) are returned unchanged.'''
    return str_ if is_multiline(str_) else str_.rstrip('\n')
127 856 aaronmk
128 714 aaronmk
def std_newl(str_):
    '''Standardizes line endings: CR+LF pairs and lone CRs both become LF.'''
    # CR+LF must be handled first so that it does not turn into two newlines
    without_crlf = str_.replace('\r\n', '\n')
    return without_crlf.replace('\r', '\n')
129
130 3255 aaronmk
def join_lines(lines):
    '''Concatenates the lines, adding a newline after every one of them.'''
    terminated = [line+'\n' for line in lines]
    return ''.join(terminated)
131
132 1401 aaronmk
##### Whitespace
133
134 714 aaronmk
def cleanup(str_):
    '''Strips leading and trailing whitespace, then standardizes the line
    endings using std_newl().'''
    stripped = str_.strip()
    return std_newl(stripped)
135 861 aaronmk
136 1364 aaronmk
def single_space(str_):
    '''Strips leading and trailing whitespace and collapses every run of two or
    more spaces into a single space.'''
    stripped = str_.strip()
    return re.sub(r' {2,}', r' ', stripped)
137
138 861 aaronmk
def one_line(str_):
    '''Puts a string on a single line: after cleanup(), each newline together
    with the spaces following it is replaced by one space.'''
    cleaned = cleanup(str_)
    return re.sub(r'\n *', r' ', cleaned)
139 1761 aaronmk
140
##### Control characters
141
142
def is_ctrl(char):
143
    '''Whether char is a (non-printable) control character'''
144
    return ord(char) < 32 and not char.isspace()
145
146
def strip_ctrl(str_):
    '''Returns the string without its (non-printable) control characters, as
    identified by is_ctrl().'''
    return ''.join(char for char in str_ if not is_ctrl(char))
149 2471 aaronmk
150 3172 aaronmk
##### Text
151
152
def first_word(str_):
    '''Returns everything before the first space, or the whole string if it
    contains no space.'''
    word, _sep, _rest = str_.partition(' ')
    return word
153
154 2471 aaronmk
##### Formatting
155
156 3466 aaronmk
def indent(str_, level=1, indent_str='    '):
    '''Indents every line of a string.

    Trailing whitespace is stripped from the string first, and the result
    always ends with exactly one newline.
    @param level How many times indent_str is repeated in front of each line
    @param indent_str The text that makes up one level of indentation
    '''
    prefix = indent_str*level
    lines = str_.rstrip().split('\n')
    return '\n'.join([prefix+line for line in lines])+'\n'
159
160 2477 aaronmk
def as_tt(str_):
    '''Encloses a string in "@" markers.'''
    marker = '@'
    return marker+str_+marker
161
162 2475 aaronmk
def as_code(str_, lang=None, multiline=True):
    '''Wraps a string in Redmine tags to syntax-highlight it.

    Trailing newlines are stripped, and the string is then put on lines of its
    own inside the tags.
    @param str_ The code to wrap
    @param lang If given, the language to put in the <code> tag's class
        attribute; if None, no <code> tag is added
    @param multiline Whether to additionally wrap the result in <pre> tags
    @return The wrapped string
    '''
    str_ = '\n'+str_.rstrip('\n')+'\n'
    # Identity check: None is a singleton
    if lang is not None: str_ = '<code class="'+lang+'">'+str_+'</code>'
    if multiline: str_ = '<pre>'+str_+'</pre>'
    return str_
168 2477 aaronmk
169 2504 aaronmk
def as_inline_table(dict_, key_label='Output', value_label='Input', ustr=ustr):
    '''Formats a dict as "key: value" lines inside Redmine <pre> tags.

    The first line holds the two labels; each following line holds one entry.
    @param ustr The function used to convert keys and values to strings
    '''
    def row(entry): return (': '.join(entry))+'\n'

    rows = [row([key_label, value_label])] # header
    for entry in dict_.iteritems():
        rows.append(row([ustr(v) for v in entry]))
    return '<pre>\n'+''.join(rows)+'</pre>'
176 2482 aaronmk
177 2504 aaronmk
def as_table(dict_, key_label='Output', value_label='Input', ustr=ustr):
    '''Formats a dict as a Redmine table.

    The first row holds the two labels; each following row holds one entry,
    with the key and the value each passed through as_tt().
    @param ustr The function used to convert keys and values to strings
    '''
    def row(entry): return ('|'.join(['']+entry+['']))+'\n'# '' for outer border

    rows = [row([key_label, value_label])] # header
    for entry in dict_.iteritems():
        rows.append(row([as_tt(ustr(v)) for v in entry]))
    return '\n'+''.join(rows)+' ' # space protects last \n so blank line ends table