Project

General

Profile

1 73 aaronmk
# String manipulation
2
3 1295 aaronmk
import codecs
4 861 aaronmk
import re
5
6 1232 aaronmk
import util
7
8 4028 aaronmk
##### Sentinel values
9
10
class NonInternedStr(str):
    '''str subclass used for sentinel values: every instance is a distinct
    object, so two instances never compare identical with `is`.'''
13
14
# Empty-string sentinel. It is a NonInternedStr instance, so it is a distinct
# object from any other empty string (presumably meant to be detected with `is`
# -- TODO confirm against callers).
none_str = NonInternedStr()
15 4032 aaronmk
# Whitespace-only sentinel string (presumably for contexts that need a value
# for which isspace() is true -- TODO confirm against callers)
isspace_none_str = '\r\n\t\v' # clone-safe, but must be compared with ==
16 4028 aaronmk
17 1622 aaronmk
##### Parsing
18
19 4115 aaronmk
def raw_extra_len(str_):
    '''Returns how many more bytes than characters a string has.
    The byte length is taken from to_raw_str(), so the result is nonzero only
    for Unicode strings containing multi-byte characters.'''
    byte_len = len(to_raw_str(str_))
    char_len = len(str_)
    return byte_len-char_len
24
25 3749 aaronmk
def concat(str0, str1, max_len):
    '''Concatenates two strings, ensuring that the combined byte length is no
    greater than the provided length limit. str0 is truncated first; str1 is
    only truncated if it does not fit by itself.
    Note that if the combined length is longer than max_len, the truncated
    length may sometimes be shorter than max_len if there are multi-byte
    characters after str0's truncation point. Thus, you *cannot determine if the
    string was truncated* by checking if the new length equals max_len.
    @param max_len maximum byte length of the result
    @return str0 (possibly truncated) followed by str1, or a prefix of str1 if
        str1 alone is too long
    '''
    # Use raw_extra_len() because Unicode characters can be multi-byte, and
    # length limits often apply to the byte length, not the character length.
    max_len -= raw_extra_len(str0)+raw_extra_len(str1)
    # The adjustment above can push the limit below zero. A negative slice
    # bound counts from the *end* of the string, which would return nearly all
    # of str1 instead of nothing, so clamp the limit at zero.
    if max_len < 0: max_len = 0
    str0_new_len = max_len - len(str1)
    if str0_new_len < 0: return str1[:max_len] # str1 too long
    return str0[:str0_new_len]+str1
39 2932 aaronmk
40 5814 aaronmk
def join(sep, strs):
    '''Like sep.join(strs), but an empty result is returned as None'''
    joined = sep.join(strs)
    return util.none_if(joined, u'')
43
44 1622 aaronmk
def split(sep, str_):
    '''Like str_.split(sep), except that the empty string yields [] instead of
    a list containing one empty string'''
    if str_ != '': return str_.split(sep)
    return []
48
49 4937 aaronmk
def remove_prefix(prefix, str_, removed_ref=None, require=False):
    '''Removes prefix from the start of str_, if it is there.
    @param removed_ref optional 1-element list; removed_ref[0] is set to whether
        str_ started with the prefix
    @param require whether to raise an Exception if the prefix is missing
    @return str_ without the prefix, or str_ unchanged if it had no such prefix
    '''
    if removed_ref is None: removed_ref = [False]
    
    removed_ref[0] = str_.startswith(prefix)
    if removed_ref[0]: return str_[len(prefix):]
    elif require: raise Exception(urepr(str_)+' needs '+urepr(prefix)+' prefix')
    else: return str_
56
57 1680 aaronmk
def remove_prefixes(prefixes, str_):
    '''Applies remove_prefix() for each of the prefixes, in the given order,
    each time to the result of the previous removal'''
    remaining = str_
    for prefix in prefixes:
        remaining = remove_prefix(prefix, remaining)
    return remaining
60
61
def with_prefixes(prefixes, str_):
    '''@return a generator of str_ prepended with each of the prefixes'''
    prefixed = (prefix+str_ for prefix in prefixes)
    return prefixed
62
63 4937 aaronmk
def remove_suffix(suffix, str_, removed_ref=None, require=False):
    '''Removes suffix from the end of str_, if it is there.
    @param removed_ref optional 1-element list; removed_ref[0] is set to whether
        str_ ended with the suffix
    @param require whether to raise an Exception if the suffix is missing
    @return str_ without the suffix, or str_ unchanged if it had no such suffix
    '''
    if removed_ref is None: removed_ref = [False]
    
    removed_ref[0] = str_.endswith(suffix)
    # Slice to an explicit end index rather than [:-len(suffix)]: with an empty
    # suffix, [:-0] is [:0], which would discard the entire string.
    if removed_ref[0]: return str_[:len(str_)-len(suffix)]
    elif require: raise Exception(urepr(str_)+' needs '+urepr(suffix)+' suffix')
    else: return str_
70
71 5432 aaronmk
def find_any(haystack, needles):
    '''@return the first of needles that occurs in haystack, or None if none of
    them do'''
    found = (needle for needle in needles if haystack.find(needle) >= 0)
    return next(found, None)
75 1959 aaronmk
76 1622 aaronmk
def overlaps(str0, str1):
    '''Whether either string contains the other as a substring'''
    if str0.find(str1) >= 0: return True
    return str1.find(str0) >= 0
77
78 5147 aaronmk
def flip_map(map_):
    '''
    Swaps the two elements of every entry.
    For use with replace_all(), replace_all_re().
    @param map_ [(from_, to), ...]
    @return [(to, from_), ...]
    '''
    flipped = []
    for from_, to in map_: flipped.append((to, from_))
    return flipped
85
86
def replace_all(map_, str_):
    '''
    Applies each literal replacement in order, each one to the result of the
    previous one.
    @param map_ [(from_, to), ...]
    '''
    result = str_
    for old, new in map_:
        result = result.replace(old, new)
    return result
92
93
def replace_all_re(map_, str_):
    '''
    Applies each regexp substitution in order, each one to the result of the
    previous one.
    @param map_ [(from_, to), ...]
    '''
    result = str_
    for pattern, repl in map_:
        result = re.sub(pattern, repl, result)
    return result
99
100 2761 aaronmk
##### Escaping
101
102 14538 aaronmk
def no_esc_prefix_re(esc='\\'):
    '''@return a regexp that matches an even number (possibly zero) of esc's
    which are not themselves preceded by an esc; the esc's are captured as
    group 1'''
    escaped = re.escape(esc)
    not_after_esc = '(?<!'+escaped+')'
    esc_pairs = '((?:'+escaped+'{2})*)' # an even # of escs
    return not_after_esc+esc_pairs
105
106
def escd_char_re(escd_char, esc):
    '''@return a regexp that matches escd_char where it follows an even number
    of esc's (see no_esc_prefix_re())'''
    assert escd_char != esc # not supported; use str_.replace() instead
    prefix_re = no_esc_prefix_re(esc)
    return prefix_re+re.escape(escd_char)
109
110
def esc_re_map(map_):
    '''
    for use with replace_all_re()
    @param map_ [(escd_char, char), ...] # use flip_map() if needed
    @return [(escd_char_re, escd_char_re_sub), ...]
    '''
    # the esc char is taken to be the 1st char of the 1st entry's escd_char
    esc = map_[0][0][0]
    re_map = []
    for escd_char, char in map_:
        # \1 puts back the esc's captured before the escd_char
        re_map.append((escd_char_re(escd_char, esc), r'\1'+char))
    return re_map
120
121 5142 aaronmk
def esc_quotes(str_, quote='"', esc='\\', quote_esc=None):
    '''Escapes the quote characters (and the escape characters) in a string.
    @param quote the quote character to escape
    @param esc the escape character; it is doubled wherever it occurs, unless
        it is the same as quote
    @param quote_esc what each quote is replaced with; defaults to esc+quote
    '''
    if quote_esc is None: quote_esc = esc+quote
    
    # When esc == quote, replacing quote below already doubles it
    if esc != quote: str_ = str_.replace(esc, esc+esc)
    str_ = str_.replace(quote, quote_esc)
    return str_
127
128 14538 aaronmk
def unesc_quotes(str_, quote='"', esc='\\', quote_esc=None):
    '''Replaces escaped quotes with plain quotes and doubled escape characters
    with single ones (the operations esc_quotes() applies, in reverse).
    @param quote the quote character that was escaped
    @param esc the escape character
    @param quote_esc the escaped form of the quote; defaults to esc+quote
    '''
    if quote_esc is None: quote_esc = esc+quote
    
    # can't use `.decode('string_escape')` for this because it doesn't decode
    # custom escapes, such as GWT's \! for |
    unquoted = replace_all_re(esc_re_map([(quote_esc, quote)]), str_)
    return unquoted.replace(esc+esc, esc)
135
136 5161 aaronmk
# Literal line-break characters and the backslash sequences that encode them
json_encode_map = [('\n', r'\n'), ('\r', r'\r')]
140 5143 aaronmk
141 5161 aaronmk
def json_encode(str_):
    '''Escapes double quotes and backslashes with esc_quotes(), then applies the
    replacements in json_encode_map'''
    quoted = esc_quotes(str_, '"')
    return replace_all(json_encode_map, quoted)
143
144 14537 aaronmk
def json_decode(str_):
    '''Decodes the string using the `string_escape` codec'''
    decoded = str_.decode('string_escape')
    return decoded
145
146 2761 aaronmk
def esc_for_mogrify(query):
    '''Doubles every % in a query. To be applied right before the query is
    passed to a mogrifying function.'''
    escaped = query.replace('%', '%%')
    return escaped
149
150 5148 aaronmk
def regexp_repl_esc(str_):
    '''@return a function of one (ignored) argument that always returns str_
    unchanged; presumably for use as a regexp replacement function so that
    str_ is inserted literally'''
    def repl(m): return str_
    return repl
151
152 1401 aaronmk
##### Unicode
153
154 2882 aaronmk
def to_raw_str(str_):
    '''Encodes unicode objects as UTF-8; any other value is returned as is'''
    if not isinstance(str_, unicode): return str_
    return str_.encode('utf_8')
157
158 1295 aaronmk
# Stream reader factory for UTF-8, as returned by codecs.getreader()
unicode_reader = codecs.getreader('utf_8')
159
160 73 aaronmk
def to_unicode(str_):
    '''Converts a string to a unicode object. unicode objects are returned as
    is; other strings are decoded as UTF-8, falling back to Latin-1 if that
    fails.
    @raise AssertionError if no encoding in the list could decode the string
    '''
    if isinstance(str_, unicode): return str_
    encodings = ['utf_8', 'latin_1']
    for encoding in encodings:
        try: return unicode(str_, encoding)
        except UnicodeDecodeError: pass # deliberately try the next encoding
    # Only reached if the last encoding tried also failed to decode str_
    raise AssertionError(encoding+' is not a catch-all encoding')
167 340 aaronmk
168 3748 aaronmk
def ustr(value):
    '''Like built-in str() but converts to unicode object'''
    if util.is_str(value): return to_unicode(value) # already a string
    if hasattr(value, '__str__'): return to_unicode(value.__str__())
    return to_unicode(str(value))
174 1232 aaronmk
175 3747 aaronmk
def urepr(value):
    '''Like built-in repr() but converts to unicode object'''
    if hasattr(value, '__repr__'): return to_unicode(value.__repr__())
    return to_unicode(repr(value))
180
181 2502 aaronmk
def repr_no_u(value):
    '''Like urepr() but without the leading "u" of a `u'...'` literal'''
    repr_ = urepr(value)
    return re.sub(r"^u(?=')", r'', repr_)
184 2502 aaronmk
185 1401 aaronmk
##### Line endings
186
187 1622 aaronmk
def extract_line_ending(line):
    '''Splits a trailing LF, CR, or CRLF off of a line.
    @return tuple (contents, ending)'''
    without_lf = remove_suffix('\n', line)
    contents = remove_suffix('\r', without_lf)
    ending = line[len(contents):] # whatever was removed above
    return (contents, ending)
191 714 aaronmk
192 1622 aaronmk
def remove_line_ending(line):
    '''@return the contents part of extract_line_ending(line)'''
    contents, _ending = extract_line_ending(line)
    return contents
193
194
def ensure_newl(str_):
    '''Replaces the string's line ending, if any, with a single LF, adding one
    if the string had no line ending'''
    contents = remove_line_ending(str_)
    return contents+'\n'
195
196 856 aaronmk
def is_multiline(str_):
    '''Whether the string's first newline occurs before its last character'''
    first_newl = str_.find('\n')
    if first_newl < 0: return False # no newline at all
    return first_newl != len(str_)-1 # has newline before end
199
200 860 aaronmk
def remove_extra_newl(str_):
    '''Strips the trailing newlines of a string that is not multiline (per
    is_multiline()); multiline strings are returned unchanged'''
    if not is_multiline(str_): return str_.rstrip('\n')
    return str_
203 856 aaronmk
204 714 aaronmk
def std_newl(str_):
    '''Converts CRLF and lone CR line endings to LF'''
    # CRLF must be handled first so that it does not turn into two LFs
    for ending in ('\r\n', '\r'): str_ = str_.replace(ending, '\n')
    return str_
205
206 3255 aaronmk
def join_lines(lines):
    '''Concatenates the lines, appending a newline to each one'''
    terminated = [line+'\n' for line in lines]
    return ''.join(terminated)
207
208 1401 aaronmk
##### Whitespace
209
210 714 aaronmk
def cleanup(str_):
    '''Strips surrounding whitespace and then applies std_newl()'''
    stripped = str_.strip()
    return std_newl(stripped)
211 861 aaronmk
212 1364 aaronmk
def single_space(str_):
    '''Strips surrounding whitespace and collapses each run of two or more
    spaces into a single space'''
    trimmed = str_.strip()
    return re.sub(r' {2,}', r' ', trimmed)
213
214 861 aaronmk
def one_line(str_):
    '''Applies cleanup() and then replaces each newline, together with any
    spaces directly after it, with a single space'''
    cleaned = cleanup(str_)
    return re.sub(r'\n *', r' ', cleaned)
215 1761 aaronmk
216
##### Control characters
217
218
def is_ctrl(char):
    '''Whether char is a (non-printable) control character, i.e. its code is
    below 32 and it is not whitespace'''
    if ord(char) >= 32: return False
    return not char.isspace()
221
222
def strip_ctrl(str_):
    '''Strips (non-printable) control characters'''
    return ''.join(char for char in str_ if not is_ctrl(char))
225 2471 aaronmk
226 3172 aaronmk
##### Text
227
228
def first_word(str_):
    '''@return the part of the string before its first space, or the whole
    string if it contains no space'''
    word, _sep, _rest = str_.partition(' ')
    return word
229
230 2471 aaronmk
##### Formatting
231
232 3466 aaronmk
def indent(str_, level=1, indent_str='    '):
    '''Prefixes every line with indent_str repeated level times. Trailing
    whitespace is stripped first and the result ends with exactly one newline.
    '''
    prefix = indent_str*level
    lines = str_.rstrip().split('\n')
    return '\n'.join([prefix+line for line in lines])+'\n'
235
236 2477 aaronmk
def as_tt(str_):
    '''Wraps a string in @...@ (presumably Redmine's inline-code markup)'''
    marker = '@'
    return marker+str_+marker
237
238 2475 aaronmk
def as_code(str_, lang=None, multiline=True):
    '''Wraps a string in Redmine tags to syntax-highlight it.
    @param lang if given, the string is wrapped in a <code> tag with this class
    @param multiline whether to also wrap the result in a <pre> tag
    '''
    # Put the contents on their own lines, with exactly one trailing newline
    str_ = '\n'+str_.rstrip('\n')+'\n'
    if lang is not None: str_ = '<code class="'+lang+'">'+str_+'</code>'
    if multiline: str_ = '<pre>'+str_+'</pre>'
    return str_
244 2477 aaronmk
245 2504 aaronmk
def as_inline_table(dict_, key_label='Output', value_label='Input', ustr=ustr):
    '''Wraps a dict in Redmine tags to format it as a table.
    Each row is a "key: value" line, with the labels as the first row, and the
    whole table is placed inside a <pre> tag.
    @param ustr function used to convert each key and value to a string
    '''
    rows = [': '.join([key_label, value_label])+'\n']
    for pair in dict_.iteritems():
        rows.append(': '.join([ustr(cell) for cell in pair])+'\n')
    return '<pre>\n'+''.join(rows)+'</pre>'
252 2482 aaronmk
253 2504 aaronmk
def as_table(dict_, key_label='Output', value_label='Input', ustr=ustr):
    '''Wraps a dict in Redmine tags to format it as a table.
    Each row is |-delimited, with the labels as the first row; keys and values
    are converted with ustr and wrapped with as_tt().
    @param ustr function used to convert each key and value to a string
    '''
    def row(cells): return '|'+'|'.join(cells)+'|\n' # outer | for the border
    rows = [row([key_label, value_label])]
    for pair in dict_.iteritems():
        rows.append(row([as_tt(ustr(cell)) for cell in pair]))
    return '\n'+''.join(rows)+' ' # space protects last \n so blank line ends table