1 |
73
|
aaronmk
|
# String manipulation
|
2 |
|
|
|
3 |
1295
|
aaronmk
|
import codecs
|
4 |
861
|
aaronmk
|
import re
|
5 |
|
|
|
6 |
1232
|
aaronmk
|
import util
|
7 |
|
|
|
8 |
4028
|
aaronmk
|
##### Sentinel values
|
9 |
|
|
|
10 |
|
|
class NonInternedStr(str):
    '''A str subclass whose instances are each their own object, so that a
    sentinel created from it is never identical (`is`) to an ordinary string.'''


# Empty-valued sentinel; a separate object from any plain '' literal
none_str = NonInternedStr()

# Whitespace-only sentinel. It is clone-safe, but as a consequence it has to be
# recognized by value (==) rather than by identity.
isspace_none_str = '\r\n\t\v'
|
16 |
4028
|
aaronmk
|
|
17 |
1622
|
aaronmk
|
##### Parsing
|
18 |
|
|
|
19 |
4115
|
aaronmk
|
def raw_extra_len(str_):
    '''Returns how many more bytes than characters a string takes up: the
    length of its to_raw_str() form minus its own length. The two lengths
    differ for Unicode strings containing multi-byte characters.'''
    char_len = len(str_)
    byte_len = len(to_raw_str(str_))
    return byte_len - char_len
|
24 |
|
|
|
25 |
3749
|
aaronmk
|
def concat(str0, str1, max_len):
    '''Appends str1 to str0, truncating so that the combined byte length does
    not exceed max_len. str0 is shortened first; str1 is only cut when it is
    too long by itself.
    
    The result of a truncation can come out shorter than max_len when there
    are multi-byte characters after str0's truncation point, so you *cannot
    tell whether truncation happened* by comparing the new length to max_len.
    '''
    # Limits usually apply to bytes rather than characters, and Unicode
    # characters can be multi-byte, so shrink the character budget by the
    # number of extra bytes in both strings.
    extra_bytes = raw_extra_len(str0)+raw_extra_len(str1)
    char_limit = max_len - extra_bytes
    
    str0_keep = char_limit - len(str1) # room left for str0
    if str0_keep >= 0: return str0[:str0_keep]+str1
    return str1[:char_limit] # str1 too long
|
39 |
2932
|
aaronmk
|
|
40 |
5814
|
aaronmk
|
def join(sep, strs):
    '''Joins strs with sep and passes the result through util.none_if() with
    u'' as the value to match, so that an empty join result yields None.'''
    joined = sep.join(strs)
    return util.none_if(joined, u'')
|
43 |
|
|
|
44 |
1622
|
aaronmk
|
def split(sep, str_):
    '''Like str_.split(sep), except that the empty string gives [] rather
    than ['']'''
    if str_ != '': return str_.split(sep)
    return []
|
48 |
|
|
|
49 |
4937
|
aaronmk
|
def remove_prefix(prefix, str_, removed_ref=None, require=False):
    '''Removes prefix from the start of str_, if it is there.
    @param removed_ref Optional one-element list used as an out-parameter:
        element 0 is set to whether str_ started with prefix
    @param require Whether to raise an Exception when the prefix is missing
    @return str_ without the prefix, or str_ unchanged when it does not start
        with prefix (and require is false)
    '''
    # Identity check: `== None` would call the __eq__ of whatever object the
    # caller passed in, which could cause that object to be silently replaced
    if removed_ref is None: removed_ref = [False]
    
    removed_ref[0] = str_.startswith(prefix)
    if removed_ref[0]: return str_[len(prefix):]
    elif require: raise Exception(urepr(str_)+' needs '+urepr(prefix)+' prefix')
    else: return str_
|
56 |
|
|
|
57 |
1680
|
aaronmk
|
def remove_prefixes(prefixes, str_):
    '''Calls remove_prefix() once per prefix, in order, feeding the result of
    each call into the next one.'''
    remaining = str_
    for p in prefixes:
        remaining = remove_prefix(p, remaining)
    return remaining
|
60 |
|
|
|
61 |
|
|
def with_prefixes(prefixes, str_):
    '''Lazily produces str_ with each of the prefixes prepended in turn.
    @return generator of prefix+str_
    '''
    def prepend(prefix): return prefix+str_
    return (prepend(p) for p in prefixes)
|
62 |
|
|
|
63 |
4937
|
aaronmk
|
def remove_suffix(suffix, str_, removed_ref=None, require=False):
    '''Removes suffix from the end of str_, if it is there.
    @param removed_ref Optional one-element list used as an out-parameter:
        element 0 is set to whether str_ ended with suffix
    @param require Whether to raise an Exception when the suffix is missing
    @return str_ without the suffix, or str_ unchanged when it does not end
        with suffix (and require is false)
    '''
    # Identity check: `== None` would call the __eq__ of whatever object the
    # caller passed in
    if removed_ref is None: removed_ref = [False]
    
    removed_ref[0] = str_.endswith(suffix)
    # Slice up to an explicit end index. str_[:-len(suffix)] is wrong for an
    # empty suffix (which every string ends with): -0 == 0, so it would
    # return '' instead of the whole string.
    if removed_ref[0]: return str_[:len(str_)-len(suffix)]
    elif require: raise Exception(urepr(str_)+' needs '+urepr(suffix)+' suffix')
    else: return str_
|
70 |
|
|
|
71 |
5432
|
aaronmk
|
def find_any(haystack, needles):
    '''Returns the first of needles that occurs in haystack, or None when
    none of them do. Needles are tried in the order given.'''
    hits = (n for n in needles if haystack.find(n) >= 0)
    return next(hits, None)
|
75 |
1959
|
aaronmk
|
|
76 |
1622
|
aaronmk
|
def overlaps(str0, str1):
    '''Whether either string occurs somewhere inside the other'''
    if str0.find(str1) != -1: return True
    return str1.find(str0) != -1
|
77 |
|
|
|
78 |
5147
|
aaronmk
|
def flip_map(map_):
    '''
    Swaps the two elements of every pair in a replacement map.
    For use with replace_all(), replace_all_re().
    @param map_ [(from_, to), ...]
    @return [(to, from_), ...]
    '''
    flipped = []
    for src, dest in map_: flipped.append((dest, src))
    return flipped
|
85 |
|
|
|
86 |
|
|
def replace_all(map_, str_):
    '''
    Applies each literal replacement to the string in sequence, so later
    pairs operate on the output of earlier ones.
    @param map_ [(from_, to), ...]
    '''
    result = str_
    for pair in map_:
        old, new = pair
        result = result.replace(old, new)
    return result
|
92 |
|
|
|
93 |
|
|
def replace_all_re(map_, str_):
    '''
    Applies each regex substitution to the string in sequence, so later
    pairs operate on the output of earlier ones.
    @param map_ [(from_, to), ...]
    '''
    result = str_
    for pattern, repl in map_:
        result = re.compile(pattern).sub(repl, result)
    return result
|
99 |
|
|
|
100 |
2761
|
aaronmk
|
##### Escaping
|
101 |
|
|
|
102 |
14538
|
aaronmk
|
def no_esc_prefix_re(esc='\\'):
    '''Builds a regex fragment that matches at a position not preceded by
    esc and then captures (as one group) a run of doubled escs.'''
    esc_re = re.escape(esc)
    not_after_esc = '(?<!%s)' % esc_re
    esc_pairs = '((?:%s{2})*)' % esc_re # an even # of escs
    return not_after_esc+esc_pairs
|
105 |
|
|
|
106 |
|
|
def escd_char_re(escd_char, esc):
    '''Builds a regex for the literal escd_char preceded by the fragment from
    no_esc_prefix_re(esc). escd_char must differ from esc.'''
    assert escd_char != esc # not supported; use str_.replace() instead
    prefix_re = no_esc_prefix_re(esc)
    literal_re = re.escape(escd_char)
    return prefix_re+literal_re
|
109 |
|
|
|
110 |
|
|
def esc_re_map(map_):
    '''
    Turns an unescaping map into regex substitutions.
    for use with replace_all_re()
    @param map_ [(escd_char, char), ...] # use flip_map() if needed
        Must be non-empty: the esc char is read from its 1st entry.
    @return [(escd_char_re, escd_char_re_sub), ...]
    '''
    esc = map_[0][0][0] # esc char is 1st char of the 1st entry's escd_char
    re_map = []
    for escd_char, char in map_:
        # \1 puts back whatever the prefix group captured
        re_map.append((escd_char_re(escd_char, esc), r'\1'+char))
    return re_map
|
120 |
|
|
|
121 |
5142
|
aaronmk
|
def esc_quotes(str_, quote='"', esc='\\', quote_esc=None):
    '''Escapes the quote character in a string, and also the escape character
    itself when it differs from the quote.
    @param quote The character to escape
    @param esc The escape character; each occurrence is doubled
    @param quote_esc What each quote is replaced with. Defaults to esc+quote.
    @return The escaped string
    '''
    # Identity check rather than `== None`, which would dispatch to the
    # argument's own __eq__
    if quote_esc is None: quote_esc = esc+quote
    
    # Double the escs first, so that escs added by the quote replacement
    # below are not doubled as well
    if esc != quote: str_ = str_.replace(esc, esc+esc)
    str_ = str_.replace(quote, quote_esc)
    return str_
|
127 |
|
|
|
128 |
14538
|
aaronmk
|
def unesc_quotes(str_, quote='"', esc='\\', quote_esc=None):
    '''Replaces escaped quotes in a string with the bare quote, then collapses
    each doubled esc into a single one.
    @param quote What each escaped quote is replaced with
    @param esc The escape character
    @param quote_esc The escaped form of the quote. Defaults to esc+quote.
    @return The unescaped string
    '''
    # Identity check rather than `== None`, which would dispatch to the
    # argument's own __eq__
    if quote_esc is None: quote_esc = esc+quote
    
    # can't use `.decode('string_escape')` for this because it doesn't decode
    # custom escapes, such as GWT's \! for |
    unquoted = replace_all_re(esc_re_map([(quote_esc, quote)]), str_)
    return unquoted.replace(esc+esc, esc)
|
135 |
|
|
|
136 |
5161
|
aaronmk
|
# (raw character, escaped form) pairs, in the [(from_, to), ...] format taken
# by replace_all()
json_encode_map = [
    ('\n', '\\n'),
    ('\r', '\\r'),
]
|
140 |
5143
|
aaronmk
|
|
141 |
5161
|
aaronmk
|
def json_encode(str_):
    '''Escapes a string's double quotes (and backslashes) with esc_quotes(),
    then applies the replacements in json_encode_map.
    NOTE(review): only the characters listed in json_encode_map are escaped
    beyond quotes/backslashes; other control characters pass through as-is.
    '''
    quoted = esc_quotes(str_, '"')
    return replace_all(json_encode_map, quoted)
|
143 |
|
|
|
144 |
14537
|
aaronmk
|
def json_decode(str_):
    '''Decodes the backslash escapes in a string using the 'string_escape'
    codec'''
    decoded = str_.decode('string_escape')
    return decoded
|
145 |
|
|
|
146 |
2761
|
aaronmk
|
def esc_for_mogrify(query):
    '''Prepares a query for a mogrifying function by doubling every % in it.
    Meant to be applied right before the query is passed to that function.'''
    return '%%'.join(query.split('%'))
|
149 |
|
|
|
150 |
5148
|
aaronmk
|
def regexp_repl_esc(str_):
    '''Wraps str_ in a one-argument function that ignores its argument and
    returns str_ unchanged.
    Presumably meant to be passed as the replacement of re.sub(), which uses
    a function's return value without interpreting backslash escapes in it.
    '''
    def repl(m): return str_
    return repl
|
151 |
|
|
|
152 |
1401
|
aaronmk
|
##### Unicode
|
153 |
|
|
|
154 |
2882
|
aaronmk
|
def to_raw_str(str_):
    '''Returns the UTF-8 encoded form of a unicode object. Any other value is
    returned as-is.'''
    if not isinstance(str_, unicode): return str_
    return str_.encode('utf_8')
|
157 |
|
|
|
158 |
1295
|
aaronmk
|
# StreamReader class (factory) of the UTF-8 codec, for wrapping byte streams
unicode_reader = codecs.lookup('utf_8').streamreader
|
159 |
|
|
|
160 |
73
|
aaronmk
|
def to_unicode(str_):
    '''Converts a string to a unicode object. unicode objects are returned
    unchanged; anything else is decoded as UTF-8, falling back to Latin-1
    when that raises UnicodeDecodeError.
    @raise AssertionError if even the last encoding could not decode it
    '''
    if isinstance(str_, unicode): return str_
    for encoding in ('utf_8', 'latin_1'):
        try: return unicode(str_, encoding)
        except UnicodeDecodeError: continue # fall through to next encoding
    raise AssertionError(encoding+' is not a catch-all encoding')
|
167 |
340
|
aaronmk
|
|
168 |
3748
|
aaronmk
|
def ustr(value):
    '''Like built-in str() but converts to unicode object.
    Values that util.is_str() accepts are used directly; otherwise the
    value's own __str__() is called when it has one, else str().'''
    if util.is_str(value): text = value # already a string
    else:
        if hasattr(value, '__str__'): text = value.__str__()
        else: text = str(value)
    return to_unicode(text)
|
174 |
1232
|
aaronmk
|
|
175 |
3747
|
aaronmk
|
def urepr(value):
    '''Like built-in repr() but converts to unicode object.
    The value's own __repr__() is called when it has one, else repr().'''
    text = value.__repr__() if hasattr(value, '__repr__') else repr(value)
    return to_unicode(text)
|
180 |
|
|
|
181 |
2502
|
aaronmk
|
def repr_no_u(value):
    '''Like built-in repr() but removes the "u" in `u'...'`.
    Only a leading u directly followed by a single quote is dropped.'''
    leading_u = re.compile(r"^u(?=')")
    return leading_u.sub(r'', urepr(value))
|
184 |
2502
|
aaronmk
|
|
185 |
1401
|
aaronmk
|
##### Line endings
|
186 |
|
|
|
187 |
1622
|
aaronmk
|
def extract_line_ending(line):
    '''Splits a line into its text and its line ending, by stripping one
    trailing LF and then one trailing CR (each only if present).
    @return tuple (contents, ending)'''
    without_lf = remove_suffix('\n', line)
    contents = remove_suffix('\r', without_lf)
    ending = line[len(contents):] # whatever was stripped off
    return (contents, ending)
|
191 |
714
|
aaronmk
|
|
192 |
1622
|
aaronmk
|
def remove_line_ending(line):
    '''Returns the line without its line ending, i.e. the contents part of
    extract_line_ending()'''
    contents, ending = extract_line_ending(line)
    return contents
|
193 |
|
|
|
194 |
|
|
def ensure_newl(str_):
    '''Returns the string with its line ending (if any) replaced by a single
    newline'''
    stripped = remove_line_ending(str_)
    return stripped+'\n'
|
195 |
|
|
|
196 |
856
|
aaronmk
|
def is_multiline(str_):
    '''Whether the string contains a newline anywhere other than as its very
    last character'''
    return '\n' in str_[:-1] # has newline before end
|
199 |
|
|
|
200 |
860
|
aaronmk
|
def remove_extra_newl(str_):
    '''Strips trailing newlines from a string, unless is_multiline() says it
    is multi-line, in which case it is returned unchanged'''
    if not is_multiline(str_): return str_.rstrip('\n')
    return str_
|
203 |
856
|
aaronmk
|
|
204 |
714
|
aaronmk
|
def std_newl(str_):
    '''Standardizes line endings: every CRLF pair and every remaining lone CR
    becomes a single LF'''
    return re.sub(r'\r\n?', '\n', str_)
|
205 |
|
|
|
206 |
3255
|
aaronmk
|
def join_lines(lines):
    '''Concatenates the lines, appending a newline to each one (including the
    last)'''
    terminated = [line+'\n' for line in lines]
    return ''.join(terminated)
|
207 |
|
|
|
208 |
1401
|
aaronmk
|
##### Whitespace
|
209 |
|
|
|
210 |
714
|
aaronmk
|
def cleanup(str_):
    '''Strips surrounding whitespace from the string, then standardizes its
    line endings with std_newl()'''
    trimmed = str_.strip()
    return std_newl(trimmed)
|
211 |
861
|
aaronmk
|
|
212 |
1364
|
aaronmk
|
def single_space(str_):
    '''Strips surrounding whitespace and collapses every run of two or more
    spaces into a single space'''
    trimmed = str_.strip()
    return re.compile(r' {2,}').sub(r' ', trimmed)
|
213 |
|
|
|
214 |
861
|
aaronmk
|
def one_line(str_):
    '''Runs cleanup() on the string, then replaces each newline, together
    with any spaces directly after it, with a single space'''
    cleaned = cleanup(str_)
    return re.sub(r'\n *', r' ', cleaned)
|
215 |
1761
|
aaronmk
|
|
216 |
|
|
##### Control characters
|
217 |
|
|
|
218 |
|
|
def is_ctrl(char):
    '''Whether char is a (non-printable) control character: its code point
    is below 32 and it is not whitespace'''
    if ord(char) >= 32: return False
    return not char.isspace()
|
221 |
|
|
|
222 |
|
|
def strip_ctrl(str_):
    '''Strips (non-printable) control characters, as identified by is_ctrl(),
    from anywhere in the string'''
    return ''.join(c for c in str_ if not is_ctrl(c))
|
225 |
2471
|
aaronmk
|
|
226 |
3172
|
aaronmk
|
##### Text
|
227 |
|
|
|
228 |
|
|
def first_word(str_):
    '''Returns everything before the first space, or the whole string when
    it has no space'''
    return str_.split(' ', 1)[0]
|
229 |
|
|
|
230 |
2471
|
aaronmk
|
##### Formatting
|
231 |
|
|
|
232 |
3466
|
aaronmk
|
def indent(str_, level=1, indent_str=' '):
    '''Prefixes every line of the string with `level` copies of indent_str.
    Trailing whitespace is stripped first, and the result always ends with
    exactly one newline.'''
    prefix = indent_str*level
    lines = str_.rstrip().split('\n')
    indented = [prefix+line for line in lines]
    return '\n'.join(indented)+'\n'
|
235 |
|
|
|
236 |
2477
|
aaronmk
|
def as_tt(str_):
    '''Surrounds a string with @ signs (presumably Redmine's inline-code
    markup, like the other as_*() formatters here)'''
    return ''.join(('@', str_, '@'))
|
237 |
|
|
|
238 |
2475
|
aaronmk
|
def as_code(str_, lang=None, multiline=True):
    '''Wraps a string in Redmine tags to syntax-highlight it.
    @param lang Value for the class attribute of the <code> tag. If None, no
        <code> tag is added.
    @param multiline Whether to also wrap the result in a <pre> tag
    @return The string, with its trailing newlines normalized to one and a
        newline added before it, inside the requested tags
    '''
    str_ = '\n'+str_.rstrip('\n')+'\n'
    # Identity check rather than `!= None`, which would dispatch to the
    # argument's own __ne__
    if lang is not None: str_ = '<code class="'+lang+'">'+str_+'</code>'
    if multiline: str_ = '<pre>'+str_+'</pre>'
    return str_
|
244 |
2477
|
aaronmk
|
|
245 |
2504
|
aaronmk
|
def as_inline_table(dict_, key_label='Output', value_label='Input', ustr=ustr):
    '''Formats a dict as `key: value` lines inside a Redmine <pre> tag, with
    a header line made from the two labels.
    @param ustr Function used to convert each key and value to a string
    '''
    def row(entry): return (': '.join(entry))+'\n'
    
    rows = [row([key_label, value_label])] # header
    for entry in dict_.iteritems():
        rows.append(row([ustr(v) for v in entry]))
    return '<pre>\n'+''.join(rows)+'</pre>'
|
252 |
2482
|
aaronmk
|
|
253 |
2504
|
aaronmk
|
def as_table(dict_, key_label='Output', value_label='Input', ustr=ustr):
    '''Formats a dict as a Redmine table: a header row made from the two
    labels, then one |-delimited row per entry with each key and value
    converted by ustr and wrapped by as_tt().
    @param ustr Function used to convert each key and value to a string
    '''
    def row(cells):
        # '' at both ends produces the outer border
        return ('|'.join(['']+cells+['']))+'\n'
    
    rows = [row([key_label, value_label])] # header
    for entry in dict_.iteritems():
        rows.append(row([as_tt(ustr(v)) for v in entry]))
    return '\n'+''.join(rows)+' ' # space protects last \n so blank line ends table
|