Project

General

Profile

1 73 aaronmk
# String manipulation
2
3 1295 aaronmk
import codecs
4 861 aaronmk
import re
5
6 1232 aaronmk
import util
7
8 4028 aaronmk
##### Sentinel values
9
10
class NonInternedStr(str):
    '''A str subclass whose instances are always separate objects.

    Each instance is unique and does not compare identical with `is`, even
    when it compares equal to another string with ==.
    '''
13
14
# Empty-string sentinel. As a NonInternedStr instance it is a distinct object
# from any other '' value, so it can be recognized with `is`.
none_str = NonInternedStr()
15 4032 aaronmk
# Sentinel made up only of whitespace characters (CR, LF, tab, vertical tab).
# It is a plain string literal, so test for it with ==, not `is`.
isspace_none_str = '\r\n\t\v' # clone-safe, but must be compared with ==
16 4028 aaronmk
17 1622 aaronmk
##### Parsing
18
19 4115 aaronmk
def raw_extra_len(str_):
    '''Returns how many more bytes than characters a string occupies.

    This is the length of to_raw_str(str_) minus the length of str_ itself.
    The two lengths differ for Unicode strings containing multi-byte
    characters.
    '''
    byte_len = len(to_raw_str(str_))
    char_len = len(str_)
    return byte_len - char_len
24
25 3749 aaronmk
def concat(str0, str1, max_len):
    '''Concatenates two strings, ensuring that the combined byte length is no
    greater than the provided length limit.

    str0 is truncated first so that str1 is kept whole. If str1 alone does not
    fit, a truncated str1 is returned and str0 is dropped entirely.

    Note that if the combined length is longer than max_len, the truncated
    length may sometimes be shorter than max_len if there are multi-byte
    characters after str0's truncation point. Thus, you *cannot determine if the
    string was truncated* by checking if the new length equals max_len.

    @param max_len Maximum byte length of the result
    @return The (possibly truncated) concatenation of str0 and str1
    '''
    # Use raw_extra_len() because Unicode characters can be multi-byte, and
    # length limits often apply to the byte length, not the character length.
    max_len -= raw_extra_len(str0)+raw_extra_len(str1)
    # Clamp at 0: a negative limit would be used as a negative slice index
    # below, which counts from the end of the string and can exceed the limit.
    if max_len < 0: max_len = 0
    str0_new_len = max_len - len(str1)
    if str0_new_len < 0: return str1[:max_len] # str1 too long
    return str0[:str0_new_len]+str1
39 2932 aaronmk
40 1622 aaronmk
def split(sep, str_):
    '''Splits str_ on sep, but returns [] (instead of ['']) if str_ == ""'''
    if str_ == '':
        return []
    return str_.split(sep)
44
45 4937 aaronmk
def remove_prefix(prefix, str_, removed_ref=None, require=False):
    '''Strips prefix from the start of str_, if it is there.

    @param removed_ref Optional one-element list; element 0 is set to whether
        str_ started with the prefix
    @param require Whether to raise an Exception if the prefix is missing
    @return str_ without the prefix, or str_ unchanged if it had no such prefix
    '''
    # Identity test, so a caller-supplied container is never mistaken for None
    if removed_ref is None: removed_ref = [False]

    removed_ref[0] = str_.startswith(prefix)
    if removed_ref[0]: return str_[len(prefix):]
    elif require: raise Exception(urepr(str_)+' needs '+urepr(prefix)+' prefix')
    else: return str_
52
53 1680 aaronmk
def remove_prefixes(prefixes, str_):
    '''Strips each of the prefixes in turn, in iteration order.

    Each prefix is applied to the result of the previous removals, using
    remove_prefix(); prefixes that do not match leave the string unchanged.
    '''
    remaining = str_
    for prefix in prefixes:
        remaining = remove_prefix(prefix, remaining)
    return remaining
56
57
def with_prefixes(prefixes, str_):
    '''Prepends each prefix to str_.

    @return A generator of prefix+str_, one item per prefix
    '''
    return (prefix+str_ for prefix in prefixes)
58
59 4937 aaronmk
def remove_suffix(suffix, str_, removed_ref=None, require=False):
    '''Strips suffix from the end of str_, if it is there.

    @param removed_ref Optional one-element list; element 0 is set to whether
        str_ ended with the suffix
    @param require Whether to raise an Exception if the suffix is missing
    @return str_ without the suffix, or str_ unchanged if it had no such suffix
    '''
    # Identity test, so a caller-supplied container is never mistaken for None
    if removed_ref is None: removed_ref = [False]

    removed_ref[0] = str_.endswith(suffix)
    if removed_ref[0]:
        # Use an explicit end index: for an empty suffix, str_[:-len(suffix)]
        # would be str_[:0], which discards the whole string
        return str_[:len(str_)-len(suffix)]
    elif require: raise Exception(urepr(str_)+' needs '+urepr(suffix)+' suffix')
    else: return str_
66
67 1959 aaronmk
def contains_any(haystack, needles):
    '''Whether at least one of the needles occurs as a substring of haystack'''
    return any(haystack.find(needle) >= 0 for needle in needles)
71
72 1622 aaronmk
def overlaps(str0, str1):
    '''Whether either string occurs as a substring of the other'''
    if str0.find(str1) >= 0:
        return True
    return str1.find(str0) >= 0
73
74 5147 aaronmk
def flip_map(map_):
    '''
    Swaps the two items of every pair in a replacement map.
    For use with replace_all(), replace_all_re().
    @param map_ [(from_, to), ...]
    @return [(to, from_), ...]
    '''
    flipped = []
    for from_, to in map_:
        flipped.append((to, from_))
    return flipped
81
82
def replace_all(map_, str_):
    '''
    Applies each literal replacement in order, each one to the result of the
    previous one.
    @param map_ [(from_, to), ...]
    @return str_ with all the replacements applied
    '''
    result = str_
    for from_, to in map_:
        result = result.replace(from_, to)
    return result
88
89
def replace_all_re(map_, str_):
    '''
    Applies each regular-expression substitution (via re.sub()) in order, each
    one to the result of the previous one.
    @param map_ [(from_, to), ...]
    @return str_ with all the substitutions applied
    '''
    result = str_
    for from_, to in map_:
        result = re.sub(from_, to, result)
    return result
95
96 2761 aaronmk
##### Escaping
97
98 5142 aaronmk
def esc_quotes(str_, quote='"', esc='\\', quote_esc=None):
    '''Escapes the quote character, and the escape character itself, in a string.

    @param quote The quote character to escape
    @param esc The escape character; each occurrence is doubled, unless it is
        the same as quote
    @param quote_esc What each quote is replaced with; defaults to esc+quote
    @return The escaped string
    '''
    if quote_esc is None: quote_esc = esc+quote

    # Double existing escape chars first, so the ones added below are kept as-is
    if esc != quote: str_ = str_.replace(esc, esc+esc)
    str_ = str_.replace(quote, quote_esc)
    return str_
104
105 5161 aaronmk
# (from_, to) pairs that turn literal newline and carriage-return characters
# into their backslash escape sequences; used by json_encode()
json_encode_map = [('\n', r'\n'), ('\r', r'\r')]
109 5143 aaronmk
110 5161 aaronmk
def json_encode(str_):
    '''Escapes double quotes and backslashes with esc_quotes(), then applies
    the replacements in json_encode_map.'''
    quoted = esc_quotes(str_, '"')
    return replace_all(json_encode_map, quoted)
112
113 2761 aaronmk
def esc_for_mogrify(query):
    '''Escapes a query right before it is passed to a mogrifying function, by
    doubling every % sign.'''
    escaped = query.replace('%', '%%')
    return escaped
116
117 5148 aaronmk
def regexp_repl_esc(str_):
    '''Wraps a string in a one-argument function that returns it unchanged.

    NOTE(review): presumably meant to be passed as the replacement of re.sub(),
    so that str_ is inserted literally -- confirm against callers.
    @return A function that ignores its argument (m) and returns str_
    '''
    def repl(m):
        return str_
    return repl
118
119 1401 aaronmk
##### Unicode
120
121 2882 aaronmk
def to_raw_str(str_):
    '''Returns str_ encoded as UTF-8 if it is a unicode object; any other value
    is returned unchanged.'''
    if not isinstance(str_, unicode):
        return str_
    return str_.encode('utf_8')
124
125 1295 aaronmk
# Stream reader for the UTF-8 codec, as returned by codecs.getreader()
unicode_reader = codecs.getreader('utf_8')
126
127 73 aaronmk
def to_unicode(str_):
    '''Converts a string to a unicode object, decoding it if needed.

    A value that is not already unicode is decoded as UTF-8 if possible, and
    as Latin-1 otherwise.
    @return str_ itself if it is already unicode, else the decoded string
    '''
    if isinstance(str_, unicode): return str_
    encodings = ['utf_8', 'latin_1']
    for encoding in encodings:
        try: return unicode(str_, encoding)
        except UnicodeDecodeError: continue # fall through to the next encoding
    # Only reached if the last encoding in the list also failed to decode
    raise AssertionError(encoding+' is not a catch-all encoding')
134 340 aaronmk
135 3748 aaronmk
def ustr(value):
    '''Like built-in str() but converts to unicode object'''
    if util.is_str(value):
        return to_unicode(value) # already a string
    if hasattr(value, '__str__'):
        return to_unicode(value.__str__())
    return to_unicode(str(value))
141 1232 aaronmk
142 3747 aaronmk
def urepr(value):
    '''Like built-in repr() but converts to unicode object'''
    if hasattr(value, '__repr__'):
        return to_unicode(value.__repr__())
    return to_unicode(repr(value))
147
148 2502 aaronmk
def repr_no_u(value):
    '''Like urepr() but removes the leading "u" in `u'...'`'''
    repr_ = urepr(value)
    return re.sub(r"^u(?=')", r'', repr_)
151 2502 aaronmk
152 1401 aaronmk
##### Line endings
153
154 1622 aaronmk
def extract_line_ending(line):
    '''Splits a line into its contents and its trailing line ending.

    One trailing LF, and then one trailing CR, are removed with
    remove_suffix(); whatever was removed is the ending.
    @return tuple (contents, ending)
    '''
    without_lf = remove_suffix('\n', line)
    contents = remove_suffix('\r', without_lf)
    ending = line[len(contents):]
    return (contents, ending)
158 714 aaronmk
159 1622 aaronmk
def remove_line_ending(line):
    '''Returns the contents part of extract_line_ending(line)'''
    contents, ending = extract_line_ending(line)
    return contents
160
161
def ensure_newl(str_):
    '''Returns str_ with its line ending (if any) replaced by a single LF'''
    contents = remove_line_ending(str_)
    return contents+'\n'
162
163 856 aaronmk
def is_multiline(str_):
    '''Whether the first newline in str_ comes before its last character'''
    first_newl = str_.find('\n')
    if first_newl < 0:
        return False # no newline at all
    return first_newl != len(str_)-1 # has newline before end
166
167 860 aaronmk
def remove_extra_newl(str_):
    '''Strips trailing newlines from str_, unless it is multiline (see
    is_multiline()), in which case it is returned unchanged.'''
    if not is_multiline(str_):
        return str_.rstrip('\n')
    return str_
170 856 aaronmk
171 714 aaronmk
def std_newl(str_):
    '''Standardizes line endings: CRLF and bare CR both become LF'''
    without_crlf = str_.replace('\r\n', '\n')
    return without_crlf.replace('\r', '\n')
172
173 3255 aaronmk
def join_lines(lines):
    '''Concatenates the lines, appending a newline to each one (including the
    last)'''
    terminated = [line+'\n' for line in lines]
    return ''.join(terminated)
174
175 1401 aaronmk
##### Whitespace
176
177 714 aaronmk
def cleanup(str_):
    '''Strips surrounding whitespace, then standardizes line endings with
    std_newl()'''
    stripped = str_.strip()
    return std_newl(stripped)
178 861 aaronmk
179 1364 aaronmk
def single_space(str_):
    '''Strips surrounding whitespace and collapses each run of two or more
    spaces into a single space'''
    stripped = str_.strip()
    return re.sub(r' {2,}', r' ', stripped)
180
181 861 aaronmk
def one_line(str_):
    '''Puts a string on one line: after cleanup(), each newline together with
    any spaces right after it is replaced by a single space'''
    cleaned = cleanup(str_)
    return re.sub(r'\n *', r' ', cleaned)
182 1761 aaronmk
183
##### Control characters
184
185
def is_ctrl(char):
    '''Whether char is a (non-printable) control character, i.e. its code is
    below 32 and it is not whitespace'''
    if ord(char) >= 32:
        return False
    return not char.isspace()
188
189
def strip_ctrl(str_):
    '''Strips (non-printable) control characters, as identified by is_ctrl()'''
    kept = [char for char in str_ if not is_ctrl(char)]
    return ''.join(kept)
192 2471 aaronmk
193 3172 aaronmk
##### Text
194
195
def first_word(str_):
    '''Returns the part of str_ before its first space, or all of str_ if it
    contains no space'''
    word, sep, rest = str_.partition(' ')
    return word
196
197 2471 aaronmk
##### Formatting
198
199 3466 aaronmk
def indent(str_, level=1, indent_str='    '):
    '''Indents every line of str_ and ends the result with one newline.

    Trailing whitespace of str_ is stripped before it is split into lines.
    @param level How many times indent_str is repeated before each line
    @param indent_str The string for a single indentation level
    '''
    prefix = indent_str*level
    lines = str_.rstrip().split('\n')
    indented = [prefix+line for line in lines]
    return '\n'.join(indented)+'\n'
202
203 2477 aaronmk
def as_tt(str_):
    '''Wraps a string in @ signs.

    NOTE(review): presumably the Redmine inline-code markup, like the other
    formatting helpers here -- confirm.
    '''
    delim = '@'
    return delim+str_+delim
204
205 2475 aaronmk
def as_code(str_, lang=None, multiline=True):
    '''Wraps a string in Redmine tags to syntax-highlight it.

    Trailing newlines are stripped and the string is put on lines of its own.
    @param lang If given, the string is wrapped in a <code> tag with this class
    @param multiline Whether to wrap the result in a <pre> tag
    '''
    str_ = '\n'+str_.rstrip('\n')+'\n'
    if lang is not None: str_ = '<code class="'+lang+'">'+str_+'</code>'
    if multiline: str_ = '<pre>'+str_+'</pre>'
    return str_
211 2477 aaronmk
212 2504 aaronmk
def as_inline_table(dict_, key_label='Output', value_label='Input', ustr=ustr):
    '''Wraps a dict in Redmine tags to format it as a table.

    The result is a <pre> block with one "key: value" line per entry, after a
    header line made of the two labels.
    @param ustr Function that converts each key and value to a string
    '''
    def row(entry): return (': '.join(entry))+'\n'

    rows = [row([key_label, value_label])]
    for entry in dict_.iteritems():
        rows.append(row([ustr(v) for v in entry]))
    return '<pre>\n'+''.join(rows)+'</pre>'
219 2482 aaronmk
220 2504 aaronmk
def as_table(dict_, key_label='Output', value_label='Input', ustr=ustr):
    '''Wraps a dict in Redmine tags to format it as a table.

    Each row is its cells separated and enclosed by "|"; the header row holds
    the two labels, and each key and value is passed through as_tt().
    @param ustr Function that converts each key and value to a string
    '''
    def row(entry):
        cells = ['']+entry+[''] # '' for outer border
        return ('|'.join(cells))+'\n'

    rows = [row([key_label, value_label])]
    for entry in dict_.iteritems():
        rows.append(row([as_tt(ustr(v)) for v in entry]))
    return '\n'+''.join(rows)+' ' # space protects last \n so blank line ends table