Project

General

Profile

1 73 aaronmk
# String manipulation
2
3 1295 aaronmk
import codecs
4 861 aaronmk
import re
5
6 1232 aaronmk
import util
7
8 4028 aaronmk
##### Sentinel values
9
10
class NonInternedStr(str):
    '''A str subclass for creating sentinel values.

    Each instance is a separate object, so it never compares identical with
    `is` to another instance, even though the two compare equal with ==.
    '''
13
14
# Empty-string sentinel: equal to '' under ==, but a separate object, so it can
# be recognized with `is`
none_str = NonInternedStr()
15 4032 aaronmk
# Sentinel made up only of whitespace characters (CR, LF, tab, vertical tab).
# It is a plain string literal rather than a NonInternedStr.
isspace_none_str = '\r\n\t\v' # clone-safe, but must be compared with ==
16 4028 aaronmk
17 1622 aaronmk
##### Parsing
18
19 4115 aaronmk
def raw_extra_len(str_):
    '''Returns how many more bytes than characters a string takes up.

    The byte length is that of to_raw_str(str_). The two lengths differ for
    Unicode strings containing multi-byte characters.
    @return byte length - character length
    '''
    char_len = len(str_)
    byte_len = len(to_raw_str(str_))
    return byte_len-char_len
24
25 3749 aaronmk
def concat(str0, str1, max_len):
    '''Concatenates two strings, truncating str0 as needed so that the combined
    byte length is no greater than the provided length limit.

    Only str0 is ever shortened; str1 is always appended whole. If str1 by
    itself is longer than the limit, str0 is dropped entirely and the result is
    just str1 (which is then longer than max_len).

    Note that if the combined length is longer than max_len, the truncated
    length may sometimes be shorter than max_len if there are multi-byte
    characters after str0's truncation point. Thus, you *cannot determine if the
    string was truncated* by checking if the new length equals max_len.

    @param str0 The string to truncate
    @param str1 The string to append unmodified
    @param max_len The maximum byte length of the result
    @return The truncated str0 followed by str1
    '''
    # Use raw_extra_len() because Unicode characters can be multi-byte, and
    # length limits often apply to the byte length, not the character length.
    max_len -= raw_extra_len(str0)+raw_extra_len(str1)
    # Clamp at 0: a negative slice end would count back from the end of str0
    # and keep most of it, instead of removing all of it
    str0_len = max(max_len-len(str1), 0)
    return str0[:str0_len]+str1
37 2932 aaronmk
38 1622 aaronmk
def split(sep, str_):
    '''Like str_.split(sep), but returns [] instead of [""] if str_ == ""'''
    if str_ != '': return str_.split(sep)
    return []
42
43 4937 aaronmk
def remove_prefix(prefix, str_, removed_ref=None, require=False):
    '''Removes prefix from the start of str_, if it is there.
    @param prefix The prefix to remove
    @param str_ The string to remove it from
    @param removed_ref Optional list whose element 0 is set to whether the
        prefix was present
    @param require Whether to raise an exception if the prefix is missing
    @return str_ without the prefix, or str_ unchanged if it was not present
    @throws Exception if require is set and str_ does not start with prefix
    '''
    if removed_ref is None: removed_ref = [False]
    
    removed_ref[0] = str_.startswith(prefix)
    if removed_ref[0]: return str_[len(prefix):]
    if require: raise Exception(urepr(str_)+' needs '+urepr(prefix)+' prefix')
    return str_
50
51 1680 aaronmk
def remove_prefixes(prefixes, str_):
    '''Applies remove_prefix() once for each prefix, in the order given, passing
    each result on to the next call.
    @return str_ with each matching prefix removed
    '''
    remaining = str_
    for prefix in prefixes:
        remaining = remove_prefix(prefix, remaining)
    return remaining
54
55
def with_prefixes(prefixes, str_):
    '''@return generator producing str_ prepended with each prefix in turn'''
    return (prefix+str_ for prefix in prefixes)
56
57 4937 aaronmk
def remove_suffix(suffix, str_, removed_ref=None, require=False):
    '''Removes suffix from the end of str_, if it is there.
    @param suffix The suffix to remove
    @param str_ The string to remove it from
    @param removed_ref Optional list whose element 0 is set to whether the
        suffix was present
    @param require Whether to raise an exception if the suffix is missing
    @return str_ without the suffix, or str_ unchanged if it was not present
    @throws Exception if require is set and str_ does not end with suffix
    '''
    if removed_ref is None: removed_ref = [False]
    
    removed_ref[0] = str_.endswith(suffix)
    if removed_ref[0]:
        # Use an explicit end index: for an empty suffix, str_[:-len(suffix)]
        # would be str_[:0], i.e. the empty string
        return str_[:len(str_)-len(suffix)]
    if require: raise Exception(urepr(str_)+' needs '+urepr(suffix)+' suffix')
    return str_
64
65 1959 aaronmk
def contains_any(haystack, needles):
    '''@return whether at least one of needles is found within haystack'''
    return any(haystack.find(needle) >= 0 for needle in needles)
69
70 1622 aaronmk
def overlaps(str0, str1):
    '''@return whether either string is found within the other'''
    if str0.find(str1) >= 0: return True
    return str1.find(str0) >= 0
71
72 5147 aaronmk
def flip_map(map_):
    '''
    Swaps the two sides of every pair in a replacement map.
    For use with replace_all(), replace_all_re().
    @param map_ [(from_, to), ...]
    @return [(to, from_), ...]
    '''
    flipped = []
    for from_, to in map_: flipped.append((to, from_))
    return flipped
79
80
def replace_all(map_, str_):
    '''
    Performs each literal replacement in order, each one on the output of the
    previous one.
    @param map_ [(from_, to), ...]
    @return str_ with all the replacements made
    '''
    result = str_
    for pair in map_:
        from_, to = pair
        result = result.replace(from_, to)
    return result
86
87
def replace_all_re(map_, str_):
    '''
    Performs each regular expression substitution (re.sub()) in order, each one
    on the output of the previous one.
    @param map_ [(from_, to), ...]
    @return str_ with all the substitutions made
    '''
    result = str_
    for pair in map_:
        from_, to = pair
        result = re.sub(from_, to, result)
    return result
93
94 2761 aaronmk
##### Escaping
95
96 5142 aaronmk
def esc_quotes(str_, quote='"', esc='\\', quote_esc=None):
    '''Escapes the quote character in a string, along with the escape character
    itself.
    @param str_ The string to escape
    @param quote The quote character to escape
    @param esc The escape character
    @param quote_esc What to replace each quote with; defaults to esc+quote
    @return The escaped string (without surrounding quotes)
    '''
    if quote_esc is None: quote_esc = esc+quote
    
    # If quotes are escaped by doubling them (esc == quote), there is no
    # separate escape character that needs escaping first
    if esc != quote: str_ = str_.replace(esc, esc+esc)
    return str_.replace(quote, quote_esc)
102
103 5161 aaronmk
# Literal replacements used by json_encode(): each raw line-break character is
# mapped to its backslash escape sequence.
# NOTE(review): other control characters (e.g. tab) are not mapped here --
# confirm whether any caller needs them escaped
json_encode_map = [('\n', r'\n'), ('\r', r'\r')]
107 5143 aaronmk
108 5161 aaronmk
def json_encode(str_):
    '''Escapes double quotes and backslashes using esc_quotes(), then applies
    the json_encode_map replacements.
    @return The escaped string (without surrounding quotes)
    '''
    quotes_escaped = esc_quotes(str_, '"')
    return replace_all(json_encode_map, quotes_escaped)
110
111 2761 aaronmk
def esc_for_mogrify(query):
    '''Escapes a query right before being passed to a mogrifying function, by
    doubling each % character.'''
    escaped = query.replace('%', '%%')
    return escaped
114
115 5148 aaronmk
def regexp_repl_esc(str_):
    '''Wraps a replacement string in a function that ignores the match and
    returns the string as-is. re.sub() uses the return value of a replacement
    function verbatim, so backslashes in str_ are not interpreted.
    @return function(match) -> str_
    '''
    def repl(match): return str_
    return repl
116
117 1401 aaronmk
##### Unicode
118
119 2882 aaronmk
def to_raw_str(str_):
    '''Converts a unicode string to a UTF-8-encoded byte string.
    @return The encoded byte string, or str_ unchanged if it is not a unicode
        string
    '''
    # type(u'') is `unicode` on Python 2 and `str` on Python 3, so this does not
    # depend on the Python-2-only `unicode` builtin
    if isinstance(str_, type(u'')): return str_.encode('utf_8')
    return str_
122
123 1295 aaronmk
# Stream reader factory (from the codecs module) for the UTF-8 encoding
unicode_reader = codecs.getreader('utf_8')
124
125 73 aaronmk
def to_unicode(str_):
    '''Decodes a byte string to a unicode string, trying UTF-8 first and then
    falling back to Latin-1.
    @return str_ unchanged if it is already unicode, otherwise the decoded
        string
    @throws AssertionError if none of the encodings could decode str_
    '''
    # type(u'') is `unicode` on Python 2 and `str` on Python 3, so this does not
    # depend on the Python-2-only `unicode` builtin
    unicode_ = type(u'')
    if isinstance(str_, unicode_): return str_
    encodings = ['utf_8', 'latin_1']
    for encoding in encodings:
        try: return unicode_(str_, encoding)
        except UnicodeDecodeError: pass # try the next encoding
    raise AssertionError(encoding+' is not a catch-all encoding')
132 340 aaronmk
133 3748 aaronmk
def ustr(value):
    '''Like built-in str() but converts to unicode object.
    Strings (per util.is_str()) are passed to to_unicode() directly; for other
    values, __str__() is used if present, and str() otherwise.
    '''
    if util.is_str(value): return to_unicode(value) # already a string
    if hasattr(value, '__str__'): return to_unicode(value.__str__())
    return to_unicode(str(value))
139 1232 aaronmk
140 3747 aaronmk
def urepr(value):
    '''Like built-in repr() but converts to unicode object.
    __repr__() is used if present, and repr() otherwise.
    '''
    if hasattr(value, '__repr__'): return to_unicode(value.__repr__())
    return to_unicode(repr(value))
145
146 2502 aaronmk
def repr_no_u(value):
    '''Like urepr() but removes the leading "u" in `u'...'`'''
    repr_ = urepr(value)
    return re.sub(r"^u(?=')", r'', repr_)
149 2502 aaronmk
150 1401 aaronmk
##### Line endings
151
152 1622 aaronmk
def extract_line_ending(line):
    '''Splits a line into its contents and its trailing line ending.
    A final LF is removed first, and then a final CR.
    @return tuple (contents, ending)
    '''
    without_lf = remove_suffix('\n', line)
    contents = remove_suffix('\r', without_lf)
    ending = line[len(contents):]
    return (contents, ending)
156 714 aaronmk
157 1622 aaronmk
def remove_line_ending(line):
    '''@return the contents part of extract_line_ending(line)'''
    contents, ending = extract_line_ending(line)
    return contents
158
159
def ensure_newl(str_):
    '''@return str_ with its line ending (if any) removed by
    remove_line_ending() and a single LF appended'''
    contents = remove_line_ending(str_)
    return contents+'\n'
160
161 856 aaronmk
def is_multiline(str_):
    '''@return whether str_ contains a newline whose first occurrence is not
    the last character of str_'''
    first_newl = str_.find('\n')
    if first_newl < 0: return False # no newline at all
    return first_newl != len(str_)-1 # has newline before end
164
165 860 aaronmk
def remove_extra_newl(str_):
    '''Removes trailing newlines from a string that is not multiline (per
    is_multiline()). Multiline strings are returned unchanged.'''
    if not is_multiline(str_): return str_.rstrip('\n')
    return str_
168 856 aaronmk
169 714 aaronmk
def std_newl(str_):
    '''Standardizes line endings: CRLF and lone CR are each turned into LF'''
    # CRLF is handled first so that it becomes one LF rather than two
    without_crlf = str_.replace('\r\n', '\n')
    return without_crlf.replace('\r', '\n')
170
171 3255 aaronmk
def join_lines(lines):
    '''@return the lines concatenated together, each followed by a newline'''
    terminated = [line+'\n' for line in lines]
    return ''.join(terminated)
172
173 1401 aaronmk
##### Whitespace
174
175 714 aaronmk
def cleanup(str_):
    '''Strips leading and trailing whitespace, then standardizes the line
    endings using std_newl()'''
    stripped = str_.strip()
    return std_newl(stripped)
176 861 aaronmk
177 1364 aaronmk
def single_space(str_):
    '''Strips leading and trailing whitespace, then collapses each run of two
    or more spaces into one space'''
    stripped = str_.strip()
    return re.sub(r' {2,}', r' ', stripped)
178
179 861 aaronmk
def one_line(str_):
    '''Cleans up str_ using cleanup(), then replaces each newline, along with
    any spaces directly after it, with a single space'''
    cleaned = cleanup(str_)
    return re.sub(r'\n *', r' ', cleaned)
180 1761 aaronmk
181
##### Control characters
182
183
def is_ctrl(char):
    '''Whether char is a (non-printable) control character, i.e. its character
    code is below 32 and it is not whitespace'''
    if ord(char) >= 32: return False
    return not char.isspace()
186
187
def strip_ctrl(str_):
    '''Strips (non-printable) control characters, as identified by is_ctrl()'''
    kept = [char for char in str_ if not is_ctrl(char)]
    return ''.join(kept)
190 2471 aaronmk
191 3172 aaronmk
##### Text
192
193
def first_word(str_):
    '''@return the part of str_ before its first space (all of str_ if it
    contains no space)'''
    word, sep, rest = str_.partition(' ')
    return word
194
195 2471 aaronmk
##### Formatting
196
197 3466 aaronmk
def indent(str_, level=1, indent_str='    '):
    '''Indents each line of str_.
    Trailing whitespace of str_ is stripped first, and a newline is appended to
    the result.
    @param level How many times to repeat indent_str before each line
    @param indent_str The indentation for one level
    '''
    prefix = indent_str*level
    lines = str_.rstrip().split('\n')
    indented = [prefix+line for line in lines]
    return '\n'.join(indented)+'\n'
200
201 2477 aaronmk
def as_tt(str_):
    '''Wraps a string in "@" delimiters (presumably Redmine's inline-code
    markup, like the other formatting helpers here -- confirm)'''
    delim = '@'
    return delim+str_+delim
202
203 2475 aaronmk
def as_code(str_, lang=None, multiline=True):
    '''Wraps a string in Redmine tags to syntax-highlight it.
    Trailing newlines of str_ are replaced by a single one, and a newline is
    added before it.
    @param str_ The code to wrap
    @param lang The language name to put in the <code> tag's class attribute, or
        None to omit the <code> tag
    @param multiline Whether to wrap the result in <pre> tags
    @return The wrapped string
    '''
    str_ = '\n'+str_.rstrip('\n')+'\n'
    if lang is not None: str_ = '<code class="'+lang+'">'+str_+'</code>'
    if multiline: str_ = '<pre>'+str_+'</pre>'
    return str_
209 2477 aaronmk
210 2504 aaronmk
def as_inline_table(dict_, key_label='Output', value_label='Input', ustr=ustr):
    '''Wraps a dict in Redmine tags to format it as a table.
    Each entry becomes a "key: value" line inside a <pre> block, after a header
    line made from the two labels.
    @param dict_ The dict to format
    @param key_label Header for the keys
    @param value_label Header for the values
    @param ustr Function used to convert each key and value to a string
    @return The formatted table
    '''
    def row(entry): return (': '.join(entry))+'\n'
    rows = [row([key_label, value_label])]
    # items() is used rather than the Python-2-only iteritems()
    for entry in dict_.items(): rows.append(row([ustr(v) for v in entry]))
    return '<pre>\n'+''.join(rows)+'</pre>'
217 2482 aaronmk
218 2504 aaronmk
def as_table(dict_, key_label='Output', value_label='Input', ustr=ustr):
    '''Wraps a dict in Redmine tags to format it as a table.
    Each entry becomes a "|key|value|" row with both cells wrapped by as_tt(),
    after a header row made from the two labels.
    @param dict_ The dict to format
    @param key_label Header for the keys
    @param value_label Header for the values
    @param ustr Function used to convert each key and value to a string
    @return The formatted table
    '''
    def row(entry): return ('|'.join(['']+entry+['']))+'\n'# '' for outer border
    rows = [row([key_label, value_label])]
    # items() is used rather than the Python-2-only iteritems()
    for entry in dict_.items():
        rows.append(row([as_tt(ustr(v)) for v in entry]))
    return '\n'+''.join(rows)+' ' # space protects last \n so blank line ends table