Project

General

Profile

1 73 aaronmk
# String manipulation
2
3 1295 aaronmk
import codecs
4 861 aaronmk
import re
5
6 1232 aaronmk
import util
7
8 4028 aaronmk
##### Sentinel values
9
10
class NonInternedStr(str):
    '''A str subclass whose instances are always distinct objects.

    Each instance is unique and does not compare identical with `is`, even to
    another instance with the same contents. Instances still compare equal
    with == to strings with the same contents. Useful for sentinel values.
    '''
    pass
13
14
# Empty-string sentinel object. It is a unique instance, so it is presumably
# meant to be detected with `is` rather than == (it still == '').
none_str = NonInternedStr()
15 4032 aaronmk
# Sentinel consisting only of whitespace characters (str.isspace() is true)
isspace_none_str = '\r\n\t\v' # clone-safe, but must be compared with ==
16 4028 aaronmk
17 1622 aaronmk
##### Parsing
18
19 4115 aaronmk
def raw_extra_len(str_):
    '''Calculates the difference between the byte length and the character
    length of a string. These lengths will differ for Unicode strings
    containing multi-byte characters.

    The byte length is measured on the result of to_raw_str().
    @param str_ The string to measure
    @return The byte length minus the character length
    '''
    return len(to_raw_str(str_))-len(str_)
24
25 3749 aaronmk
def concat(str0, str1, max_len):
    '''Concatenates two strings, ensuring that the combined byte length is no
    greater than the provided length limit.

    Only str0 is truncated to make room; str1 is always kept whole. If str1 by
    itself does not fit in max_len, str0 is dropped entirely and str1 is
    returned as is.

    Note that if the combined length is longer than max_len, the truncated
    length may sometimes be shorter than max_len if there are multi-byte
    characters after str0's truncation point. Thus, you *cannot determine if the
    string was truncated* by checking if the new length equals max_len.

    @param str0 The leading string, which is truncated as needed
    @param str1 The trailing string, which is never truncated
    @param max_len The maximum byte length of the result
    @return The (possibly truncated) str0 followed by str1
    '''
    # Use raw_extra_len() because Unicode characters can be multi-byte, and
    # length limits often apply to the byte length, not the character length.
    max_len -= raw_extra_len(str0)+raw_extra_len(str1)
    # Clamp at 0: a negative slice end would count from the end of str0 and
    # keep most of it, instead of dropping all of it
    str0_len = max(max_len-len(str1), 0)
    return str0[:str0_len]+str1
37 2932 aaronmk
38 1622 aaronmk
def split(sep, str_):
    '''Like str_.split(sep), but returns [] (instead of [""]) if str_ == ""
    @param sep The separator to split on
    @param str_ The string to split
    @return The list of parts
    '''
    if str_ == '': return []
    else: return str_.split(sep)
42
43 4937 aaronmk
def remove_prefix(prefix, str_, removed_ref=None, require=False):
    '''Removes a prefix from the start of a string, if it is present.
    @param prefix The prefix to remove
    @param str_ The string to remove it from
    @param removed_ref Optional list whose element 0 is set to whether the
        prefix was present (output parameter)
    @param require Whether to raise an exception if the prefix is missing
    @return str_ without the prefix, or str_ unchanged if it lacks the prefix
    @throws Exception if require is set and str_ lacks the prefix
    '''
    if removed_ref is None: removed_ref = [False]
    
    removed_ref[0] = str_.startswith(prefix)
    if removed_ref[0]: return str_[len(prefix):]
    elif require: raise Exception(urepr(str_)+' needs '+urepr(prefix)+' prefix')
    else: return str_
50
51 1680 aaronmk
def remove_prefixes(prefixes, str_):
    '''Removes each of the prefixes that is present, in the order given.

    Each prefix is matched against what remains after the earlier ones were
    removed, so the order of the prefixes matters.
    @param prefixes Iterable of prefixes to remove
    @param str_ The string to remove them from
    @return The string without the prefixes that matched
    '''
    for prefix in prefixes: str_ = remove_prefix(prefix, str_)
    return str_
54
55
def with_prefixes(prefixes, str_):
    '''Prepends each of the prefixes to a string.
    @param prefixes Iterable of prefixes
    @param str_ The string to add each prefix to
    @return A generator of the prefixed strings, in the order of the prefixes
    '''
    prefixed = (prefix+str_ for prefix in prefixes)
    return prefixed
56
57 4937 aaronmk
def remove_suffix(suffix, str_, removed_ref=None, require=False):
    '''Removes a suffix from the end of a string, if it is present.
    @param suffix The suffix to remove
    @param str_ The string to remove it from
    @param removed_ref Optional list whose element 0 is set to whether the
        suffix was present (output parameter)
    @param require Whether to raise an exception if the suffix is missing
    @return str_ without the suffix, or str_ unchanged if it lacks the suffix
    @throws Exception if require is set and str_ lacks the suffix
    '''
    if removed_ref is None: removed_ref = [False]
    
    removed_ref[0] = str_.endswith(suffix)
    # Compute the end index from the start of the string: str_[:-len(suffix)]
    # would be str_[:0] == '' for an empty suffix
    if removed_ref[0]: return str_[:len(str_)-len(suffix)]
    elif require: raise Exception(urepr(str_)+' needs '+urepr(suffix)+' suffix')
    else: return str_
64
65 1959 aaronmk
def contains_any(haystack, needles):
    '''Whether a string contains at least one of the given substrings.
    @param haystack The string to search in
    @param needles Iterable of substrings to search for
    @return True if any needle occurs in haystack; False if none does
        (including when there are no needles)
    '''
    return any(haystack.find(needle) >= 0 for needle in needles)
69
70 1622 aaronmk
def overlaps(str0, str1):
    '''Whether either of two strings occurs inside the other.
    @return True if str1 is a substring of str0 or vice versa
    '''
    if str0.find(str1) >= 0: return True
    return str1.find(str0) >= 0
71
72 2761 aaronmk
##### Escaping
73
74 5142 aaronmk
def esc_quotes(str_, quote='"', esc='\\', quote_esc=None):
    '''Escapes the quote characters (and the escape characters) in a string.
    @param str_ The string to escape
    @param quote The quote character to escape
    @param esc The escape character
    @param quote_esc What to replace each quote with (default: esc+quote)
    @return The escaped string, without surrounding quotes
    '''
    if quote_esc is None: quote_esc = esc+quote
    
    # Double the existing escape chars first, so that the escape chars added
    # for the quotes below are not themselves doubled. When the quote is its
    # own escape char (quote doubling), the quote replacement handles both.
    if esc != quote: str_ = str_.replace(esc, esc+esc)
    str_ = str_.replace(quote, quote_esc)
    return str_
80
81 5143 aaronmk
def json_encode(str_):
    '''Escapes the double quotes in a string using esc_quotes()'s default
    escape character. Does not add the surrounding quotes.
    '''
    return esc_quotes(str_, quote='"')
82
83 2761 aaronmk
def esc_for_mogrify(query):
    '''Escapes a query right before being passed to a mogrifying function.

    Doubles each "%" so that it is not treated as a format placeholder.
    @param query The query string to escape
    @return The query with every % replaced by %%
    '''
    return query.replace('%', '%%')
86
87 1401 aaronmk
##### Unicode
88
89 2882 aaronmk
def to_raw_str(str_):
    '''Converts a Unicode string to a UTF-8-encoded byte string.
    @param str_ The string to convert
    @return The UTF-8 bytes if str_ is a Unicode string; otherwise str_ itself,
        unchanged
    '''
    try: unicode_type = unicode
    except NameError: unicode_type = str # Python 3: str is the Unicode type
    
    if isinstance(str_, unicode_type): str_ = str_.encode('utf_8')
    return str_
92
93 1295 aaronmk
# The codecs stream-reader factory for UTF-8; call it on a byte stream to get
# a wrapper that decodes what is read from the stream
unicode_reader = codecs.getreader('utf_8')
94
95 73 aaronmk
def to_unicode(str_):
    '''Converts a string to a Unicode string, decoding it if needed.

    A string that is not already Unicode is decoded as UTF-8 or, if it is not
    valid UTF-8, as Latin-1.
    @param str_ The string to convert
    @return str_ itself if it is already a Unicode string; otherwise the
        decoded string
    '''
    try: unicode_type = unicode
    except NameError: unicode_type = str # Python 3: str is the Unicode type
    
    if isinstance(str_, unicode_type): return str_
    encodings = ['utf_8', 'latin_1']
    for encoding in encodings:
        try: return unicode_type(str_, encoding)
        except UnicodeDecodeError: pass # try the next encoding
    # Only reached if the last encoding above failed to decode the string
    raise AssertionError(encoding+' is not a catch-all encoding')
102 340 aaronmk
103 3748 aaronmk
def ustr(value):
    '''Like built-in str() but converts to unicode object
    @param value The value to convert
    @return The string form of the value, passed through to_unicode()
    '''
    if util.is_str(value): str_ = value # already a string
    # NOTE(review): __str__() is presumably called directly so that a unicode
    # return value is not coerced to a byte string by str() -- confirm
    elif hasattr(value, '__str__'): str_ = value.__str__()
    else: str_ = str(value)
    return to_unicode(str_)
109 1232 aaronmk
110 3747 aaronmk
def urepr(value):
    '''Like built-in repr() but converts to unicode object
    @param value The value to get the representation of
    @return The representation of the value, passed through to_unicode()
    '''
    if hasattr(value, '__repr__'): str_ = value.__repr__()
    else: str_ = repr(value)
    return to_unicode(str_)
115
116 2502 aaronmk
def repr_no_u(value):
    '''Like built-in repr() but removes the "u" in `u'...'`

    Only a "u" at the very start of the representation that is directly
    followed by a single quote is removed.
    @param value The value to get the representation of
    @return The representation, as returned by urepr(), without the prefix
    '''
    return re.sub(r"^u(?=')", r'', urepr(value))
119 2502 aaronmk
120 1401 aaronmk
##### Line endings
121
122 1622 aaronmk
def extract_line_ending(line):
    '''Splits a line into its contents and its line ending.

    Removes at most one trailing LF and then at most one trailing CR, so the
    LF, CR, and CRLF endings are all recognized.
    @param line The line, with or without a line ending
    @return tuple (contents, ending); ending is "" if there is no line ending
    '''
    contents = remove_suffix('\r', remove_suffix('\n', line))
    return (contents, line[len(contents):])
126 714 aaronmk
127 1622 aaronmk
def remove_line_ending(line):
    '''Returns the line without its trailing line ending, if it has one.
    @param line The line, with or without a line ending
    @return The contents part of extract_line_ending()'s result
    '''
    contents, _ending = extract_line_ending(line)
    return contents
128
129
def ensure_newl(str_):
    '''Makes a string end in exactly one newline.

    Any line ending that remove_line_ending() strips is replaced by the
    newline.
    @return The string without its line ending, plus a newline
    '''
    contents = remove_line_ending(str_)
    return contents+'\n'
130
131 856 aaronmk
def is_multiline(str_):
    '''Whether a string has a newline anywhere other than as its last
    character. A single trailing newline does not make a string multiline.
    @param str_ The string to check
    @return True if the first newline comes before the end of the string
    '''
    newl_idx = str_.find('\n')
    return newl_idx >= 0 and newl_idx != len(str_)-1 # has newline before end
134
135 860 aaronmk
def remove_extra_newl(str_):
    '''Removes the trailing newline of a single-line string.

    A string that is_multiline() is returned unchanged, trailing newlines
    included.
    @param str_ The string to clean up
    @return The string without the unneeded trailing newline
    '''
    if is_multiline(str_): return str_
    else: return str_.rstrip('\n')
138 856 aaronmk
139 714 aaronmk
def std_newl(str_):
    '''Standardizes the line endings of a string to LF.

    CRLF pairs are converted first, so that each becomes just one LF; then the
    remaining lone CRs are converted.
    @return The string with every CRLF and CR replaced by LF
    '''
    without_crlf = str_.replace('\r\n', '\n')
    return without_crlf.replace('\r', '\n')
140
141 3255 aaronmk
def join_lines(lines):
    '''Joins lines into one string, ending each line with a newline.
    @param lines Iterable of lines without line endings
    @return The concatenated, newline-terminated lines ("" if there are none)
    '''
    terminated = [line+'\n' for line in lines]
    return ''.join(terminated)
142
143 1401 aaronmk
##### Whitespace
144
145 714 aaronmk
def cleanup(str_):
    '''Strips the leading and trailing whitespace from a string and
    standardizes its line endings using std_newl().
    '''
    stripped = str_.strip()
    return std_newl(stripped)
146 861 aaronmk
147 1364 aaronmk
def single_space(str_):
    '''Strips the leading and trailing whitespace from a string and collapses
    each run of two or more spaces into a single space.
    '''
    trimmed = str_.strip()
    return re.sub(r' {2,}', r' ', trimmed)
148
149 861 aaronmk
def one_line(str_):
    '''Puts a string on one line: after cleanup(), each newline, together with
    the spaces that directly follow it, is replaced by a single space.
    '''
    cleaned = cleanup(str_)
    return re.sub(r'\n *', r' ', cleaned)
150 1761 aaronmk
151
##### Control characters
152
153
def is_ctrl(char):
    '''Whether char is a (non-printable) control character

    Only characters with a code below 32 count, and whitespace characters
    (such as tab and newline) are not considered control characters.
    @param char A single character
    '''
    return ord(char) < 32 and not char.isspace()
156
157
def strip_ctrl(str_):
    '''Strips (non-printable) control characters

    Removes every character for which is_ctrl() is true, wherever it is in the
    string.
    @param str_ The string to strip
    @return The string without its control characters
    '''
    return ''.join(char for char in str_ if not is_ctrl(char))
160 2471 aaronmk
161 3172 aaronmk
##### Text
162
163
def first_word(str_):
    '''Returns the part of a string before its first space (the whole string
    if it contains no space).
    '''
    head, _sep, _rest = str_.partition(' ')
    return head
164
165 2471 aaronmk
##### Formatting
166
167 3466 aaronmk
def indent(str_, level=1, indent_str='    '):
    '''Indents every line of a string.

    Trailing whitespace (including trailing blank lines) is stripped first, and
    the result always ends in exactly one newline.
    @param str_ The string to indent
    @param level How many times to repeat indent_str in front of each line
    @param indent_str The string that makes up one level of indentation
    @return The indented, newline-terminated string
    '''
    indent_str *= level
    return ('\n'.join((indent_str+l for l in str_.rstrip().split('\n'))))+'\n'
170
171 2477 aaronmk
def as_tt(str_):
    '''Wraps a string in "@" characters (presumably the Redmine markup for
    inline monospaced text, judging by the other formatting functions).
    '''
    delim = '@'
    return delim+str_+delim
172
173 2475 aaronmk
def as_code(str_, lang=None, multiline=True):
    '''Wraps a string in Redmine tags to syntax-highlight it.
    @param str_ The code to wrap; its trailing newlines are normalized to one
    @param lang The language to put in the <code> tag's class; if None, no
        <code> tag is added
    @param multiline Whether to wrap everything in a <pre> tag
    @return The code, on its own lines, inside the requested tags
    '''
    str_ = '\n'+str_.rstrip('\n')+'\n'
    if lang is not None: str_ = '<code class="'+lang+'">'+str_+'</code>'
    if multiline: str_ = '<pre>'+str_+'</pre>'
    return str_
179 2477 aaronmk
180 2504 aaronmk
def as_inline_table(dict_, key_label='Output', value_label='Input', ustr=ustr):
    '''Formats a dict as "key: value" lines inside a <pre> tag.

    The first line contains the two labels; each following line contains one
    entry of the dict.
    @param dict_ The dict to format
    @param key_label The heading for the keys
    @param value_label The heading for the values
    @param ustr The function used to convert each key and value to a string
    @return The <pre> block
    '''
    def row(entry): return (': '.join(entry))+'\n'
    rows = [row([key_label, value_label])]
    # items() rather than iteritems() so that this also runs on Python 3
    rows.extend(row([ustr(v) for v in entry]) for entry in dict_.items())
    return '<pre>\n'+''.join(rows)+'</pre>'
187 2482 aaronmk
188 2504 aaronmk
def as_table(dict_, key_label='Output', value_label='Input', ustr=ustr):
    '''Wraps a dict in Redmine tags to format it as a table.

    The first row contains the two labels; each following row contains one
    entry of the dict, with the key and the value each wrapped by as_tt().
    @param dict_ The dict to format
    @param key_label The heading for the key column
    @param value_label The heading for the value column
    @param ustr The function used to convert each key and value to a string
    @return The table markup, starting with a newline
    '''
    def row(entry): return ('|'.join(['']+entry+['']))+'\n'# '' for outer border
    rows = [row([key_label, value_label])]
    # items() rather than iteritems() so that this also runs on Python 3
    rows.extend(row([as_tt(ustr(v)) for v in entry])
        for entry in dict_.items())
    # space protects last \n so blank line ends table
    return '\n'+''.join(rows)+' '