##// END OF EJS Templates
str_utils: added common non-ascii replacer
super-admin -
r4989:b0280397 default
parent child Browse files
Show More
@@ -1,135 +1,147 b''
1 1 # -*- coding: utf-8 -*-
2 2
3 3 # Copyright (C) 2011-2020 RhodeCode GmbH
4 4 #
5 5 # This program is free software: you can redistribute it and/or modify
6 6 # it under the terms of the GNU Affero General Public License, version 3
7 7 # (only), as published by the Free Software Foundation.
8 8 #
9 9 # This program is distributed in the hope that it will be useful,
10 10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 12 # GNU General Public License for more details.
13 13 #
14 14 # You should have received a copy of the GNU Affero General Public License
15 15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 16 #
17 17 # This program is dual-licensed. If you wish to learn more about the
18 18 # RhodeCode Enterprise Edition, including its added features, Support services,
19 19 # and proprietary license terms, please see https://rhodecode.com/licenses/
20 20
21 21 import logging
22 22 import rhodecode
23 import unicodedata
23 24 from rhodecode.lib.type_utils import aslist
24 25
25 26 log = logging.getLogger(__name__)
26 27
27 28
def safe_int(val, default=None) -> int:
    """
    Returns int() of val; if val is not convertible to int, returns default
    instead.

    Note that with the default ``default=None`` the result may be ``None``
    rather than an int.

    :param val: value passed to int()
    :param default: value returned when the conversion fails
    """

    try:
        return int(val)
    except (ValueError, TypeError, OverflowError):
        # OverflowError is raised by int() for float infinities
        return default
43 44
44 45
def get_default_encodings():
    """
    Read the ``default_encoding`` entry from ``rhodecode.CONFIG`` (``'utf8'``
    when the key is missing) and hand it to ``aslist`` with ``,`` as separator.

    NOTE(review): presumably this yields a list of encoding names - confirm
    against ``rhodecode.lib.type_utils.aslist``.
    """
    configured = rhodecode.CONFIG.get('default_encoding', 'utf8')
    return aslist(configured, sep=',')
47 48
48 49
def safe_str(str_, to_encoding=None) -> str:
    """
    Coerce ``str_`` into a ``str`` object.

    ``str`` input is returned untouched, ``bytes`` input is decoded with the
    first candidate encoding that succeeds, and any other object is passed
    through ``str()``.

    :param str_: value to convert
    :param to_encoding: encoding name, or list/tuple of names, tried in order
        when decoding bytes; when empty the result of get_default_encodings()
        is used
    :rtype: str
    :returns: str object
    """
    if isinstance(str_, str):
        return str_

    if isinstance(str_, bytes):
        encodings = to_encoding or get_default_encodings()
        if not isinstance(encodings, (list, tuple)):
            encodings = [encodings]

        for candidate in encodings:
            try:
                return str(str_, candidate)
            except UnicodeDecodeError:
                continue

        # nothing decoded cleanly: use the first encoding and substitute
        # the bytes that cannot be decoded
        return str(str_, encodings[0], 'replace')

    # neither str nor bytes: rely on the object's own str() representation
    return str(str_)
76 77
77 78
def safe_bytes(str_, from_encoding=None) -> bytes:
    """
    safe bytes function. Turns a str into bytes, trying several encodings.

    ``bytes`` input is returned unchanged. ``str`` input is encoded with the
    first encoding from ``from_encoding`` that can represent it; when none
    can, the first encoding is used with unencodable characters replaced.

    :param str_: string to encode
    :param from_encoding: encoding name, or list/tuple of names, tried in
        order; when empty the result of get_default_encodings() is used
    :rtype: bytes
    :returns: bytes object
    :raises ValueError: when str_ is neither bytes nor str
    """
    if isinstance(str_, bytes):
        return str_

    if not isinstance(str_, str):
        raise ValueError('safe_bytes cannot convert other types than str: got: {}'.format(type(str_)))

    from_encoding = from_encoding or get_default_encodings()
    if not isinstance(from_encoding, (list, tuple)):
        from_encoding = [from_encoding]

    for enc in from_encoding:
        try:
            return str_.encode(enc)
        except UnicodeError:
            # str.encode() fails with UnicodeEncodeError (a UnicodeError),
            # never UnicodeDecodeError; move on to the next candidate
            pass

    return str_.encode(from_encoding[0], 'replace')
104 105
105 106
def ascii_bytes(str_, allow_bytes=False) -> bytes:
    """
    Encode a pure-ASCII str into bytes.

    Meant for places where neither encoding guesses nor the "safe" helpers
    are wanted: the text is still a unicode string but has already been
    encoded in another way (hex, base64, json, urlencoding) or is known to
    be an identifier.

    :param str_: str to encode
    :param allow_bytes: when true, bytes input is returned as-is
    :raises ValueError: when str_ is not a str (and not an allowed bytes)
    :raises UnicodeError: when str_ contains non-ASCII characters
    """
    if isinstance(str_, str):
        return str_.encode('ascii')

    if allow_bytes and isinstance(str_, bytes):
        return str_

    raise ValueError('ascii_bytes cannot convert other types than str: got: {}'.format(type(str_)))
121 122
122 123
def ascii_str(str_):
    """
    Decode pure-ASCII bytes into a str.

    Meant for places where neither encoding guesses nor the "safe" helpers
    are wanted: the bytes are known to be ASCII (hex, base64, urlencoding,
    identifiers) and a unicode string is needed without caring about the
    encoding.

    :param str_: bytes to decode
    :raises ValueError: when str_ is not bytes
    :raises UnicodeError: when str_ contains non-ASCII bytes
    """
    if isinstance(str_, bytes):
        return str_.decode('ascii')

    raise ValueError('ascii_str cannot convert other types than bytes: got: {}'.format(type(str_)))
137
138
# Letters written with a stroke have no Unicode decomposition, so NFKD
# normalization leaves them untouched and the ASCII 'ignore' step would
# silently drop them. Map them to their base letter explicitly.
_STROKE_LETTERS_TO_ASCII = str.maketrans({
    'ł': 'l', 'Ł': 'L',
    'đ': 'd', 'Đ': 'D',
    'ø': 'o', 'Ø': 'O',
    'ħ': 'h', 'Ħ': 'H',
    'ŧ': 't', 'Ŧ': 'T',
})


def convert_special_chars(str_):
    """
    Tries to replace non-ascii letters with their ascii representation eg::

        `żołw` converts into `zolw`

    Accented letters are reduced to their base letter via NFKD
    normalization, stroke letters (such as `ł`) are mapped explicitly, and
    any remaining non-ascii character is dropped.

    :param str_: value to convert, passed through safe_str() first
    :returns: str containing only ascii characters
    """
    value = safe_str(str_).translate(_STROKE_LETTERS_TO_ASCII)
    # NFKD splits accented letters into base letter + combining mark; the
    # combining marks are non-ascii and get discarded by 'ignore'
    decomposed = unicodedata.normalize('NFKD', value)
    return decomposed.encode('ascii', 'ignore').decode()
General Comments 0
You need to be logged in to leave comments. Login now