##// END OF EJS Templates
str_utils: added common non-ascii replacer
super-admin -
r4989:b0280397 default
parent child Browse files
Show More
@@ -1,135 +1,147 b''
1 # -*- coding: utf-8 -*-
1 # -*- coding: utf-8 -*-
2
2
3 # Copyright (C) 2011-2020 RhodeCode GmbH
3 # Copyright (C) 2011-2020 RhodeCode GmbH
4 #
4 #
5 # This program is free software: you can redistribute it and/or modify
5 # This program is free software: you can redistribute it and/or modify
6 # it under the terms of the GNU Affero General Public License, version 3
6 # it under the terms of the GNU Affero General Public License, version 3
7 # (only), as published by the Free Software Foundation.
7 # (only), as published by the Free Software Foundation.
8 #
8 #
9 # This program is distributed in the hope that it will be useful,
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
12 # GNU General Public License for more details.
13 #
13 #
14 # You should have received a copy of the GNU Affero General Public License
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16 #
16 #
17 # This program is dual-licensed. If you wish to learn more about the
17 # This program is dual-licensed. If you wish to learn more about the
18 # RhodeCode Enterprise Edition, including its added features, Support services,
18 # RhodeCode Enterprise Edition, including its added features, Support services,
19 # and proprietary license terms, please see https://rhodecode.com/licenses/
19 # and proprietary license terms, please see https://rhodecode.com/licenses/
20
20
21 import logging
21 import logging
22 import rhodecode
22 import rhodecode
23 import unicodedata
23 from rhodecode.lib.type_utils import aslist
24 from rhodecode.lib.type_utils import aslist
24
25
25 log = logging.getLogger(__name__)
26 log = logging.getLogger(__name__)
26
27
27
28
28 def safe_int(val, default=None) -> int:
29 def safe_int(val, default=None) -> int:
29 """
30 """
30 Returns int() of val; if val is not convertible to int, use default
31 Returns int() of val; if val is not convertible to int, use default
31 instead
32 instead
32
33
33 :param val:
34 :param val:
34 :param default:
35 :param default:
35 """
36 """
36
37
37 try:
38 try:
38 val = int(val)
39 val = int(val)
39 except (ValueError, TypeError):
40 except (ValueError, TypeError):
40 val = default
41 val = default
41
42
42 return val
43 return val
43
44
44
45
45 def get_default_encodings():
46 def get_default_encodings():
46 return aslist(rhodecode.CONFIG.get('default_encoding', 'utf8'), sep=',')
47 return aslist(rhodecode.CONFIG.get('default_encoding', 'utf8'), sep=',')
47
48
48
49
49 def safe_str(str_, to_encoding=None) -> str:
50 def safe_str(str_, to_encoding=None) -> str:
50 """
51 """
51 safe str function. Does a few tricks to turn str_ into a string
52 safe str function. Does a few tricks to turn str_ into a string
52
53
53 :param str_: str to encode
54 :param str_: str to encode
54 :param to_encoding: encode to this type UTF8 default
55 :param to_encoding: encode to this type UTF8 default
55 :rtype: str
56 :rtype: str
56 :returns: str object
57 :returns: str object
57 """
58 """
58 if isinstance(str_, str):
59 if isinstance(str_, str):
59 return str_
60 return str_
60
61
61 # if it's bytes cast to str
62 # if it's bytes cast to str
62 if not isinstance(str_, bytes):
63 if not isinstance(str_, bytes):
63 return str(str_)
64 return str(str_)
64
65
65 to_encoding = to_encoding or get_default_encodings()
66 to_encoding = to_encoding or get_default_encodings()
66 if not isinstance(to_encoding, (list, tuple)):
67 if not isinstance(to_encoding, (list, tuple)):
67 to_encoding = [to_encoding]
68 to_encoding = [to_encoding]
68
69
69 for enc in to_encoding:
70 for enc in to_encoding:
70 try:
71 try:
71 return str(str_, enc)
72 return str(str_, enc)
72 except UnicodeDecodeError:
73 except UnicodeDecodeError:
73 pass
74 pass
74
75
75 return str(str_, to_encoding[0], 'replace')
76 return str(str_, to_encoding[0], 'replace')
76
77
77
78
78 def safe_bytes(str_, from_encoding=None) -> bytes:
79 def safe_bytes(str_, from_encoding=None) -> bytes:
79 """
80 """
80 safe bytes function. Does a few tricks to turn str_ into a bytes string:
81 safe bytes function. Does a few tricks to turn str_ into a bytes string:
81
82
82 :param str_: string to encode
83 :param str_: string to encode
83 :param from_encoding: encoding (or list of encodings) to encode with, UTF8 default
84 :param from_encoding: encoding (or list of encodings) to encode with, UTF8 default
84 :rtype: bytes
85 :rtype: bytes
85 :returns: bytes object
86 :returns: bytes object
86 """
87 """
87 if isinstance(str_, bytes):
88 if isinstance(str_, bytes):
88 return str_
89 return str_
89
90
90 if not isinstance(str_, str):
91 if not isinstance(str_, str):
91 raise ValueError('safe_bytes cannot convert other types than str: got: {}'.format(type(str_)))
92 raise ValueError('safe_bytes cannot convert other types than str: got: {}'.format(type(str_)))
92
93
93 from_encoding = from_encoding or get_default_encodings()
94 from_encoding = from_encoding or get_default_encodings()
94 if not isinstance(from_encoding, (list, tuple)):
95 if not isinstance(from_encoding, (list, tuple)):
95 from_encoding = [from_encoding]
96 from_encoding = [from_encoding]
96
97
97 for enc in from_encoding:
98 for enc in from_encoding:
98 try:
99 try:
99 return str_.encode(enc)
100 return str_.encode(enc)
100 except UnicodeDecodeError:
101 except UnicodeDecodeError:
101 pass
102 pass
102
103
103 return str_.encode(from_encoding[0], 'replace')
104 return str_.encode(from_encoding[0], 'replace')
104
105
105
106
106 def ascii_bytes(str_, allow_bytes=False) -> bytes:
107 def ascii_bytes(str_, allow_bytes=False) -> bytes:
107 """
108 """
108 Simple conversion from str to bytes, with assumption that str_ is pure ASCII.
109 Simple conversion from str to bytes, with assumption that str_ is pure ASCII.
109 Fails with UnicodeError on invalid input.
110 Fails with UnicodeError on invalid input.
110 This should be used where encoding and "safe" ambiguity should be avoided.
111 This should be used where encoding and "safe" ambiguity should be avoided.
111 Where strings already have been encoded in other ways but still are unicode
112 Where strings already have been encoded in other ways but still are unicode
112 string - for example to hex, base64, json, urlencoding, or are known to be
113 string - for example to hex, base64, json, urlencoding, or are known to be
113 identifiers.
114 identifiers.
114 """
115 """
115 if allow_bytes and isinstance(str_, bytes):
116 if allow_bytes and isinstance(str_, bytes):
116 return str_
117 return str_
117
118
118 if not isinstance(str_, str):
119 if not isinstance(str_, str):
119 raise ValueError('ascii_bytes cannot convert other types than str: got: {}'.format(type(str_)))
120 raise ValueError('ascii_bytes cannot convert other types than str: got: {}'.format(type(str_)))
120 return str_.encode('ascii')
121 return str_.encode('ascii')
121
122
122
123
123 def ascii_str(str_):
124 def ascii_str(str_):
124 """
125 """
125 Simple conversion from bytes to str, with assumption that str_ is pure ASCII.
126 Simple conversion from bytes to str, with assumption that str_ is pure ASCII.
126 Fails with UnicodeError on invalid input.
127 Fails with UnicodeError on invalid input.
127 This should be used where encoding and "safe" ambiguity should be avoided.
128 This should be used where encoding and "safe" ambiguity should be avoided.
128 Where strings are encoded but also in other ways are known to be ASCII, and
129 Where strings are encoded but also in other ways are known to be ASCII, and
129 where a unicode string is wanted without caring about encoding. For example
130 where a unicode string is wanted without caring about encoding. For example
130 to hex, base64, urlencoding, or are known to be identifiers.
131 to hex, base64, urlencoding, or are known to be identifiers.
131 """
132 """
132
133
133 if not isinstance(str_, bytes):
134 if not isinstance(str_, bytes):
134 raise ValueError('ascii_str cannot convert other types than bytes: got: {}'.format(type(str_)))
135 raise ValueError('ascii_str cannot convert other types than bytes: got: {}'.format(type(str_)))
135 return str_.decode('ascii')
136 return str_.decode('ascii')
137
138
139 def convert_special_chars(str_):
140 """
141 Tries to replace non-ASCII letters with their ASCII representation, e.g.::
142
143 `żaba` converts into `zaba` (letters without a decomposition, e.g. `ł`, are dropped)
144 """
145 value = safe_str(str_)
146 converted_value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode()
147 return converted_value
General Comments 0
You need to be logged in to leave comments. Login now