str_utils.py
147 lines
| 4.5 KiB
| text/x-python
|
PythonLexer
r4915 | # -*- coding: utf-8 -*- | |||
# Copyright (C) 2011-2020 RhodeCode GmbH | ||||
# | ||||
# This program is free software: you can redistribute it and/or modify | ||||
# it under the terms of the GNU Affero General Public License, version 3 | ||||
# (only), as published by the Free Software Foundation. | ||||
# | ||||
# This program is distributed in the hope that it will be useful, | ||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||||
# GNU General Public License for more details. | ||||
# | ||||
# You should have received a copy of the GNU Affero General Public License | ||||
# along with this program. If not, see <http://www.gnu.org/licenses/>. | ||||
# | ||||
# This program is dual-licensed. If you wish to learn more about the | ||||
# RhodeCode Enterprise Edition, including its added features, Support services, | ||||
# and proprietary license terms, please see https://rhodecode.com/licenses/ | ||||
import logging | ||||
import rhodecode | ||||
r4989 | import unicodedata | |||
r4915 | from rhodecode.lib.type_utils import aslist | |||
log = logging.getLogger(__name__) | ||||
def safe_int(val, default=None) -> int:
    """
    Convert ``val`` to ``int``, falling back to ``default`` when that fails.

    :param val: value to convert; anything ``int()`` accepts
    :param default: returned as-is (it is not coerced to ``int``) when
        ``val`` cannot be converted
    :returns: ``int(val)`` on success, otherwise ``default``
    """
    try:
        return int(val)
    except (ValueError, TypeError, OverflowError):
        # ValueError: malformed text or NaN, TypeError: unsupported type
        # (e.g. None), OverflowError: infinite floats.
        return default
def get_default_encodings():
    """
    Return the default encodings configured for the application.

    Reads the ``default_encoding`` entry of ``rhodecode.CONFIG`` (``'utf8'``
    when the entry is absent) and hands it to ``aslist`` with ``,`` as the
    separator.
    """
    configured = rhodecode.CONFIG.get('default_encoding', 'utf8')
    return aslist(configured, sep=',')
def safe_str(str_, to_encoding=None) -> str:
    """
    Return ``str_`` as a ``str`` without raising on undecodable bytes.

    ``str`` input is returned unchanged, any non-bytes object goes through
    ``str()``, and ``bytes`` are decoded with the first candidate encoding
    that succeeds. When every candidate fails, the first one is used again
    with undecodable bytes replaced.

    :param str_: object to convert
    :param to_encoding: encoding name, or list/tuple of names tried in order,
        used to decode bytes; when empty the result of
        ``get_default_encodings()`` is used
    :rtype: str
    :returns: str object
    """
    if not isinstance(str_, bytes):
        # text passes through untouched, everything else is stringified
        return str_ if isinstance(str_, str) else str(str_)

    candidates = to_encoding or get_default_encodings()
    if not isinstance(candidates, (list, tuple)):
        candidates = [candidates]

    for encoding in candidates:
        try:
            return str(str_, encoding)
        except UnicodeDecodeError:
            continue

    # nothing decoded cleanly: fall back to a lossy decode with the first choice
    return str(str_, candidates[0], 'replace')
def safe_bytes(str_, from_encoding=None) -> bytes:
    """
    Return ``str_`` as ``bytes`` without raising on unencodable characters.

    ``bytes`` input is returned unchanged. ``str`` input is encoded with the
    first candidate encoding that succeeds; when every candidate fails, the
    first one is used again with unencodable characters replaced.

    :param str_: ``str`` (or ``bytes``) to convert
    :param from_encoding: encoding name, or list/tuple of names tried in
        order; when empty the result of ``get_default_encodings()`` is used
    :rtype: bytes
    :returns: bytes object
    :raises ValueError: if ``str_`` is neither ``str`` nor ``bytes``
    """
    if isinstance(str_, bytes):
        return str_

    if not isinstance(str_, str):
        raise ValueError('safe_bytes cannot convert other types than str: got: {}'.format(type(str_)))

    from_encoding = from_encoding or get_default_encodings()
    if not isinstance(from_encoding, (list, tuple)):
        from_encoding = [from_encoding]

    for enc in from_encoding:
        try:
            return str_.encode(enc)
        except UnicodeError:
            # str.encode() signals failure with UnicodeEncodeError (a
            # UnicodeError subclass); move on to the next candidate encoding.
            continue

    # nothing encoded cleanly: fall back to a lossy encode with the first choice
    return str_.encode(from_encoding[0], 'replace')
def ascii_bytes(str_, allow_bytes=False) -> bytes:
    """
    Strictly convert a pure-ASCII ``str`` to ``bytes``.

    Use this where neither an encoding choice nor "safe" fallback behaviour
    is wanted: for text that is already encoded some other way but is still
    held as ``str`` (hex, base64, json, urlencoding) or is a known identifier.

    :param str_: ``str`` to convert
    :param allow_bytes: when true, ``bytes`` input is returned unchanged
        instead of being rejected
    :returns: ASCII-encoded bytes
    :raises ValueError: if ``str_`` is not ``str`` (and not accepted bytes)
    :raises UnicodeError: if ``str_`` contains non-ASCII characters
    """
    if isinstance(str_, str):
        return str_.encode('ascii')

    if allow_bytes and isinstance(str_, bytes):
        return str_

    raise ValueError('ascii_bytes cannot convert other types than str: got: {}'.format(type(str_)))
def ascii_str(str_):
    """
    Strictly convert pure-ASCII ``bytes`` to ``str``.

    Use this where neither an encoding choice nor "safe" fallback behaviour
    is wanted: for bytes known to be ASCII (hex, base64, urlencoding,
    identifiers) when a ``str`` is needed.

    :param str_: ``bytes`` to convert
    :returns: decoded ``str``
    :raises ValueError: if ``str_`` is not ``bytes``
    :raises UnicodeError: if ``str_`` contains non-ASCII bytes
    """
    if isinstance(str_, bytes):
        return str_.decode('ascii')

    raise ValueError('ascii_str cannot convert other types than bytes: got: {}'.format(type(str_)))
r4989 | ||||
def convert_special_chars(str_):
    """
    Try to replace non-ASCII letters with their ASCII representation, eg::

        `gęś` converts into `ges`

    The value is turned into ``str`` with ``safe_str`` and NFKD-normalized,
    which splits accented letters into a base letter plus combining marks.
    Whatever is still non-ASCII afterwards is dropped, so characters without
    a decomposition (such as `ł`) are removed rather than transliterated.
    """
    text = safe_str(str_)
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore')
    return ascii_only.decode()