diff --git a/scripts/i18n b/scripts/i18n
--- a/scripts/i18n
+++ b/scripts/i18n
@@ -21,6 +21,14 @@ import i18n_utils
 
 """
 Tool for maintenance of .po and .pot files
+
+Normally, the i18n-related files contain for each translatable string a
+reference to all the source code locations where this string is found. This
+metadata is useful for translators to assess how strings are used, but it is
+relevant neither for normal development nor for running Kallithea. Such
+metadata, or derived data like kallithea.pot, will inherently be outdated and
+create unnecessary churn and repository growth, making it harder to spot
+actual and important changes.
 """
 
 @click.group()
@@ -30,5 +38,16 @@ def cli(debug):
         i18n_utils.do_debug = True
     pass
 
+@cli.command()
+@click.argument('po_files', nargs=-1)
+def normalize_po_files(po_files):
+    """Normalize the specified .po and .pot files.
+
+    Only actual translations and essential headers will be preserved.
+    """
+    for po_file in po_files:
+        i18n_utils._normalize_po_file(po_file, strip=True)
+
+
 if __name__ == '__main__':
     cli()
diff --git a/scripts/i18n_utils.py b/scripts/i18n_utils.py
--- a/scripts/i18n_utils.py
+++ b/scripts/i18n_utils.py
@@ -13,6 +13,8 @@
 
 from __future__ import print_function
 
+import os
+import re
 import subprocess
 
 
@@ -25,3 +27,141 @@ def debug(*args, **kwargs):
 def runcmd(cmd, *args, **kwargs):
     debug('... Executing command: %s' % ' '.join(cmd))
     subprocess.check_call(cmd, *args, **kwargs)
+
+header_comment_strip_re = re.compile(r'''
+    ^
+    [#][ ]Translations[ ]template[ ]for[ ]Kallithea[.] \n
+    |
+    ^
+    [#][ ]FIRST[ ]AUTHOR[ ]<EMAIL@ADDRESS>,[ ]\d+[.] \n
+    [#] \n
+    [#],[ ]fuzzy \n
+    ''',
+    re.MULTILINE|re.VERBOSE)
+
+header_normalize_re = re.compile(r'''
+    ^ "
+    (POT-Creation-Date|PO-Revision-Date|Last-Translator|Language-Team|X-Generator|Generated-By|Project-Id-Version):
+    [ ][^\\]*\\n
+    " \n
+    ''',
+    re.MULTILINE|re.IGNORECASE|re.VERBOSE)
+
+def _normalize_po(raw_content):
+    r"""
+    >>> print(_normalize_po(r'''
+    ... # header comment
+    ...
+    ...
+    ... # comment before header
+    ... msgid ""
+    ... msgstr "yada"
+    ... "POT-Creation-Date: 2019-05-04 21:13+0200\n"
+    ... "MIME-Version: "
+    ... "1.0\n"
+    ... "Last-Translator: Jabba"
+    ... "the Hutt\n"
+    ... "X-Generator: Weblate 1.2.3\n"
+    ...
+    ... # comment, but not in header
+    ... msgid "None"
+    ... msgstr "Ingen"
+    ...
+    ...
+    ... line 2
+    ... # third comment
+    ...
+    ... msgid "Special"
+    ... msgstr ""
+    ...
+    ... msgid "Specialist"
+    ... # odd comment
+    ... msgstr ""
+    ... "Expert"
+    ...
+    ... # crazy fuzzy auto translation by msgmerge, using foo for bar
+    ... #, fuzzy
+    ... #| msgid "some foo string"
+    ... msgid "some bar string."
+    ... msgstr "translation of foo string"
+    ...
+    ... msgid "%d minute"
+    ... msgid_plural "%d minutes"
+    ... msgstr[0] "minut"
+    ... msgstr[1] "minutter"
+    ... msgstr[2] ""
+    ...
+    ... msgid "%d year"
+    ... msgid_plural "%d years"
+    ... msgstr[0] ""
+    ... msgstr[1] ""
+    ...
+    ... # last comment
+    ... ''') + '^^^')
+    # header comment
+    <BLANKLINE>
+    <BLANKLINE>
+    # comment before header
+    <BLANKLINE>
+    msgid ""
+    msgstr "yada"
+    "MIME-Version: "
+    "1.0\n"
+    <BLANKLINE>
+    msgid "None"
+    msgstr "Ingen"
+    <BLANKLINE>
+    line 2
+    <BLANKLINE>
+    msgid "Specialist"
+    msgstr ""
+    "Expert"
+    <BLANKLINE>
+    msgid "%d minute"
+    msgid_plural "%d minutes"
+    msgstr[0] "minut"
+    msgstr[1] "minutter"
+    msgstr[2] ""
+    ^^^
+    """
+    header_start = raw_content.find('\nmsgid ""\n') + 1
+    header_end = raw_content.find('\n\n', header_start) + 1 or len(raw_content)
+    chunks = [
+        header_comment_strip_re.sub('', raw_content[0:header_start])
+            .strip(),
+        '',
+        header_normalize_re.sub('', raw_content[header_start:header_end])
+            .strip(),
+        ''] # preserve normalized header
+    # all chunks are separated by empty line
+    for raw_chunk in raw_content[header_end:].split('\n\n'):
+        if '\n#, fuzzy' in raw_chunk: # might be like "#, fuzzy, python-format"
+            continue # drop crazy auto translation that is worse than useless
+        # strip all comment lines from chunk
+        chunk_lines = [
+            line
+            for line in raw_chunk.splitlines()
+            if line
+            and not line.startswith('#')
+        ]
+        if not chunk_lines:
+            continue
+        # check lines starting from first msgstr, skip chunk if no translation lines
+        msgstr_i = [i for i, line in enumerate(chunk_lines) if line.startswith('msgstr')]
+        if (
+            chunk_lines[0].startswith('msgid') and
+            msgstr_i and
+            all(line.endswith(' ""') for line in chunk_lines[msgstr_i[0]:])
+        ): # skip translation chunks that don't have any actual translations
+            continue
+        chunks.append('\n'.join(chunk_lines) + '\n')
+    return '\n'.join(chunks)
+
+def _normalize_po_file(po_file, strip=False):
+    if strip:
+        po_tmp = po_file + '.tmp'
+        with open(po_file, 'r') as src, open(po_tmp, 'w') as dest:
+            raw_content = src.read()
+            normalized_content = _normalize_po(raw_content)
+            dest.write(normalized_content)
+        os.rename(po_tmp, po_file)
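
Usage sketch (not part of the patch): click derives the command name from the
function name, and click 7.0+ maps the underscores in normalize_po_files to
dashes. Assuming the translation catalogs live under kallithea/i18n/ (a
hypothetical path; adjust the glob to the actual tree layout), the new command
could be invoked as:

    # hypothetical invocation; the catalog path is an assumption
    python scripts/i18n normalize-po-files kallithea/i18n/*/LC_MESSAGES/kallithea.po

Since _normalize_po_file writes the normalized content to a .tmp file and only
then os.rename()s it over the original, an interrupted run leaves the existing
.po file intact.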