Show More
polib.py
1554 lines
| 50.9 KiB
| text/x-python
|
PythonLexer
/ i18n / polib.py
Wagner Bruna
|
r11387 | # -*- coding: utf-8 -*- | ||
Matt Mackall
|
r11432 | # no-check-code | ||
Wagner Bruna
|
r11387 | # | ||
# License: MIT (see LICENSE file provided) | ||||
# vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: | ||||
""" | ||||
Wagner Bruna
|
r15290 | **polib** allows you to manipulate, create, modify gettext files (pot, po and | ||
mo files). You can load existing files, iterate through their entries, add, | ||||
modify entries, comments or metadata, etc. or create new po files from scratch. | ||||
Wagner Bruna
|
r11387 | |||
Wagner Bruna
|
r15290 | **polib** provides a simple and pythonic API via the :func:`~polib.pofile` and | ||
:func:`~polib.mofile` convenience functions. | ||||
Wagner Bruna
|
r11387 | """ | ||
Pulkit Goyal
|
r29485 | from __future__ import absolute_import | ||
Wagner Bruna
|
r15290 | __author__ = 'David Jean Louis <izimobil@gmail.com>' | ||
__version__ = '0.6.4' | ||||
Wagner Bruna
|
# Public API of the module; each name is listed exactly once.
__all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry',
           'detect_encoding', 'escape', 'unescape']
Wagner Bruna
|
r15290 | import array | ||
Wagner Bruna
|
r11387 | import codecs | ||
Wagner Bruna
|
r15290 | import os | ||
import re | ||||
Wagner Bruna
|
r11387 | import struct | ||
Wagner Bruna
|
r15290 | import sys | ||
Wagner Bruna
|
r11387 | import textwrap | ||
import types | ||||
Wagner Bruna
|
r15290 | |||
Wagner Bruna
|
r11387 | |||
Wagner Bruna
|
r15290 | # the default encoding to use when encoding cannot be detected | ||
Wagner Bruna
|
r11387 | default_encoding = 'utf-8' | ||
Wagner Bruna
|
r15290 | # _pofile_or_mofile {{{ | ||
Wagner Bruna
|
r11387 | |||
Wagner Bruna
|
def _pofile_or_mofile(f, type, **kwargs):
    """
    Shared implementation behind :func:`polib.pofile` and
    :func:`polib.mofile`.

    ``f`` is handed unchanged to the parser, ``type`` is either ``'pofile'``
    or ``'mofile'`` and selects the parser class. The recognised keyword
    arguments are ``encoding`` (detected from ``f`` when missing or ``None``),
    ``check_for_duplicates`` (default ``False``) and ``wrapwidth``
    (default ``78``).
    """
    encoding = kwargs.get('encoding')
    if encoding is None:
        # mo files are binary, tell the detector so
        encoding = detect_encoding(f, type == 'mofile')

    parser_class = _POFileParser if type == 'pofile' else _MOFileParser
    check = kwargs.get('check_for_duplicates', False)
    result = parser_class(f, encoding=encoding,
                          check_for_duplicates=check).parse()
    result.wrapwidth = kwargs.get('wrapwidth', 78)
    return result
# }}} | ||||
Wagner Bruna
|
r15290 | # function pofile() {{{ | ||
def pofile(pofile, **kwargs):
    """
    Convenience function that parses the po or pot file ``pofile`` and returns
    a :class:`~polib.POFile` instance.

    All the work is delegated to ``_pofile_or_mofile`` with the ``'pofile'``
    type; the keyword arguments are forwarded untouched.

    Arguments:

    ``pofile``
        string, full or relative path to the po/pot file or its content (data).

    ``wrapwidth``
        integer, the wrap width, only useful when the ``-w`` option was passed
        to xgettext (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).
    """
    return _pofile_or_mofile(pofile, 'pofile', **kwargs)
# }}} | ||||
Wagner Bruna
|
r11387 | # function mofile() {{{ | ||
Wagner Bruna
|
def mofile(mofile, **kwargs):
    """
    Convenience function that parses the mo file ``mofile`` and returns a
    :class:`~polib.MOFile` instance.

    All the work is delegated to ``_pofile_or_mofile`` with the ``'mofile'``
    type; the keyword arguments are forwarded untouched.

    Arguments:

    ``mofile``
        string, full or relative path to the mo file or its content (data).

    ``wrapwidth``
        integer, the wrap width, only useful when the ``-w`` option was passed
        to xgettext to generate the po file that was used to format the mo file
        (optional, default: ``78``).

    ``encoding``
        string, the encoding to use (e.g. "utf-8") (default: ``None``, the
        encoding will be auto-detected).

    ``check_for_duplicates``
        whether to check for duplicate entries when adding entries to the
        file (optional, default: ``False``).
    """
    return _pofile_or_mofile(mofile, 'mofile', **kwargs)
Wagner Bruna
|
r11387 | |||
# }}} | ||||
# function detect_encoding() {{{ | ||||
Wagner Bruna
|
def detect_encoding(file, binary_mode=False):
    """
    Try to detect the encoding used by the ``file``. The ``file`` argument can
    be a PO or MO file path or a string containing the contents of the file.
    If the encoding cannot be detected, the function will return the value of
    ``default_encoding``.

    The charset is looked up in the first ``Content-Type: ...; charset=...``
    declaration that names a codec known to Python; declarations naming an
    unknown codec are skipped and the search goes on.

    Arguments:

    ``file``
        string, full or relative path to the po/mo file or its content.

    ``binary_mode``
        boolean, set this to True if ``file`` is a mo file.
    """
    rx = re.compile(r'"?Content-Type:.+? charset=([\w_\-:\.]+)')

    def charset_exists(charset):
        """Check whether ``charset`` is valid or not."""
        try:
            codecs.lookup(charset)
        except LookupError:
            return False
        return True

    def declared_charset(text):
        """Return the valid charset declared in ``text``, else ``None``."""
        # Where bytes and str are distinct types (Python 3), binary data
        # cannot be searched with a text pattern. latin-1 maps every byte to
        # one character, so the decoding can never fail and the (ASCII)
        # charset name is preserved.
        if isinstance(text, bytes) and not isinstance(text, str):
            text = text.decode('latin-1')
        match = rx.search(text)
        if match:
            enc = match.group(1).strip()
            if charset_exists(enc):
                return enc
        return None

    if not os.path.exists(file):
        # not a path: ``file`` is the content itself
        enc = declared_charset(file)
        if enc is not None:
            return enc
    else:
        mode = 'rb' if binary_mode else 'r'
        # the context manager closes the file on every exit path, and lines
        # are read lazily so the scan stops at the first usable declaration
        with open(file, mode) as f:
            for line in f:
                enc = declared_charset(line)
                if enc is not None:
                    return enc
    return default_encoding
# }}} | ||||
# function escape() {{{ | ||||
def escape(st):
    """
    Returns ``st`` with backslashes, tabs, carriage returns, newlines and
    double quotes replaced by their backslash escape sequences.
    """
    # The backslash must be handled first, otherwise the backslashes
    # introduced by the other replacements would be escaped a second time.
    replacements = (
        ('\\', r'\\'),
        ('\t', r'\t'),
        ('\r', r'\r'),
        ('\n', r'\n'),
        ('\"', r'\"'),
    )
    for raw, escaped in replacements:
        st = st.replace(raw, escaped)
    return st
# }}} | ||||
# function unescape() {{{ | ||||
def unescape(st):
    """
    Returns ``st`` with the escape sequences for backslash, tab, newline,
    carriage return and double quote turned back into the characters they
    stand for.
    """
    # escape letters that stand for a control character; a backslash or a
    # double quote following the escaping backslash stands for itself
    control_chars = {'n': '\n', 't': '\t', 'r': '\r'}

    def unescape_repl(m):
        char = m.group(1)
        return control_chars.get(char, char)

    return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st)
Wagner Bruna
|
r11387 | |||
# }}} | ||||
# class _BaseFile {{{ | ||||
class _BaseFile(list):
    """
    Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
    classes. This class should **not** be instantiated directly.

    The instance is the list of its entries. Subclasses are expected to
    provide ``translated_entries()`` and ``obsolete_entries()``, which are
    called by :meth:`to_binary` and :meth:`__unicode__`.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``pofile``
            string, the path to the po or mo file, or its content as a string.

        ``wrapwidth``
            integer, the wrap width, only useful when the ``-w`` option was
            passed to xgettext (optional, default: ``78``).

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file, (optional, default: ``False``).
        """
        list.__init__(self)
        # the path of the file: ``pofile`` when it names an existing file,
        # otherwise the (optional) ``fpath`` keyword argument
        pofile = kwargs.get('pofile', None)
        if pofile and os.path.exists(pofile):
            self.fpath = pofile
        else:
            self.fpath = kwargs.get('fpath')
        # the width at which lines should be wrapped
        self.wrapwidth = kwargs.get('wrapwidth', 78)
        # the file encoding
        self.encoding = kwargs.get('encoding', default_encoding)
        # whether to check for duplicate entries or not
        self.check_for_duplicates = kwargs.get('check_for_duplicates', False)
        # header
        self.header = ''
        # both po and mo files have metadata
        self.metadata = {}
        self.metadata_is_fuzzy = 0

    def __unicode__(self):
        """
        Returns the unicode representation of the file: the metadata entry,
        then the entries that are not obsolete, then the obsolete entries.
        """
        ret = []
        entries = [self.metadata_as_entry()] + \
                  [e for e in self if not e.obsolete]
        for entry in entries:
            ret.append(entry.__unicode__(self.wrapwidth))
        for entry in self.obsolete_entries():
            ret.append(entry.__unicode__(self.wrapwidth))
        ret = '\n'.join(ret)
        # isinstance() also accepts unicode subclasses, which must not be
        # decoded a second time
        if not isinstance(ret, types.UnicodeType):
            return unicode(ret, self.encoding)
        return ret

    def __str__(self):
        """
        Returns the string representation of the file, i.e. its unicode
        representation encoded with the file encoding.
        """
        return unicode(self).encode(self.encoding)

    def __contains__(self, entry):
        """
        Overridden ``list`` method to implement the membership test (in and
        not in).
        The method considers that an entry is in the file if it finds an entry
        that has the same msgid (the test is **case sensitive**). Obsolete
        entries are not taken into account.

        Argument:

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        return self.find(entry.msgid, by='msgid') is not None

    def __eq__(self, other):
        """
        Two files are equal when their unicode representations are equal.
        """
        return unicode(self) == unicode(other)

    def append(self, entry):
        """
        Overridden method to check for duplicates entries, if a user tries to
        add an entry that is already in the file, the method will raise a
        ``ValueError`` exception. The check is only done when
        ``check_for_duplicates`` is true.

        Argument:

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        if self.check_for_duplicates and entry in self:
            raise ValueError('Entry "%s" already exists' % entry.msgid)
        super(_BaseFile, self).append(entry)

    def insert(self, index, entry):
        """
        Overridden method to check for duplicates entries, if a user tries to
        add an entry that is already in the file, the method will raise a
        ``ValueError`` exception. The check is only done when
        ``check_for_duplicates`` is true.

        Arguments:

        ``index``
            index at which the entry should be inserted.

        ``entry``
            an instance of :class:`~polib._BaseEntry`.
        """
        if self.check_for_duplicates and entry in self:
            raise ValueError('Entry "%s" already exists' % entry.msgid)
        super(_BaseFile, self).insert(index, entry)

    def metadata_as_entry(self):
        """
        Returns the file metadata as a :class:`~polib.POEntry` instance whose
        msgid is empty and whose msgstr holds one ``name: value`` line per
        metadata item.
        """
        e = POEntry(msgid='')
        mdata = self.ordered_metadata()
        if mdata:
            strs = ['%s: %s' % (name, value) for name, value in mdata]
            e.msgstr = '\n'.join(strs) + '\n'
        if self.metadata_is_fuzzy:
            e.flags.append('fuzzy')
        return e

    def save(self, fpath=None, repr_method='__str__'):
        """
        Saves the po file to ``fpath``.
        If it is an existing file and no ``fpath`` is provided, then the
        existing file is rewritten with the modified data.

        Raises ``IOError`` when neither ``fpath`` nor ``self.fpath`` is set.

        Keyword arguments:

        ``fpath``
            string, full or relative path to the file.

        ``repr_method``
            string, the method to use for output.
        """
        if self.fpath is None and fpath is None:
            raise IOError('You must provide a file path to save() method')
        contents = getattr(self, repr_method)()
        if fpath is None:
            fpath = self.fpath
        if repr_method == 'to_binary':
            fhandle = open(fpath, 'wb')
        else:
            # decode before opening the file: a decoding error must not
            # leave a truncated file behind
            if not isinstance(contents, types.UnicodeType):
                contents = contents.decode(self.encoding)
            fhandle = codecs.open(fpath, 'w', self.encoding)
        try:
            fhandle.write(contents)
        finally:
            # release the handle even when the write fails
            fhandle.close()
        # set the file path if not set
        if self.fpath is None and fpath:
            self.fpath = fpath

    def find(self, st, by='msgid', include_obsolete_entries=False,
             msgctxt=False):
        """
        Find the entry which msgid (or property identified by the ``by``
        argument) matches the string ``st``. Returns the first matching
        entry, or ``None`` when there is no match.

        Keyword arguments:

        ``st``
            string, the string to search for.

        ``by``
            string, the property to use for comparison (default: ``msgid``).

        ``include_obsolete_entries``
            boolean, whether to also search in entries that are obsolete.

        ``msgctxt``
            string, allows to specify a specific message context for the
            search. A false value (the default) disables the context check.
        """
        if include_obsolete_entries:
            entries = self[:]
        else:
            entries = [e for e in self if not e.obsolete]
        for e in entries:
            if getattr(e, by) == st:
                if msgctxt and e.msgctxt != msgctxt:
                    continue
                return e
        return None

    def ordered_metadata(self):
        """
        Convenience method that returns an ordered version of the metadata
        dictionary. The return value is list of tuples (metadata name,
        metadata_value).
        """
        # copy the dict first
        metadata = self.metadata.copy()
        data_order = [
            'Project-Id-Version',
            'Report-Msgid-Bugs-To',
            'POT-Creation-Date',
            'PO-Revision-Date',
            'Last-Translator',
            'Language-Team',
            'MIME-Version',
            'Content-Type',
            'Content-Transfer-Encoding'
        ]
        ordered_data = []
        for data in data_order:
            try:
                value = metadata.pop(data)
                ordered_data.append((data, value))
            except KeyError:
                pass
        # the rest of the metadata will be alphabetically ordered since there
        # are no specs for this AFAIK; sorted() works on any dict, including
        # those whose keys() is not a list
        for data in sorted(metadata):
            ordered_data.append((data, metadata[data]))
        return ordered_data

    def to_binary(self):
        """
        Return the binary representation of the file.

        The list returned by ``translated_entries()`` is sorted in place
        before being serialized.
        """
        offsets = []
        entries = self.translated_entries()

        # the keys are sorted in the .mo file;
        # msgfmt compares entries with msgctxt if it exists
        def sort_key(entry):
            return entry.msgctxt or entry.msgid
        entries.sort(key=sort_key)
        # add metadata entry
        mentry = self.metadata_as_entry()
        entries = [mentry] + entries
        entries_len = len(entries)
        ids, strs = '', ''
        for e in entries:
            # For each string, we need size and file offset. Each string is
            # NUL terminated; the NUL does not count into the size.
            msgid = ''
            if e.msgctxt:
                # Contexts are stored by storing the concatenation of the
                # context, a <EOT> byte, and the original string
                msgid = self._encode(e.msgctxt + '\4')
            if e.msgid_plural:
                # plural forms are stored in index order, NUL separated
                msgstr = [e.msgstr_plural[index]
                          for index in sorted(e.msgstr_plural)]
                msgid += self._encode(e.msgid + '\0' + e.msgid_plural)
                msgstr = self._encode('\0'.join(msgstr))
            else:
                msgid += self._encode(e.msgid)
                msgstr = self._encode(e.msgstr)
            offsets.append((len(ids), len(msgid), len(strs), len(msgstr)))
            ids += msgid + '\0'
            strs += msgstr + '\0'

        # The header is 7 32-bit unsigned integers.
        keystart = 7*4+16*entries_len
        # and the values start after the keys
        valuestart = keystart + len(ids)
        koffsets = []
        voffsets = []
        # The string table first has the list of keys, then the list of values.
        # Each entry has first the size of the string, then the file offset.
        for o1, l1, o2, l2 in offsets:
            koffsets += [l1, o1+keystart]
            voffsets += [l2, o2+valuestart]
        offsets = koffsets + voffsets
        # check endianness for magic number
        if struct.pack('@h', 1) == struct.pack('<h', 1):
            magic_number = MOFile.LITTLE_ENDIAN
        else:
            magic_number = MOFile.BIG_ENDIAN
        output = struct.pack(
            "Iiiiiii",
            magic_number,       # Magic number
            0,                  # Version
            entries_len,        # # of entries
            7*4,                # start of key index
            7*4+entries_len*8,  # start of value index
            0, keystart         # size and offset of hash table
                                # Important: we don't use hash tables
        )
        output += array.array("i", offsets).tostring()
        output += ids
        output += strs
        return output

    def _encode(self, mixed):
        """
        Encodes the given ``mixed`` argument with the file encoding if and
        only if it's an unicode string and returns the encoded string.
        """
        if isinstance(mixed, types.UnicodeType):
            return mixed.encode(self.encoding)
        return mixed
# }}} | ||||
# class POFile {{{ | ||||
class POFile(_BaseFile):
    """
    Po (or Pot) file reader/writer.

    This class inherits the :class:`~polib._BaseFile` class and, by extension,
    the python ``list`` type.
    """

    def __unicode__(self):
        """
        Returns the unicode representation of the po file: the header, one
        comment line per header line, followed by the representation built by
        :class:`~polib._BaseFile`.
        """
        # header lines starting with "," or ":" are glued to the "#", every
        # other line is separated from it by a space
        comment_lines = []
        for line in self.header.split('\n'):
            template = '#%s\n' if line[:1] in [',', ':'] else '# %s\n'
            comment_lines.append(template % line)
        head = ''.join(comment_lines)

        if not isinstance(head, types.UnicodeType):
            head = unicode(head, self.encoding)
        return head + _BaseFile.__unicode__(self)

    def save_as_mofile(self, fpath):
        """
        Writes the binary (mo) representation of the file to ``fpath``.

        Keyword argument:

        ``fpath``
            string, full or relative path to the mo file.
        """
        _BaseFile.save(self, fpath, 'to_binary')

    def percent_translated(self):
        """
        Convenience method that returns, as an integer, the percentage of
        translated messages among the entries that are not obsolete
        (``100`` when there is no such entry).
        """
        active_count = len([e for e in self if not e.obsolete])
        if active_count == 0:
            return 100
        translated_count = len(self.translated_entries())
        return int((100.00 / float(active_count)) * translated_count)

    def translated_entries(self):
        """
        Convenience method that returns the list of entries reporting
        themselves as translated.
        """
        return [entry for entry in self if entry.translated()]

    def untranslated_entries(self):
        """
        Convenience method that returns the list of entries that are neither
        translated, nor obsolete, nor flagged fuzzy.
        """
        return [entry for entry in self
                if not (entry.translated() or entry.obsolete
                        or 'fuzzy' in entry.flags)]

    def fuzzy_entries(self):
        """
        Convenience method that returns the list of entries carrying the
        ``fuzzy`` flag.
        """
        return [entry for entry in self if 'fuzzy' in entry.flags]

    def obsolete_entries(self):
        """
        Convenience method that returns the list of obsolete entries.
        """
        return [entry for entry in self if entry.obsolete]

    def merge(self, refpot):
        """
        Convenience method that merges the current pofile with the pot file
        provided, in the spirit of the gettext msgmerge utility.

        Every entry of ``refpot`` is merged (through the ``merge`` method of
        the entry) into the entry of this file having the same msgid; a new
        :class:`~polib.POEntry` is appended first when there is none.
        Afterwards every entry of this file whose msgid cannot be found in
        ``refpot`` is marked as obsolete.

        Keyword argument:

        ``refpot``
            object POFile, the reference catalog.
        """
        for ref_entry in refpot:
            target = self.find(ref_entry.msgid, include_obsolete_entries=True)
            if target is None:
                target = POEntry()
                self.append(target)
            target.merge(ref_entry)
        # ok, now we must "obsolete" entries that are not in the refpot anymore
        for own_entry in self:
            if refpot.find(own_entry.msgid) is None:
                own_entry.obsolete = True
# }}} | ||||
# class MOFile {{{ | ||||
class MOFile(_BaseFile):
    """
    Mo file reader/writer.

    This class inherits the :class:`~polib._BaseFile` class and, by
    extension, the python ``list`` type.
    """
    # magic numbers identifying the byte order of a mo file; these are the
    # same four bytes read in opposite orders
    BIG_ENDIAN = 0xde120495
    LITTLE_ENDIAN = 0x950412de

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts all keywords arguments accepted by
        :class:`~polib._BaseFile` class.
        """
        _BaseFile.__init__(self, *args, **kwargs)
        # no magic number is known until one is assigned to the instance
        self.magic_number = None
        self.version = 0

    def save_as_pofile(self, fpath):
        """
        Saves the mofile as a pofile to ``fpath``.

        Keyword argument:

        ``fpath``
            string, full or relative path to the file.
        """
        # the default ``repr_method`` of _BaseFile.save() writes the textual
        # representation of the file
        _BaseFile.save(self, fpath)

    def save(self, fpath=None):
        """
        Saves the mofile to ``fpath``, using its binary representation.

        Keyword argument:

        ``fpath``
            string, full or relative path to the file.
        """
        _BaseFile.save(self, fpath, 'to_binary')

    def percent_translated(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Always returns ``100``.
        """
        return 100

    def translated_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Returns the file itself (not a copy).
        """
        return self

    def untranslated_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Always returns an empty list.
        """
        return []

    def fuzzy_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Always returns an empty list.
        """
        return []

    def obsolete_entries(self):
        """
        Convenience method to keep the same interface with POFile instances.
        Always returns an empty list.
        """
        return []
# }}} | ||||
# class _BaseEntry {{{ | ||||
class _BaseEntry(object):
    """
    Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes.
    This class should **not** be instantiated directly.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``msgid``
            string, the entry msgid (default: empty string).

        ``msgstr``
            string, the entry msgstr (default: empty string).

        ``msgid_plural``
            string, the entry msgid_plural (default: empty string).

        ``msgstr_plural``
            the entry msgstr_plural strings, keyed by plural index
            (default: a new empty dict).

        ``msgctxt``
            string, the entry context (msgctxt) (default: ``None``).

        ``obsolete``
            bool, whether the entry is "obsolete" or not (default: ``False``).

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).
        """
        option = kwargs.get
        self.msgid = option('msgid', '')
        self.msgstr = option('msgstr', '')
        self.msgid_plural = option('msgid_plural', '')
        self.msgstr_plural = option('msgstr_plural', {})
        self.msgctxt = option('msgctxt', None)
        self.obsolete = option('obsolete', False)
        self.encoding = option('encoding', default_encoding)

    def __unicode__(self, wrapwidth=78):
        """
        Returns the unicode representation of the entry, wrapped at
        ``wrapwidth`` columns. Every line of an obsolete entry is prefixed
        with ``#~ ``.
        """
        delflag = '#~ ' if self.obsolete else ''
        out = []
        # the context comes first, when there is one
        if self.msgctxt is not None:
            out.extend(self._str_field("msgctxt", delflag, "", self.msgctxt, wrapwidth))
        # then the msgid
        out.extend(self._str_field("msgid", delflag, "", self.msgid, wrapwidth))
        # and its plural form, if any
        if self.msgid_plural:
            out.extend(self._str_field("msgid_plural", delflag, "", self.msgid_plural, wrapwidth))
        if self.msgstr_plural:
            # one indexed msgstr per plural form, in index order
            for index in sorted(self.msgstr_plural):
                out.extend(self._str_field("msgstr", delflag, '[%s]' % index,
                                           self.msgstr_plural[index], wrapwidth))
        else:
            # a single, plain msgstr
            out.extend(self._str_field("msgstr", delflag, "", self.msgstr, wrapwidth))
        out.append('')
        text = '\n'.join(out)
        if not isinstance(text, types.UnicodeType):
            return unicode(text, self.encoding)
        return text

    def __str__(self):
        """
        Returns the string representation of the entry, i.e. its unicode
        representation encoded with the entry encoding.
        """
        return unicode(self).encode(self.encoding)

    def __eq__(self, other):
        """
        Two entries are equal when their unicode representations are equal.
        """
        return unicode(self) == unicode(other)

    def _str_field(self, fieldname, delflag, plural_index, field, wrapwidth=78):
        """
        Returns the list of output lines for one field of the entry.

        ``fieldname`` is the keyword to write (a ``previous_`` prefix is
        dropped), ``delflag`` is prepended to every line, ``plural_index`` is
        appended to the keyword, ``field`` is the value and ``wrapwidth`` the
        column at which a single-line value gets wrapped (a value that is not
        greater than ``0`` disables the wrapping).
        """
        chunks = field.splitlines(True)
        if len(chunks) > 1:
            # multi-line value: the keyword line carries an empty string
            chunks = [''] + chunks
        else:
            escaped_field = escape(field)
            specialchars_count = sum(
                field.count(char) for char in ['\\', '\n', '\r', '\t', '"'])
            # comparison must take into account fieldname length + one space
            # + 2 quotes (eg. msgid "<string>")
            flength = len(fieldname) + 3
            if plural_index:
                flength += len(plural_index)
            real_wrapwidth = wrapwidth - flength + specialchars_count
            if wrapwidth > 0 and len(field) > real_wrapwidth:
                # Wrap the line but take field name into account
                wrapped = textwrap.wrap(
                    escaped_field,
                    wrapwidth - 2,  # 2 for quotes ""
                    drop_whitespace=False,
                    break_long_words=False
                )
                chunks = [''] + [unescape(item) for item in wrapped]
            else:
                chunks = [field]
        if fieldname.startswith('previous_'):
            # quick and dirty trick to get the real field name
            fieldname = fieldname[9:]
        first, remaining = chunks[0], chunks[1:]
        ret = ['%s%s%s "%s"' % (delflag, fieldname, plural_index,
                                escape(first))]
        ret.extend('%s"%s"' % (delflag, escape(mstr)) for mstr in remaining)
        return ret
# }}} | ||||
# class POEntry {{{ | ||||
class POEntry(_BaseEntry):
    """
    Represents a po file entry.
    """

    def __init__(self, *args, **kwargs):
        """
        Constructor, accepts the following keyword arguments:

        ``comment``
            string, the entry comment.
        ``tcomment``
            string, the entry translator comment.
        ``occurrences``
            list, the entry occurrences.
        ``flags``
            list, the entry flags.
        ``previous_msgctxt``
            string, the entry previous context.
        ``previous_msgid``
            string, the entry previous msgid.
        ``previous_msgid_plural``
            string, the entry previous msgid_plural.
        """
        _BaseEntry.__init__(self, *args, **kwargs)
        self.comment = kwargs.get('comment', '')
        self.tcomment = kwargs.get('tcomment', '')
        self.occurrences = kwargs.get('occurrences', [])
        self.flags = kwargs.get('flags', [])
        self.previous_msgctxt = kwargs.get('previous_msgctxt', None)
        self.previous_msgid = kwargs.get('previous_msgid', None)
        self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None)

    def __unicode__(self, wrapwidth=78):
        """
        Returns the unicode representation of the entry.

        ``wrapwidth``
            integer, the wrap width; a value <= 0 disables the wrapping of
            comments and occurrences.
        """
        if self.obsolete:
            # obsolete entries are fully rendered by the base class
            return _BaseEntry.__unicode__(self, wrapwidth)

        ret = []
        # comments first, if any (with text wrapping as xgettext does)
        comments = [('comment', '#. '), ('tcomment', '# ')]
        for attrname, prefix in comments:
            val = getattr(self, attrname)
            if not val:
                continue
            for comment in val.split('\n'):
                if wrapwidth > 0 and len(comment) + len(prefix) > wrapwidth:
                    ret += textwrap.wrap(
                        comment,
                        wrapwidth,
                        initial_indent=prefix,
                        subsequent_indent=prefix,
                        break_long_words=False
                    )
                else:
                    ret.append('%s%s' % (prefix, comment))

        # occurrences (with text wrapping as xgettext does)
        if self.occurrences:
            filelist = []
            for fpath, lineno in self.occurrences:
                if lineno:
                    filelist.append('%s:%s' % (fpath, lineno))
                else:
                    filelist.append(fpath)
            filestr = ' '.join(filelist)
            if wrapwidth > 0 and len(filestr) + 3 > wrapwidth:
                # textwrap splits words that contain hyphens, this is not
                # what we want for filenames, so the dirty hack is to
                # temporarily replace hyphens with "*" while wrapping
                wrapped = textwrap.wrap(
                    filestr.replace('-', '*'),
                    wrapwidth,
                    initial_indent='#: ',
                    subsequent_indent='#: ',
                    break_long_words=False
                )
                ret += [item.replace('*', '-') for item in wrapped]
            else:
                ret.append('#: ' + filestr)

        # flags (TODO: wrapping ?)
        if self.flags:
            ret.append('#, %s' % ', '.join(self.flags))

        # previous context and previous msgid/msgid_plural
        fields = ['previous_msgctxt', 'previous_msgid', 'previous_msgid_plural']
        for f in fields:
            val = getattr(self, f)
            if val:
                ret += self._str_field(f, "#| ", "", val, wrapwidth)

        ret.append(_BaseEntry.__unicode__(self, wrapwidth))
        ret = '\n'.join(ret)
        if not isinstance(ret, unicode):
            return unicode(ret, self.encoding)
        return ret

    def __cmp__(self, other):
        """
        Called by comparison operations if rich comparison is not defined.

        Entries are ordered by, in that order: the obsolete flag (obsolete
        entries first), their sorted occurrences and finally their msgid.
        Returns -1, 0 or 1.
        """
        # First: Obsolete test
        if self.obsolete != other.obsolete:
            if self.obsolete:
                return -1
            return 1

        # sorted() works on copies, so the original lists are left
        # untouched; occurrences are ordered by file then by line
        def occurrence_key(occurrence):
            return (occurrence[0], occurrence[1])
        occ1 = sorted(self.occurrences, key=occurrence_key)
        occ2 = sorted(other.occurrences, key=occurrence_key)

        # Comparing sorted occurrences pairwise
        for entry1, entry2 in zip(occ1, occ2):
            for idx in (0, 1):
                if entry1[idx] != entry2[idx]:
                    if entry1[idx] > entry2[idx]:
                        return 1
                    return -1
        # one list is a prefix of the other: the shortest sorts first
        if len(occ1) != len(occ2):
            if len(occ1) > len(occ2):
                return 1
            return -1

        # Finally: Compare message ID
        if self.msgid > other.msgid:
            return 1
        if self.msgid < other.msgid:
            return -1
        return 0

    def translated(self):
        """
        Returns ``True`` if the entry has been translated or ``False``
        otherwise.

        Obsolete and fuzzy entries are never considered translated. A
        plural entry is translated only if none of its plural forms is
        empty.
        """
        if self.obsolete or 'fuzzy' in self.flags:
            return False
        if self.msgstr != '':
            return True
        if self.msgstr_plural:
            for pos in self.msgstr_plural:
                if self.msgstr_plural[pos] == '':
                    return False
            return True
        return False

    def merge(self, other):
        """
        Merge the current entry with the given pot entry.

        Everything but the translations is taken from ``other``; the
        ``fuzzy`` flag of the current entry is kept.
        """
        self.msgid = other.msgid
        self.msgctxt = other.msgctxt
        self.occurrences = other.occurrences
        self.comment = other.comment
        fuzzy = 'fuzzy' in self.flags
        self.flags = other.flags[:]  # clone flags
        if fuzzy and 'fuzzy' not in self.flags:
            self.flags.append('fuzzy')
        self.msgid_plural = other.msgid_plural
        self.obsolete = other.obsolete
        self.previous_msgctxt = other.previous_msgctxt
        self.previous_msgid = other.previous_msgid
        self.previous_msgid_plural = other.previous_msgid_plural
        if other.msgstr_plural:
            for pos in other.msgstr_plural:
                # keep existing translation at pos if any
                if pos not in self.msgstr_plural:
                    self.msgstr_plural[pos] = ''
# }}} | ||||
# class MOEntry {{{ | ||||
class MOEntry(_BaseEntry):
    """
    Represents a mo file entry.

    The class body adds nothing to its base class.
    """
Wagner Bruna
|
r11387 | |||
# }}} | ||||
# class _POFileParser {{{ | ||||
class _POFileParser(object):
    """
    A finite state machine to parse efficiently and correctly po
    file format.
    """

    # symbol matching each keyword found at the beginning of a line
    _KEYWORDS = {
        'msgctxt': 'CT',
        'msgid': 'MI',
        'msgstr': 'MS',
        'msgid_plural': 'MP',
    }

    # symbol matching each keyword found after a "#|" marker
    _PREV_KEYWORDS = {
        'msgid_plural': 'PP',
        'msgid': 'PM',
        'msgctxt': 'PC',
    }

    def __init__(self, pofile, *args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``pofile``
            string, path to the po file or its content

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).
        """
        enc = kwargs.get('encoding', default_encoding)
        if os.path.exists(pofile):
            try:
                self.fhandle = codecs.open(pofile, 'rU', enc)
            except LookupError:
                # unknown encoding, fall back to the default one
                enc = default_encoding
                self.fhandle = codecs.open(pofile, 'rU', enc)
        else:
            # not an existing path: the argument is the po content itself
            self.fhandle = pofile.splitlines()

        self.instance = POFile(
            pofile=pofile,
            encoding=enc,
            check_for_duplicates=kwargs.get('check_for_duplicates', False)
        )
        self.transitions = {}
        self.current_entry = POEntry()
        self.current_state = 'ST'
        self.current_token = None
        # two memo flags used in handlers
        self.msgstr_index = 0
        self.entry_obsolete = 0
        # Configure the state machine, by adding transitions.
        # Signification of symbols:
        #     * ST: Beginning of the file (start)
        #     * HE: Header
        #     * TC: a translation comment
        #     * GC: a generated comment
        #     * OC: a file/line occurrence
        #     * FL: a flags line
        #     * CT: a message context
        #     * PC: a previous msgctxt
        #     * PM: a previous msgid
        #     * PP: a previous msgid_plural
        #     * MI: a msgid
        #     * MP: a msgid plural
        #     * MS: a msgstr
        #     * MX: a msgstr plural
        #     * MC: a msgid or msgstr continuation line
        all_states = ['ST', 'HE', 'GC', 'OC', 'FL', 'CT', 'PC', 'PM', 'PP',
                      'TC', 'MS', 'MP', 'MX', 'MI']

        self.add('TC', ['ST', 'HE'], 'HE')
        self.add('TC', ['GC', 'OC', 'FL', 'TC', 'PC', 'PM', 'PP', 'MS',
                        'MP', 'MX', 'MI'], 'TC')
        self.add('GC', all_states, 'GC')
        self.add('OC', all_states, 'OC')
        self.add('FL', all_states, 'FL')
        self.add('PC', all_states, 'PC')
        self.add('PM', all_states, 'PM')
        self.add('PP', all_states, 'PP')
        self.add('CT', ['ST', 'HE', 'GC', 'OC', 'FL', 'TC', 'PC', 'PM',
                        'PP', 'MS', 'MX'], 'CT')
        self.add('MI', ['ST', 'HE', 'GC', 'OC', 'FL', 'CT', 'TC', 'PC',
                        'PM', 'PP', 'MS', 'MX'], 'MI')
        self.add('MP', ['TC', 'GC', 'PC', 'PM', 'PP', 'MI'], 'MP')
        self.add('MS', ['MI', 'MP', 'TC'], 'MS')
        self.add('MX', ['MI', 'MX', 'MP', 'TC'], 'MX')
        self.add('MC', ['CT', 'MI', 'MP', 'MS', 'MX', 'PM', 'PP', 'PC'], 'MC')

    def parse(self):
        """
        Run the state machine, parse the file line by line and call process()
        with the current matched symbol.

        Returns the ``POFile`` instance built in the constructor. Raises
        ``IOError`` on a syntax error. An opened file is always closed,
        even when parsing fails.
        """
        try:
            for linenum, line in enumerate(self.fhandle, 1):
                self._parse_line(line.strip(), linenum)

            if self.current_entry:
                # since entries are added when another entry is found, we
                # must add the last entry here (only if there are lines)
                self.instance.append(self.current_entry)
            self._extract_metadata()
        finally:
            # codecs.open() does not return a ``file`` object, so anything
            # that is not the list built from raw content must be closed
            if not isinstance(self.fhandle, list):
                self.fhandle.close()
        return self.instance

    def _parse_line(self, line, linenum):
        """Find the symbol matching a stripped line and process it."""
        if line == '':
            return

        tokens = line.split(None, 2)
        nb_tokens = len(tokens)
        if tokens[0] == '#~' and nb_tokens > 1:
            # obsolete entry: drop the marker and parse what follows
            line = line[3:].strip()
            tokens = tokens[1:]
            nb_tokens -= 1
            self.entry_obsolete = 1
        else:
            self.entry_obsolete = 0

        # Take care of keywords like
        # msgid, msgid_plural, msgctxt & msgstr.
        if tokens[0] in self._KEYWORDS and nb_tokens > 1:
            self.current_token = line[len(tokens[0]):].lstrip()
            self.process(self._KEYWORDS[tokens[0]], linenum)
            return

        self.current_token = line

        if tokens[0] == '#:' and nb_tokens > 1:
            # we are on a occurrences line
            self.process('OC', linenum)
        elif line[:1] == '"':
            # we are on a continuation line
            self.process('MC', linenum)
        elif line[:7] == 'msgstr[':
            # we are on a msgstr plural
            self.process('MX', linenum)
        elif tokens[0] == '#,' and nb_tokens > 1:
            # we are on a flags line
            self.process('FL', linenum)
        elif tokens[0] == '#':
            # we are on a translator comment line
            self.process('TC', linenum)
        elif tokens[0] == '#.' and nb_tokens > 1:
            # we are on a generated comment line
            self.process('GC', linenum)
        elif tokens[0] == '#|':
            self._parse_previous_line(line, tokens, linenum)
        else:
            # no transition exists for '??': process() raises IOError
            self.process('??', linenum)

    def _parse_previous_line(self, line, tokens, linenum):
        """Process a "#|" (previous translation) comment line."""
        nb_tokens = len(tokens)
        if nb_tokens < 2:
            # no transition exists for '??': process() raises IOError
            self.process('??', linenum)
            return

        # Remove the marker and any whitespace right after that.
        line = line[2:].lstrip()
        self.current_token = line

        if tokens[1].startswith('"'):
            # Continuation of previous metadata.
            self.process('MC', linenum)
            return

        if nb_tokens == 2:
            # Invalid continuation line.
            self.process('??', linenum)

        # we are on a "previous translation" comment line,
        if tokens[1] not in self._PREV_KEYWORDS:
            # Unknown keyword in previous translation comment.
            self.process('??', linenum)

        # Remove the keyword and any whitespace
        # between it and the starting quote.
        self.current_token = line[len(tokens[1]):].lstrip()
        self.process(self._PREV_KEYWORDS[tokens[1]], linenum)

    def _extract_metadata(self):
        """
        If the first entry has an empty msgid, remove it from the
        instance and store its content in the instance metadata dict.
        """
        firstentry = self.instance[0]
        if firstentry.msgid != '':
            return
        # metadata found, remove the entry
        firstentry = self.instance.pop(0)
        self.instance.metadata_is_fuzzy = firstentry.flags
        key = None
        for msg in firstentry.msgstr.splitlines():
            try:
                key, val = msg.split(':', 1)
            except ValueError:
                # no colon: the line continues the previous metadata
                if key is not None:
                    self.instance.metadata[key] += '\n' + msg.strip()
            else:
                self.instance.metadata[key] = val.strip()

    def add(self, symbol, states, next_state):
        """
        Add a transition to the state machine.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        ``states``
            list, a list of states (two chars symbols).

        ``next_state``
            the next state the fsm will have after the action.
        """
        action = getattr(self, 'handle_%s' % next_state.lower())
        for state in states:
            self.transitions[(symbol, state)] = (action, next_state)

    def process(self, symbol, linenum):
        """
        Process the transition corresponding to the current state and the
        symbol provided.

        Keywords arguments:

        ``symbol``
            string, the matched token (two chars symbol).

        ``linenum``
            integer, the current line number of the parsed file.

        Raises ``IOError`` when there is no transition for the symbol in
        the current state, or when the handler fails.
        """
        try:
            (action, state) = self.transitions[(symbol, self.current_state)]
            # a handler returning a false value keeps the current state
            if action():
                self.current_state = state
        except Exception:
            raise IOError('Syntax error in po file (line %s)' % linenum)

    # state handlers

    def _begin_entry(self):
        """
        Store the current entry and start a new one when the previous
        state shows that the current entry already got its msgstr.
        """
        if self.current_state in ['MC', 'MS', 'MX']:
            self.instance.append(self.current_entry)
            self.current_entry = POEntry()

    def handle_he(self):
        """Handle a header comment."""
        if self.instance.header != '':
            self.instance.header += '\n'
        self.instance.header += self.current_token[2:]
        return True

    def handle_tc(self):
        """Handle a translator comment."""
        self._begin_entry()
        if self.current_entry.tcomment != '':
            self.current_entry.tcomment += '\n'
        self.current_entry.tcomment += self.current_token[2:]
        return True

    def handle_gc(self):
        """Handle a generated comment."""
        self._begin_entry()
        if self.current_entry.comment != '':
            self.current_entry.comment += '\n'
        self.current_entry.comment += self.current_token[3:]
        return True

    def handle_oc(self):
        """Handle a file:num occurrence."""
        self._begin_entry()
        for occurrence in self.current_token[3:].split():
            try:
                # split on the last colon only, the path may contain some
                fil, line = occurrence.rsplit(':', 1)
            except ValueError:
                fil, line = occurrence, ''
            else:
                if not line.isdigit():
                    # not a line number: keep the occurrence untouched
                    fil, line = occurrence, ''
            self.current_entry.occurrences.append((fil, line))
        return True

    def handle_fl(self):
        """Handle a flags line."""
        self._begin_entry()
        self.current_entry.flags += self.current_token[3:].split(', ')
        return True

    def handle_pp(self):
        """Handle a previous msgid_plural line."""
        self._begin_entry()
        self.current_entry.previous_msgid_plural = \
            unescape(self.current_token[1:-1])
        return True

    def handle_pm(self):
        """Handle a previous msgid line."""
        self._begin_entry()
        self.current_entry.previous_msgid = \
            unescape(self.current_token[1:-1])
        return True

    def handle_pc(self):
        """Handle a previous msgctxt line."""
        self._begin_entry()
        self.current_entry.previous_msgctxt = \
            unescape(self.current_token[1:-1])
        return True

    def handle_ct(self):
        """Handle a msgctxt."""
        self._begin_entry()
        self.current_entry.msgctxt = unescape(self.current_token[1:-1])
        return True

    def handle_mi(self):
        """Handle a msgid."""
        self._begin_entry()
        self.current_entry.obsolete = self.entry_obsolete
        self.current_entry.msgid = unescape(self.current_token[1:-1])
        return True

    def handle_mp(self):
        """Handle a msgid plural."""
        self.current_entry.msgid_plural = unescape(self.current_token[1:-1])
        return True

    def handle_ms(self):
        """Handle a msgstr."""
        self.current_entry.msgstr = unescape(self.current_token[1:-1])
        return True

    def handle_mx(self):
        """Handle a msgstr plural."""
        token = self.current_token
        # locate the index and the value instead of relying on fixed
        # offsets, which only fit ``msgstr[N] "..."`` with a single digit
        # and a single space
        closing = token.find(']')
        quote = token.find('"', closing + 1)
        if closing == -1 or quote == -1:
            # reported as a syntax error by process()
            raise ValueError('invalid msgstr plural line')
        index = token[7:closing]
        self.current_entry.msgstr_plural[index] = unescape(token[quote + 1:-1])
        self.msgstr_index = index
        return True

    def handle_mc(self):
        """Handle a msgid or msgstr continuation line."""
        # for "#|" lines the marker was already removed by the parser, so
        # the token is handled the same way for every state
        token = unescape(self.current_token[1:-1])
        entry = self.current_entry
        if self.current_state == 'CT':
            entry.msgctxt += token
        elif self.current_state == 'MI':
            entry.msgid += token
        elif self.current_state == 'MP':
            entry.msgid_plural += token
        elif self.current_state == 'MS':
            entry.msgstr += token
        elif self.current_state == 'MX':
            entry.msgstr_plural[self.msgstr_index] += token
        elif self.current_state == 'PP':
            entry.previous_msgid_plural += token
        elif self.current_state == 'PM':
            entry.previous_msgid += token
        elif self.current_state == 'PC':
            entry.previous_msgctxt += token
        # don't change the current state
        return False
# }}} | ||||
# class _MOFileParser {{{ | ||||
class _MOFileParser(object):
    """
    A class to parse binary mo files.
    """

    def __init__(self, mofile, *args, **kwargs):
        """
        Constructor.

        Keyword arguments:

        ``mofile``
            string, path to the mo file; it is opened in binary mode.

        ``encoding``
            string, the encoding to use, defaults to ``default_encoding``
            global variable (optional).

        ``check_for_duplicates``
            whether to check for duplicate entries when adding entries to the
            file (optional, default: ``False``).
        """
        self.fhandle = open(mofile, 'rb')
        self.instance = MOFile(
            fpath=mofile,
            encoding=kwargs.get('encoding', default_encoding),
            check_for_duplicates=kwargs.get('check_for_duplicates', False)
        )

    def parse(self):
        """
        Build the instance with the file handle provided in the
        constructor.

        Returns the ``MOFile`` instance. Raises ``IOError`` if the magic
        number is not a known one. The file is always closed, even when
        parsing fails.
        """
        try:
            self._parse()
        finally:
            # close opened file
            self.fhandle.close()
        return self.instance

    def _parse(self):
        """Read the header, the index tables and then every entry."""
        # parse magic number
        magic_number = self._readbinary('<I', 4)
        if magic_number == MOFile.LITTLE_ENDIAN:
            ii = '<II'
        elif magic_number == MOFile.BIG_ENDIAN:
            ii = '>II'
        else:
            raise IOError('Invalid mo file, magic number is incorrect !')
        self.instance.magic_number = magic_number
        # parse the version number and the number of strings
        self.instance.version, numofstrings = self._readbinary(ii, 8)
        # original strings and translation strings hash table offset
        msgids_hash_offset, msgstrs_hash_offset = self._readbinary(ii, 8)
        # move to msgid hash table and read length and offset of msgids
        self.fhandle.seek(msgids_hash_offset)
        msgids_index = [self._readbinary(ii, 8) for _ in range(numofstrings)]
        # move to msgstr hash table and read length and offset of msgstrs
        self.fhandle.seek(msgstrs_hash_offset)
        msgstrs_index = [self._readbinary(ii, 8) for _ in range(numofstrings)]
        # build entries
        for i in range(numofstrings):
            self.fhandle.seek(msgids_index[i][1])
            msgid = self.fhandle.read(msgids_index[i][0])
            self.fhandle.seek(msgstrs_index[i][1])
            msgstr = self.fhandle.read(msgstrs_index[i][0])
            if i == 0 and not msgid:
                # only an entry with an empty msgid holds the metadata; a
                # regular first entry must be kept as a message
                self.instance.metadata = self._parse_metadata(msgstr)
                continue
            # test if we have a plural entry
            msgid_tokens = msgid.split('\0')
            if len(msgid_tokens) > 1:
                entry = self._build_entry(
                    msgid=msgid_tokens[0],
                    msgid_plural=msgid_tokens[1],
                    msgstr_plural=dict(enumerate(msgstr.split('\0')))
                )
            else:
                entry = self._build_entry(msgid=msgid, msgstr=msgstr)
            self.instance.append(entry)

    def _parse_metadata(self, msgstr):
        """Turn the "key: value" lines of the header into a dict."""
        metadata = {}
        for line in msgstr.split('\n'):
            tokens = line.split(':', 1)
            if tokens[0] != '':
                try:
                    metadata[tokens[0]] = tokens[1].strip()
                except IndexError:
                    # line without colon: key with an empty value
                    metadata[tokens[0]] = ''
        return metadata

    def _build_entry(self, msgid, msgstr=None, msgid_plural=None,
                     msgstr_plural=None):
        """
        Build a ``MOEntry`` from the given strings, only passing the
        values which are not empty. If the msgid contains the ``\\x04``
        separator, what comes before it is used as the msgctxt.
        """
        # split once so that nothing after a second separator is dropped
        msgctxt_msgid = msgid.split('\x04', 1)
        if len(msgctxt_msgid) > 1:
            kwargs = {
                'msgctxt': msgctxt_msgid[0],
                'msgid': msgctxt_msgid[1],
            }
        else:
            kwargs = {'msgid': msgid}
        if msgstr:
            kwargs['msgstr'] = msgstr
        if msgid_plural:
            kwargs['msgid_plural'] = msgid_plural
        if msgstr_plural:
            kwargs['msgstr_plural'] = msgstr_plural
        return MOEntry(**kwargs)

    def _readbinary(self, fmt, numbytes):
        """
        Private method that unpack n bytes of data using format <fmt>.
        It returns a tuple or a mixed value if the tuple length is 1.
        """
        data = self.fhandle.read(numbytes)
        tup = struct.unpack(fmt, data)
        if len(tup) == 1:
            return tup[0]
        return tup
# }}} | ||||