Add deeper XML checking to update-translation script

- Catch problems such as mismatched formatting characters, and remove messages that could cause problems at runtime.
- Also remove unfinished/untranslated messages; they just take up space in the .ts files and waste parsing time.

Fixes #4774.
10 years ago · da59f28335
1 changed files with 134 additions and 14 deletions
--- a/contrib/devtools/update-translations.py
+++ b/contrib/devtools/update-translations.py
@ -14,13 +14,14 @@ It will do the following automatically:
 TODO:
 - auto-add new translations to the build system according to the translation process
 - remove 'unfinished' translation items
 '''
 from __future__ import division, print_function
 import subprocess
 import re
 import sys
 import os
 import io
 import xml.etree.ElementTree as ET
 # Name of transifex tool
 TX = 'tx'
@ -40,24 +41,143 @@ def fetch_all_translations():
        print('Error while fetching translations', file=sys.stderr)
        exit(1)
-def postprocess_translations():
+def find_format_specifiers(s):
-    print('Postprocessing...')
+    '''Find all format specifiers in a string.'''
    # Returns a list holding the single character that follows each '%' in s,
    # in order of appearance. A '%%' pair yields one '%' entry, because the
    # scan resumes two characters past each '%' that was found.
    pos = 0
    specifiers = []
    while True:
        percent = s.find('%', pos)
        if percent < 0:
            break
        # Raises IndexError when '%' is the last character of s;
        # check_format_specifiers reports that as a parse error for translations.
        specifiers.append(s[percent+1])
        pos = percent+2
    return specifiers
 def split_format_specifiers(specifiers):
    '''Partition format specifiers into numeric (Qt) and other (strprintf) ones.

    Returns a (numeric, other) tuple. numeric is a set, since Qt placeholders
    may appear in any order; other is a list, since strprintf specifiers must
    keep the order in which they were given.
    '''
    qt_digits = frozenset('123456789')
    numeric = set()
    other = []
    for spec in specifiers:
        if spec in qt_digits:
            numeric.add(spec)
        else:
            other.append(spec)
    return numeric, other
 def sanitize_string(s):
    '''Return s with each newline replaced by a space, for one-line printing.'''
    pieces = s.split('\n')
    return ' '.join(pieces)
 def check_format_specifiers(source, translation, errors):
    '''Check that translation uses the same format specifiers as source.

    Appends a human-readable message to errors and returns False when the
    translation cannot be parsed or its specifiers differ from the source's;
    returns True otherwise.
    '''
    expected = split_format_specifiers(find_format_specifiers(source))
    qt_specifiers, printf_specifiers = expected
    # A source message must not mix Qt and strprintf format specifiers.
    # If this fails, go change the source as this is hacky and confusing!
    assert not (qt_specifiers and printf_specifiers)
    try:
        found = split_format_specifiers(find_format_specifiers(translation))
    except IndexError:
        # the translation ends in a dangling '%'
        errors.append("Parse error in translation '%s'" % sanitize_string(translation))
        return False
    if expected != found:
        errors.append("Mismatch between '%s' and '%s'" % (sanitize_string(source), sanitize_string(translation)))
        return False
    return True
 def all_ts_files(suffix=''):
    '''Yield (filename, filepath) for each translation file in LOCALE_DIR.

    Only names ending in '.ts' + suffix are considered, and the source-language
    file is skipped. The suffix is stripped from the yielded filename, and
    filepath is built from that stripped name, so a caller that passed a suffix
    has to re-append it to reach the file on disk.
    '''
    for filename in os.listdir(LOCALE_DIR):
        # process only language files, and do not process source language
-        if not filename.endswith('.ts') or filename == SOURCE_LANG: 
+        if not filename.endswith('.ts'+suffix) or filename == SOURCE_LANG+suffix:
            continue
        if suffix: # remove provided suffix
            filename = filename[0:-len(suffix)]
        filepath = os.path.join(LOCALE_DIR, filename)
-        with open(filepath, 'rb') as f:
+        yield(filename, filepath)
 FIX_RE = re.compile(b'[\x00-\x09\x0b\x0c\x0e-\x1f]')
 def remove_invalid_characters(s):
    '''Return the byte string s with every byte matched by FIX_RE deleted.

    FIX_RE matches 0x00-0x09, 0x0b, 0x0c and 0x0e-0x1f, so newline (0x0a) and
    carriage return (0x0d) are the only bytes below 0x20 that are kept.
    '''
    # NOTE(review): the range includes tab (0x09), which XML 1.0 allows --
    # confirm that dropping tabs is intended.
    cleaned, _count = FIX_RE.subn(b'', s)
    return cleaned
 # Replacement for ElementTree's cdata escape function that makes our output match Qt's
 # (optional, only useful for cleaner diffs when comparing; disabled by default)
 _orig_escape_cdata = None
 def escape_cdata(text):
    '''Escape text with the saved original escaper, then also escape quotes.

    _orig_escape_cdata must have been set to a callable before this is used;
    single quotes become &apos; and double quotes become &quot; afterwards.
    '''
    escaped = _orig_escape_cdata(text)
    return escaped.replace("'", '&apos;').replace('"', '&quot;')
 def postprocess_translations(reduce_diff_hacks=False):
    '''Validate and clean every translation file in LOCALE_DIR.

    Each file is renamed to '<name>.orig', stripped of disallowed control
    characters, parsed as XML and checked message by message: translations
    whose format specifiers do not match the source are cleared and marked
    unfinished, <location> tags are dropped, and unfinished messages are
    removed. The cleaned tree is written back to the original path.

    With reduce_diff_hacks=True, ET._escape_cdata is replaced by escape_cdata
    and ' />' is rewritten to '/>' in the output, to resemble Qt's formatting.

    Returns True if any translation failed the format-specifier check.
    '''
    print('Checking and postprocessing...')
    if reduce_diff_hacks:
        # NOTE(review): a second call with reduce_diff_hacks=True would store
        # escape_cdata itself in _orig_escape_cdata, making it recurse --
        # confirm this function is only ever called once per process.
        global _orig_escape_cdata
        _orig_escape_cdata = ET._escape_cdata
        ET._escape_cdata = escape_cdata
    # move every translation file aside first; the loop below reads the '.orig' copies
    for (filename,filepath) in all_ts_files():
        os.rename(filepath, filepath+'.orig')
    have_errors = False
    # all_ts_files('.orig') yields names/paths with the suffix already stripped
    for (filename,filepath) in all_ts_files('.orig'):
        # pre-fixups to cope with transifex output
        parser = ET.XMLParser(encoding='utf-8') # need to override encoding because 'utf8' is not understood, only 'utf-8'
        with open(filepath + '.orig', 'rb') as f:
            data = f.read()
-        # remove non-allowed control characters
+        # remove control characters; this must be done over the entire file otherwise the XML parser will fail
-        data = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', data)
+        data = remove_invalid_characters(data)
-        data = data.split('\n')
+        tree = ET.parse(io.BytesIO(data), parser=parser)
-        # strip locations from non-origin translation
+
-        # location tags are used to guide translators, they are not necessary for compilation
+        # iterate over all messages in file
-        # TODO: actually process XML instead of relying on Transifex's one-tag-per-line output format
+        root = tree.getroot()
-        data = [line for line in data if not '<location' in line]
+        for context in root.findall('context'):
-        with open(filepath, 'wb') as f:
+            for message in context.findall('message'):
-            f.write('\n'.join(data))
+                numerus = message.get('numerus') == 'yes'
                # NOTE(review): .text is None for an empty <source/>, which would
                # fail inside find_format_specifiers -- presumably never occurs; verify.
                source = message.find('source').text
                translation_node = message.find('translation')
                # pick all numerusforms
                if numerus:
                    translations = [i.text for i in translation_node.findall('numerusform')]
                else:
                    translations = [translation_node.text]
                for translation in translations:
                    # an element without text (empty translation) has nothing to check
                    if translation is None:
                        continue
                    errors = []
                    valid = check_format_specifiers(source, translation, errors)
                    for error in errors:
                        print('%s: %s' % (filename, error))
                    if not valid: # set type to unfinished and clear string if invalid
                        # clear() resets the whole node, so for numerus messages one
                        # bad form discards every numerusform child
                        translation_node.clear()
                        translation_node.set('type', 'unfinished')
                        have_errors = True
                # Remove location tags
                for location in message.findall('location'):
                    message.remove(location)
                # Remove entire message if it is an unfinished translation
                # (safe while looping: findall() returned a list, not a live view)
                if translation_node.get('type') == 'unfinished':
                    context.remove(message)
        # write fixed-up tree
        # if diff reduction requested, replace some XML to 'sanitize' to qt formatting
        if reduce_diff_hacks:
            out = io.BytesIO()
            tree.write(out, encoding='utf-8')
            out = out.getvalue()
            out = out.replace(b' />', b'/>')
            with open(filepath, 'wb') as f:
                f.write(out)
        else:
            tree.write(filepath, encoding='utf-8')
    return have_errors
 if __name__ == '__main__':
    check_at_repository_root()