Update how similar-message matching is handled when updating po files from the pot one (gains roughly 20% in heavy update situations, and saves a nice amount of memory!).
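The core of the change is the new get_best_similar() helper shown further down: instead of running difflib.get_close_matches() over the whole pool of existing msgids, it keeps only the single best match, pre-filters candidates by length, and uses SequenceMatcher's cheap ratio bounds before paying for the full ratio(). A minimal, self-contained sketch of that strategy (the function name best_similar_msgid, the cutoff value and the sample strings are illustrative only, not taken from the commit):

import difflib

def best_similar_msgid(msgid, candidates, cutoff=0.75):
    """Return the candidate closest to msgid (or None), using the same length / quick-ratio pre-filters."""
    best = None
    s = difflib.SequenceMatcher()
    s.set_seq2(msgid)  # SequenceMatcher caches info about seq2, so set it only once
    min_len, max_len = len(msgid) // 2, len(msgid) * 2
    for cand in candidates:
        if not (min_len < len(cand) < max_len):
            continue  # length alone makes a ~0.7+ ratio impossible, skip the expensive checks
        s.set_seq1(cand)
        # Cheap upper bounds first; the costly ratio() only runs when both pass the cutoff.
        if s.real_quick_ratio() >= cutoff and s.quick_ratio() >= cutoff:
            ratio = s.ratio()
            if ratio >= cutoff:
                best, cutoff = cand, ratio  # raise the bar so later candidates must match at least as well
    return best

# e.g. picks the case-changed msgid as the closest existing message:
print(best_similar_msgid("Bake ambient occlusion", ["Bake Ambient Occlusion", "Bake Normals", "Render"]))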

Bastien Montagne
2013-02-12 17:32:54 +00:00
parent dcea2800a7
commit f2d9fc7e25
4 changed files with 96 additions and 29 deletions


@@ -387,7 +387,7 @@ def dump_py_messages_from_files(messages, check_ctxt, files):
estr_ls.append(estr)
nds_ls.extend(nds)
ret = _extract_string_merge(estr_ls, nds_ls)
print(ret)
#print(ret)
return ret
def extract_strings_split(node):


@@ -225,6 +225,7 @@ dict_uimsgs = {
"loc", "rot", "pos",
"lorem",
"luma",
"mem",
"multicam",
"num",
"ok",


@@ -97,10 +97,13 @@ def main():
        if os.path.exists(po):
            pool_data.append((po, lang, pot_msgs))
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for r in executor.map(process_po, pool_data, timeout=600):
            if r != 0:
                ret = r
    for r in map(process_po, pool_data):
        if r != 0:
            ret = r
    #with concurrent.futures.ProcessPoolExecutor() as executor:
        #for r in executor.map(process_po, pool_data, timeout=600):
            #if r != 0:
                #ret = r
    return ret


@@ -21,6 +21,7 @@
# Some misc utilities...
import collections
import concurrent.futures
import copy
import os
import re
@@ -61,6 +62,35 @@ def is_valid_po_path(path):
    return bool(_valid_po_path_re.match(path))


def get_best_similar(data):
    import difflib
    key, use_similar, similar_pool = data
    # try to find some close key in existing messages...
    # Optimized code inspired by difflib.get_close_matches (as we only need the best match).
    # We also never make a match when len differs by more than -len_key / 2 or +len_key * 2 (which is valid
    # as long as use_similar is not below ~0.7).
    # Gives an overall ~20% of improvement!
    #tmp = difflib.get_close_matches(key[1], similar_pool, n=1, cutoff=use_similar)
    #if tmp:
        #tmp = tmp[0]
    tmp = None
    s = difflib.SequenceMatcher()
    s.set_seq2(key[1])
    len_key = len(key[1])
    min_len = len_key // 2
    max_len = len_key * 2
    for x in similar_pool:
        if min_len < len(x) < max_len:
            s.set_seq1(x)
            if s.real_quick_ratio() >= use_similar and s.quick_ratio() >= use_similar:
                sratio = s.ratio()
                if sratio >= use_similar:
                    tmp = x
                    use_similar = sratio
    return key, tmp
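As a sanity check on the length bounds used in get_best_similar() (my own arithmetic, not part of the commit): SequenceMatcher.ratio() is 2 * M / (len(a) + len(b)) with M at most min(len(a), len(b)), so once one string is more than twice as long as the other the ratio can never exceed 2/3, which is safely below any use_similar cutoff of ~0.7 or higher:

# Best possible ratio() when the candidate is exactly twice as long as the key (i.e. M == len_key):
len_key = 40
best_possible = 2 * len_key / (len_key + 2 * len_key)
print(best_possible)  # 0.666..., so skipping such candidates cannot lose a valid match when the cutoff is >= ~0.7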
class I18nMessage:
"""
Internal representation of a message.
@@ -233,40 +263,73 @@ class I18nMessages:
        existing one. Messages no longer found in ref will be marked as commented if keep_old_commented is True,
        or removed.
        """
        import difflib
        similar_pool = {}
        if use_similar > 0.0:
            for key, msg in self.msgs.items():
                if msg.msgstr:  # No need to waste time with void translations!
                    similar_pool.setdefault(key[1], set()).add(key)
        msgs = self._new_messages()
        for (key, msg) in ref.msgs.items():
            if key in self.msgs:
                msgs[key] = self.msgs[key]
                msgs[key].sources = msg.sources
            else:
                skey = None
                if use_similar > 0.0:
                    # try to find some close key in existing messages...
                    tmp = difflib.get_close_matches(key[1], similar_pool, n=1, cutoff=use_similar)
                    if tmp:
                        tmp = tmp[0]
        msgs = self._new_messages().fromkeys(ref.msgs.keys())
        ref_keys = set(ref.msgs.keys())
        org_keys = set(self.msgs.keys())
        new_keys = ref_keys - org_keys
        removed_keys = org_keys - ref_keys
        print(new_keys, "\n\n", removed_keys)

        # First process keys present in both org and ref messages.
        for key in ref_keys - new_keys:
            msg, refmsg = self.msgs[key], ref.msgs[key]
            msg.sources = refmsg.sources
            msg.is_commented = refmsg.is_commented
            msg.is_fuzzy = refmsg.is_fuzzy
            msgs[key] = msg

        # Next process new keys.
        if use_similar > 0.0:
            with concurrent.futures.ProcessPoolExecutor() as exctr:
                for key, msgid in exctr.map(get_best_similar,
                                            tuple((nk, use_similar, tuple(similar_pool.keys())) for nk in new_keys)):
                    if msgid:
                        # Try to get the same context, else just get one...
                        skey = (key[0], tmp)
                        if skey not in similar_pool[tmp]:
                            skey = tuple(similar_pool[tmp])[0]
                msgs[key] = msg
                if skey:
                    msgs[key].msgstr = self.msgs[skey].msgstr
                    msgs[key].is_fuzzy = True
                        skey = (key[0], msgid)
                        if skey not in similar_pool[msgid]:
                            skey = tuple(similar_pool[msgid])[0]
                        # We keep org translation and comments, and mark message as fuzzy.
                        msg, refmsg = copy.deepcopy(self.msgs[skey]), ref.msgs[key]
                        msg.msgctxt = refmsg.msgctxt
                        msg.msgid = refmsg.msgid
                        msg.sources = refmsg.sources
                        msg.is_fuzzy = True
                        msg.is_commented = refmsg.is_commented
                        msgs[key] = msg
                    else:
                        msgs[key] = ref.msgs[key]
        else:
            for key in new_keys:
                msgs[key] = ref.msgs[key]

        # Add back all "old" and already commented messages as commented ones, if required
        # (and translation was not void!).
        if keep_old_commented:
            for key, msg in self.msgs.items():
                if key not in msgs and msg.msgstr:
                    msgs[key] = msg
                    msgs[key].is_commented = True
            for key in removed_keys:
                msgs[key] = self.msgs[key]
                msgs[key].is_commented = True
                msgs[key].sources = []

        # Special 'meta' message, change project ID version and pot creation date...
        key = ("", "")
        rep = []
        markers = ("Project-Id-Version:", "POT-Creation-Date:")
        for mrk in markers:
            for rl in ref.msgs[key].msgstr_lines:
                if rl.startswith(mrk):
                    for idx, ml in enumerate(msgs[key].msgstr_lines):
                        if ml.startswith(mrk):
                            rep.append((idx, rl))
        for idx, txt in rep:
            msgs[key].msgstr_lines[idx] = txt

        # And finalize the update!
        self.msgs = msgs
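To see what the reworked update() amounts to end to end, here is a toy, dict-based model of the three cases it handles (keys present in both, brand-new keys, keys gone from the pot). It uses plain difflib.get_close_matches() for brevity instead of the parallel get_best_similar() pass, and every name and sample string is illustrative only:

import difflib

old = {"Render": "Rendu", "Bake Ambient Occlusion": "Précalculer l'occlusion ambiante"}  # existing po
ref = {"Render": None, "Bake ambient occlusion": None, "Freestyle": None}  # msgids of the new pot

merged, use_similar = {}, 0.75
for key in ref:
    if key in old:  # present in both: keep the existing translation as-is
        merged[key] = old[key]
    else:  # brand-new key: seed it from the closest old msgid and mark it fuzzy
        close = difflib.get_close_matches(key, old, n=1, cutoff=use_similar)
        merged[key] = ("FUZZY: " + old[close[0]]) if close else ""
for key in old.keys() - ref.keys():  # gone from the pot: keep it around, but commented out
    merged["#~ " + key] = old[key]

print(merged)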