Update how similar-message matching is handled when updating po files from the pot one (gains roughly 20% in heavy update situations, and saves a nice amount of memory!).
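The core of the change is the new get_best_similar() helper shown further down: instead of running difflib.get_close_matches() over the whole pool of existing msgids, it keeps only the single best match, pre-filters candidates by length, and uses SequenceMatcher's cheap ratio bounds before paying for the full ratio(). A minimal, self-contained sketch of that strategy (the function name best_similar_msgid, the cutoff value and the sample strings are illustrative only, not taken from the commit):

import difflib

def best_similar_msgid(msgid, candidates, cutoff=0.75):
    """Return the candidate closest to msgid (or None), using the same length / quick-ratio pre-filters."""
    best = None
    s = difflib.SequenceMatcher()
    s.set_seq2(msgid)  # SequenceMatcher caches info about seq2, so set it only once
    min_len, max_len = len(msgid) // 2, len(msgid) * 2
    for cand in candidates:
        if not (min_len < len(cand) < max_len):
            continue  # length alone makes a ~0.7+ ratio impossible, skip the expensive checks
        s.set_seq1(cand)
        # Cheap upper bounds first; the costly ratio() only runs when both pass the cutoff.
        if s.real_quick_ratio() >= cutoff and s.quick_ratio() >= cutoff:
            ratio = s.ratio()
            if ratio >= cutoff:
                best, cutoff = cand, ratio  # raise the bar so later candidates must match at least as well
    return best

# e.g. picks the case-changed msgid as the closest existing message:
print(best_similar_msgid("Bake ambient occlusion", ["Bake Ambient Occlusion", "Bake Normals", "Render"]))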

Bastien Montagne
2013-02-12 17:32:54 +00:00
parent dcea2800a7
commit f2d9fc7e25
4 changed files with 96 additions and 29 deletions


@@ -387,7 +387,7 @@ def dump_py_messages_from_files(messages, check_ctxt, files):
estr_ls.append(estr)
nds_ls.extend(nds)
ret = _extract_string_merge(estr_ls, nds_ls)
print(ret)
#print(ret)
return ret
def extract_strings_split(node):


@@ -225,6 +225,7 @@ dict_uimsgs = {
"loc", "rot", "pos",
"lorem",
"luma",
"mem",
"multicam",
"num",
"ok",


@@ -97,10 +97,13 @@ def main():
        if os.path.exists(po):
            pool_data.append((po, lang, pot_msgs))
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for r in executor.map(process_po, pool_data, timeout=600):
            if r != 0:
                ret = r
    for r in map(process_po, pool_data):
        if r != 0:
            ret = r
    #with concurrent.futures.ProcessPoolExecutor() as executor:
        #for r in executor.map(process_po, pool_data, timeout=600):
            #if r != 0:
                #ret = r
    return ret


@@ -21,6 +21,7 @@
# Some misc utilities...
import collections
import concurrent.futures
import copy
import os
import re
@@ -61,6 +62,35 @@ def is_valid_po_path(path):
    return bool(_valid_po_path_re.match(path))


def get_best_similar(data):
    import difflib
    key, use_similar, similar_pool = data
    # try to find some close key in existing messages...
    # Optimized code inspired by difflib.get_close_matches (as we only need the best match).
    # We also never make a match when len differs by more than -len_key / 2 or +len_key * 2 (which is valid
    # as long as use_similar is not below ~0.7).
    # Gives an overall ~20% of improvement!
    #tmp = difflib.get_close_matches(key[1], similar_pool, n=1, cutoff=use_similar)
    #if tmp:
        #tmp = tmp[0]
    tmp = None
    s = difflib.SequenceMatcher()
    s.set_seq2(key[1])
    len_key = len(key[1])
    min_len = len_key // 2
    max_len = len_key * 2
    for x in similar_pool:
        if min_len < len(x) < max_len:
            s.set_seq1(x)
            if s.real_quick_ratio() >= use_similar and s.quick_ratio() >= use_similar:
                sratio = s.ratio()
                if sratio >= use_similar:
                    tmp = x
                    use_similar = sratio
    return key, tmp
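As a sanity check on the length bounds used in get_best_similar() (my own arithmetic, not part of the commit): SequenceMatcher.ratio() is 2 * M / (len(a) + len(b)) with M at most min(len(a), len(b)), so once one string is more than twice as long as the other the ratio can never exceed 2/3, which is safely below any use_similar cutoff of ~0.7 or higher:

# Best possible ratio() when the candidate is exactly twice as long as the key (i.e. M == len_key):
len_key = 40
best_possible = 2 * len_key / (len_key + 2 * len_key)
print(best_possible)  # 0.666..., so skipping such candidates cannot lose a valid match when the cutoff is >= ~0.7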
class I18nMessage:
"""
Internal representation of a message.
@@ -233,40 +263,73 @@ class I18nMessages:
        existing one. Messages no longer found in ref will be marked as commented if keep_old_commented is True,
        or removed.
        """
        import difflib
        similar_pool = {}
        if use_similar > 0.0:
            for key, msg in self.msgs.items():
                if msg.msgstr:  # No need to waste time with void translations!
                    similar_pool.setdefault(key[1], set()).add(key)
        msgs = self._new_messages()
        for (key, msg) in ref.msgs.items():
            if key in self.msgs:
                msgs[key] = self.msgs[key]
                msgs[key].sources = msg.sources
            else:
                skey = None
                if use_similar > 0.0:
                    # try to find some close key in existing messages...
                    tmp = difflib.get_close_matches(key[1], similar_pool, n=1, cutoff=use_similar)
                    if tmp:
                        tmp = tmp[0]
        msgs = self._new_messages().fromkeys(ref.msgs.keys())
        ref_keys = set(ref.msgs.keys())
        org_keys = set(self.msgs.keys())
        new_keys = ref_keys - org_keys
        removed_keys = org_keys - ref_keys
        print(new_keys, "\n\n", removed_keys)

        # First process keys present in both org and ref messages.
        for key in ref_keys - new_keys:
            msg, refmsg = self.msgs[key], ref.msgs[key]
            msg.sources = refmsg.sources
            msg.is_commented = refmsg.is_commented
            msg.is_fuzzy = refmsg.is_fuzzy
            msgs[key] = msg

        # Next process new keys.
        if use_similar > 0.0:
            with concurrent.futures.ProcessPoolExecutor() as exctr:
                for key, msgid in exctr.map(get_best_similar,
                                            tuple((nk, use_similar, tuple(similar_pool.keys())) for nk in new_keys)):
                    if msgid:
                        # Try to get the same context, else just get one...
                        skey = (key[0], tmp)
                        if skey not in similar_pool[tmp]:
                            skey = tuple(similar_pool[tmp])[0]
                msgs[key] = msg
                if skey:
                    msgs[key].msgstr = self.msgs[skey].msgstr
                    msgs[key].is_fuzzy = True
                        skey = (key[0], msgid)
                        if skey not in similar_pool[msgid]:
                            skey = tuple(similar_pool[msgid])[0]
                        # We keep org translation and comments, and mark message as fuzzy.
                        msg, refmsg = copy.deepcopy(self.msgs[skey]), ref.msgs[key]
                        msg.msgctxt = refmsg.msgctxt
                        msg.msgid = refmsg.msgid
                        msg.sources = refmsg.sources
                        msg.is_fuzzy = True
                        msg.is_commented = refmsg.is_commented
                        msgs[key] = msg
                    else:
                        msgs[key] = ref.msgs[key]
        else:
            for key in new_keys:
                msgs[key] = ref.msgs[key]

        # Add back all "old" and already commented messages as commented ones, if required
        # (and translation was not void!).
        if keep_old_commented:
            for key, msg in self.msgs.items():
                if key not in msgs and msg.msgstr:
                    msgs[key] = msg
                    msgs[key].is_commented = True
            for key in removed_keys:
                msgs[key] = self.msgs[key]
                msgs[key].is_commented = True
                msgs[key].sources = []

        # Special 'meta' message, change project ID version and pot creation date...
        key = ("", "")
        rep = []
        markers = ("Project-Id-Version:", "POT-Creation-Date:")
        for mrk in markers:
            for rl in ref.msgs[key].msgstr_lines:
                if rl.startswith(mrk):
                    for idx, ml in enumerate(msgs[key].msgstr_lines):
                        if ml.startswith(mrk):
                            rep.append((idx, rl))
        for idx, txt in rep:
            msgs[key].msgstr_lines[idx] = txt

        # And finalize the update!
        self.msgs = msgs
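To see what the reworked update() amounts to end to end, here is a toy, dict-based model of the three cases it handles (keys present in both, brand-new keys, keys gone from the pot). It uses plain difflib.get_close_matches() for brevity instead of the parallel get_best_similar() pass, and every name and sample string is illustrative only:

import difflib

old = {"Render": "Rendu", "Bake Ambient Occlusion": "Précalculer l'occlusion ambiante"}  # existing po
ref = {"Render": None, "Bake ambient occlusion": None, "Freestyle": None}  # msgids of the new pot

merged, use_similar = {}, 0.75
for key in ref:
    if key in old:  # present in both: keep the existing translation as-is
        merged[key] = old[key]
    else:  # brand-new key: seed it from the closest old msgid and mark it fuzzy
        close = difflib.get_close_matches(key, old, n=1, cutoff=use_similar)
        merged[key] = ("FUZZY: " + old[close[0]]) if close else ""
for key in old.keys() - ref.keys():  # gone from the pot: keep it around, but commented out
    merged["#~ " + key] = old[key]

print(merged)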