Update how similar-message matching is handled when updating po files from the pot file (gains something like 20% in heavy update situations, and saves a nice bunch of memory!).
This commit is contained in:
@@ -387,7 +387,7 @@ def dump_py_messages_from_files(messages, check_ctxt, files):
|
||||
estr_ls.append(estr)
|
||||
nds_ls.extend(nds)
|
||||
ret = _extract_string_merge(estr_ls, nds_ls)
|
||||
print(ret)
|
||||
#print(ret)
|
||||
return ret
|
||||
|
||||
def extract_strings_split(node):
|
||||
|
@@ -225,6 +225,7 @@ dict_uimsgs = {
|
||||
"loc", "rot", "pos",
|
||||
"lorem",
|
||||
"luma",
|
||||
"mem",
|
||||
"multicam",
|
||||
"num",
|
||||
"ok",
|
||||
|
@@ -97,10 +97,13 @@ def main():
|
||||
if os.path.exists(po):
|
||||
pool_data.append((po, lang, pot_msgs))
|
||||
|
||||
with concurrent.futures.ProcessPoolExecutor() as executor:
|
||||
for r in executor.map(process_po, pool_data, timeout=600):
|
||||
for r in map(process_po, pool_data):
|
||||
if r != 0:
|
||||
ret = r
|
||||
#with concurrent.futures.ProcessPoolExecutor() as executor:
|
||||
#for r in executor.map(process_po, pool_data, timeout=600):
|
||||
#if r != 0:
|
||||
#ret = r
|
||||
|
||||
return ret
|
||||
|
||||
|
@@ -21,6 +21,7 @@
|
||||
# Some misc utilities...
|
||||
|
||||
import collections
|
||||
import concurrent.futures
|
||||
import copy
|
||||
import os
|
||||
import re
|
||||
@@ -61,6 +62,35 @@ def is_valid_po_path(path):
|
||||
return bool(_valid_po_path_re.match(path))
|
||||
|
||||
|
||||
def get_best_similar(data):
    """
    Find the string of a pool that is the most similar to a message id.

    Takes a single packed argument so that it can be used directly as the callable of a map()
    (e.g. an executor's one).

    data is a (key, use_similar, similar_pool) tuple:
        key: the message key; only key[1] (the msgid) is compared against the pool.
        use_similar: minimal similarity ratio (as computed by difflib.SequenceMatcher.ratio()) a candidate
                     must reach to be accepted.
        similar_pool: iterable of candidate strings.

    Return a (key, best) tuple, best being the candidate with the highest ratio (the last one found in case
    of equality), or None when no candidate reaches use_similar. Empty strings (msgid or candidates) never match.
    """
    import difflib

    key, use_similar, similar_pool = data
    msgid = key[1]
    len_key = len(msgid)
    best = None
    if not len_key:
        # Nothing is considered similar to an empty msgid.
        return key, best

    # Optimized code inspired by difflib.get_close_matches (as we only need the best match).
    # SequenceMatcher caches its analysis of the second sequence, so the msgid is set once as seq2,
    # and each candidate is set in turn as seq1.
    s = difflib.SequenceMatcher()
    s.set_seq2(msgid)
    for x in similar_pool:
        len_x = len(x)
        if not len_x:
            continue
        # Cheap length-only upper bound of the ratio (same value as s.real_quick_ratio()), checked before
        # even handing the candidate to the matcher. Unlike a fixed [len_key / 2, len_key * 2] window,
        # this remains valid whatever the value of use_similar, and gets stricter as better matches are found.
        if 2.0 * min(len_x, len_key) / (len_x + len_key) < use_similar:
            continue
        s.set_seq1(x)
        # quick_ratio() is another (less cheap) upper bound, only compute the real ratio when it passes.
        if s.quick_ratio() >= use_similar:
            sratio = s.ratio()
            if sratio >= use_similar:
                best = x
                # From now on, only accept candidates at least as good as this one.
                use_similar = sratio
                if sratio == 1.0:
                    # Perfect match, no other candidate can do better.
                    break
    return key, best
|
||||
|
||||
|
||||
class I18nMessage:
|
||||
"""
|
||||
Internal representation of a message.
|
||||
@@ -233,40 +263,73 @@ class I18nMessages:
|
||||
existing one. Messages no more found in ref will be marked as commented if keep_old_commented is True,
|
||||
or removed.
|
||||
"""
|
||||
import difflib
|
||||
similar_pool = {}
|
||||
if use_similar > 0.0:
|
||||
for key, msg in self.msgs.items():
|
||||
if msg.msgstr: # No need to waste time with void translations!
|
||||
similar_pool.setdefault(key[1], set()).add(key)
|
||||
|
||||
msgs = self._new_messages()
|
||||
for (key, msg) in ref.msgs.items():
|
||||
if key in self.msgs:
|
||||
msgs[key] = self.msgs[key]
|
||||
msgs[key].sources = msg.sources
|
||||
else:
|
||||
skey = None
|
||||
if use_similar > 0.0:
|
||||
# try to find some close key in existing messages...
|
||||
tmp = difflib.get_close_matches(key[1], similar_pool, n=1, cutoff=use_similar)
|
||||
if tmp:
|
||||
tmp = tmp[0]
|
||||
# Try to get the same context, else just get one...
|
||||
skey = (key[0], tmp)
|
||||
if skey not in similar_pool[tmp]:
|
||||
skey = tuple(similar_pool[tmp])[0]
|
||||
msgs = self._new_messages().fromkeys(ref.msgs.keys())
|
||||
ref_keys = set(ref.msgs.keys())
|
||||
org_keys = set(self.msgs.keys())
|
||||
new_keys = ref_keys - org_keys
|
||||
removed_keys = org_keys - ref_keys
|
||||
|
||||
print(new_keys, "\n\n", removed_keys)
|
||||
|
||||
# First process keys present in both org and ref messages.
|
||||
for key in ref_keys - new_keys:
|
||||
msg, refmsg = self.msgs[key], ref.msgs[key]
|
||||
msg.sources = refmsg.sources
|
||||
msg.is_commented = refmsg.is_commented
|
||||
msg.is_fuzzy = refmsg.is_fuzzy
|
||||
msgs[key] = msg
|
||||
if skey:
|
||||
msgs[key].msgstr = self.msgs[skey].msgstr
|
||||
msgs[key].is_fuzzy = True
|
||||
|
||||
# Next process new keys.
|
||||
if use_similar > 0.0:
|
||||
with concurrent.futures.ProcessPoolExecutor() as exctr:
|
||||
for key, msgid in exctr.map(get_best_similar,
|
||||
tuple((nk, use_similar, tuple(similar_pool.keys())) for nk in new_keys)):
|
||||
if msgid:
|
||||
# Try to get the same context, else just get one...
|
||||
skey = (key[0], msgid)
|
||||
if skey not in similar_pool[msgid]:
|
||||
skey = tuple(similar_pool[msgid])[0]
|
||||
# We keep org translation and comments, and mark message as fuzzy.
|
||||
msg, refmsg = copy.deepcopy(self.msgs[skey]), ref.msgs[key]
|
||||
msg.msgctxt = refmsg.msgctxt
|
||||
msg.msgid = refmsg.msgid
|
||||
msg.sources = refmsg.sources
|
||||
msg.is_fuzzy = True
|
||||
msg.is_commented = refmsg.is_commented
|
||||
msgs[key] = msg
|
||||
else:
|
||||
msgs[key] = ref.msgs[key]
|
||||
else:
|
||||
for key in new_keys:
|
||||
msgs[key] = ref.msgs[key]
|
||||
|
||||
# Add back all "old" and already commented messages as commented ones, if required
|
||||
# (and translation was not void!).
|
||||
if keep_old_commented:
|
||||
for key, msg in self.msgs.items():
|
||||
if key not in msgs and msg.msgstr:
|
||||
msgs[key] = msg
|
||||
for key in removed_keys:
|
||||
msgs[key] = self.msgs[key]
|
||||
msgs[key].is_commented = True
|
||||
msgs[key].sources = []
|
||||
|
||||
# Special 'meta' message, change project ID version and pot creation date...
|
||||
key = ("", "")
|
||||
rep = []
|
||||
markers = ("Project-Id-Version:", "POT-Creation-Date:")
|
||||
for mrk in markers:
|
||||
for rl in ref.msgs[key].msgstr_lines:
|
||||
if rl.startswith(mrk):
|
||||
for idx, ml in enumerate(msgs[key].msgstr_lines):
|
||||
if ml.startswith(mrk):
|
||||
rep.append((idx, rl))
|
||||
for idx, txt in rep:
|
||||
msgs[key].msgstr_lines[idx] = txt
|
||||
|
||||
# And finalize the update!
|
||||
self.msgs = msgs
|
||||
|
||||
|
Reference in New Issue
Block a user