load the Chinese character mapping from a .msgpack.gz file

Former-commit-id: 6cf4210187
2024-12-23 09:21:37 +00:00 · 2015-09-22 16:31:50 -04:00 · 2015-09-22 16:31:50 -04:00 · db30d09947
commit db30d09947
parent fe8a6b51e7
5 changed files with 25 additions and 3294 deletions
--- a/scripts/make_chinese_mapping.py
+++ b/scripts/make_chinese_mapping.py
@@ -1,20 +1,21 @@
 """
-Generate a Python file, _chinese_mapping.py, that maps Traditional Chinese
-characters to their Simplified Chinese equivalents.
+Generate a msgpack file, _chinese_mapping.msgpack.gz, that maps Traditional
+Chinese characters to their Simplified Chinese equivalents.

 This is meant to be a normalization of text, somewhat like case-folding -- not
 an actual translator, a task for which this method would be unsuitable. We
-store word frequencies using Simplified Chinese characters so that, in the large
-number of cases where a Traditional Chinese word has an obvious Simplified Chinese
-mapping, we can get a frequency for it that's the same in Simplified and Traditional
-Chinese.
+store word frequencies using Simplified Chinese characters so that, in the
+large number of cases where a Traditional Chinese word has an obvious
+Simplified Chinese mapping, we can get a frequency for it that's the same in
+Simplified and Traditional Chinese.

 Generating this mapping requires the external Chinese conversion tool OpenCC.
 """
 import unicodedata
 import itertools
 import os
-import pprint
+import msgpack
+import gzip


 def make_hanzi_table(filename):
@@ -25,7 +26,7 @@ def make_hanzi_table(filename):
                print('%5X\t%s' % (codept, char), file=out)


-def make_hanzi_converter(table_in, python_out):
+def make_hanzi_converter(table_in, msgpack_out):
    table = {}
    with open(table_in, encoding='utf-8') as infile:
        for line in infile:
@@ -34,15 +35,14 @@ def make_hanzi_converter(table_in, python_out):
            assert len(char) == 1
            if chr(codept) != char:
                table[codept] = char
-    with open(python_out, 'w', encoding='utf-8') as outfile:
-        print('SIMPLIFIED_MAP = ', end='', file=outfile)
-        pprint.pprint(table, stream=outfile)
+    with gzip.open(msgpack_out, 'wb') as outfile:
+        msgpack.dump(table, outfile, encoding='utf-8')


 def build():
    make_hanzi_table('/tmp/han_in.txt')
    os.system('opencc -c zht2zhs.ini < /tmp/han_in.txt > /tmp/han_out.txt')
-    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.py')
+    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.msgpack.gz')


 if __name__ == '__main__':
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -85,10 +85,11 @@ def available_languages(wordlist='combined'):
    """
    available = {}
    for path in DATA_PATH.glob('*.msgpack.gz'):
-        list_name = path.name.split('.')[0]
-        name, lang = list_name.split('_')
-        if name == wordlist:
-            available[lang] = str(path)
+        if not path.name.startswith('_'):
+            list_name = path.name.split('.')[0]
+            name, lang = list_name.split('_')
+            if name == wordlist:
+                available[lang] = str(path)
    return available


--- a/wordfreq/_chinese_mapping.py
+++ b/wordfreq/_chinese_mapping.py
--- a/wordfreq/chinese.py
+++ b/wordfreq/chinese.py
@@ -1,14 +1,19 @@
 from pkg_resources import resource_filename
-from wordfreq._chinese_mapping import SIMPLIFIED_MAP
 import jieba
-
+import msgpack
+import gzip

 jieba_tokenizer = None
+simplified_map = None
 DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
+SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')


 def simplify_chinese(text):
-    return text.translate(SIMPLIFIED_MAP).casefold()
+    global simplified_map
+    if simplified_map is None:
+        simplified_map = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8')
+    return text.translate(simplified_map).casefold()


 def jieba_tokenize(text):
--- a/wordfreq/data/_chinese_mapping.msgpack.gz
+++ b/wordfreq/data/_chinese_mapping.msgpack.gz