load the Chinese character mapping from a .msgpack.gz file

Rob Speer 2015-09-22 16:31:50 -04:00
parent 06f8b29971
commit 6cf4210187
5 changed files with 25 additions and 3294 deletions

View File

@@ -1,20 +1,21 @@
 """
-Generate a Python file, _chinese_mapping.py, that maps Traditional Chinese
-characters to their Simplified Chinese equivalents.
+Generate a msgpack file, _chinese_mapping.msgpack.gz, that maps Traditional
+Chinese characters to their Simplified Chinese equivalents.
 
 This is meant to be a normalization of text, somewhat like case-folding -- not
 an actual translator, a task for which this method would be unsuitable. We
-store word frequencies using Simplified Chinese characters so that, in the large
-number of cases where a Traditional Chinese word has an obvious Simplified Chinese
-mapping, we can get a frequency for it that's the same in Simplified and Traditional
-Chinese.
+store word frequencies using Simplified Chinese characters so that, in the
+large number of cases where a Traditional Chinese word has an obvious
+Simplified Chinese mapping, we can get a frequency for it that's the same in
+Simplified and Traditional Chinese.
 
 Generating this mapping requires the external Chinese conversion tool OpenCC.
 """
 import unicodedata
 import itertools
 import os
-import pprint
+import msgpack
+import gzip
 
 
 def make_hanzi_table(filename):

@@ -25,7 +26,7 @@ def make_hanzi_table(filename):
                 print('%5X\t%s' % (codept, char), file=out)
 
 
-def make_hanzi_converter(table_in, python_out):
+def make_hanzi_converter(table_in, msgpack_out):
     table = {}
     with open(table_in, encoding='utf-8') as infile:
         for line in infile:

@@ -34,15 +35,14 @@ def make_hanzi_converter(table_in, python_out):
             assert len(char) == 1
             if chr(codept) != char:
                 table[codept] = char
-    with open(python_out, 'w', encoding='utf-8') as outfile:
-        print('SIMPLIFIED_MAP = ', end='', file=outfile)
-        pprint.pprint(table, stream=outfile)
+    with gzip.open(msgpack_out, 'wb') as outfile:
+        msgpack.dump(table, outfile, encoding='utf-8')
 
 
 def build():
     make_hanzi_table('/tmp/han_in.txt')
     os.system('opencc -c zht2zhs.ini < /tmp/han_in.txt > /tmp/han_out.txt')
-    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.py')
+    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.msgpack.gz')
 
 
 if __name__ == '__main__':

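The converter now serializes the codepoint-to-character table as gzipped msgpack rather than emitting a generated Python module. Below is a minimal round-trip sketch of that storage format, using illustrative entries and the current msgpack-python API (where use_bin_type/raw/strict_map_key replace the encoding= argument shown in the diff):

import gzip
import msgpack

# Illustrative codepoint -> simplified-character entries, shaped like the
# table that make_hanzi_converter() builds from the OpenCC output.
table = {0x9EBC: '么', 0x9EB5: '面'}   # 麼 -> 么, 麵 -> 面

# Write the table as gzipped msgpack. Current msgpack-python has no
# encoding= argument; use_bin_type=True makes strings round-trip as str.
with gzip.open('/tmp/_chinese_mapping.msgpack.gz', 'wb') as outfile:
    outfile.write(msgpack.packb(table, use_bin_type=True))

# Read it back; strict_map_key=False allows the integer codepoint keys.
with gzip.open('/tmp/_chinese_mapping.msgpack.gz', 'rb') as infile:
    loaded = msgpack.unpackb(infile.read(), raw=False, strict_map_key=False)

assert loaded == table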
View File

@@ -85,10 +85,11 @@ def available_languages(wordlist='combined'):
     """
     available = {}
     for path in DATA_PATH.glob('*.msgpack.gz'):
-        list_name = path.name.split('.')[0]
-        name, lang = list_name.split('_')
-        if name == wordlist:
-            available[lang] = str(path)
+        if not path.name.startswith('_'):
+            list_name = path.name.split('.')[0]
+            name, lang = list_name.split('_')
+            if name == wordlist:
+                available[lang] = str(path)
     return available

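The added startswith('_') check keeps the new mapping file from being reported as a wordlist: available_languages() parses names of the form <wordlist>_<lang>.msgpack.gz, and a leading underscore marks auxiliary data files. A self-contained sketch of that filter over hypothetical file names:

from pathlib import Path

# Hypothetical file names following the '<wordlist>_<lang>.msgpack.gz'
# convention, plus the new underscore-prefixed auxiliary file.
names = ['combined_en.msgpack.gz', 'combined_zh.msgpack.gz',
         '_chinese_mapping.msgpack.gz']

wordlist = 'combined'
available = {}
for path in (Path(n) for n in names):
    if not path.name.startswith('_'):         # skip auxiliary data files
        list_name = path.name.split('.')[0]    # e.g. 'combined_en'
        name, lang = list_name.split('_')
        if name == wordlist:
            available[lang] = str(path)

print(available)   # {'en': 'combined_en.msgpack.gz', 'zh': 'combined_zh.msgpack.gz'}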
File diff suppressed because it is too large

View File

@@ -1,14 +1,19 @@
 from pkg_resources import resource_filename
-from wordfreq._chinese_mapping import SIMPLIFIED_MAP
 import jieba
+import msgpack
+import gzip
 
 
 jieba_tokenizer = None
+simplified_map = None
 DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
+SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
 
 
 def simplify_chinese(text):
-    return text.translate(SIMPLIFIED_MAP).casefold()
+    global simplified_map
+    if simplified_map is None:
+        simplified_map = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8')
+    return text.translate(simplified_map).casefold()
 
 
 def jieba_tokenize(text):

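simplify_chinese() now loads the mapping lazily on first use and hands it to str.translate(), which accepts exactly this shape: a dict keyed by integer codepoints whose values are replacement strings. A small sketch of the lookup with illustrative entries rather than the real table:

# Illustrative entries only; the real table is read from the
# gzipped msgpack data file on first use.
simplified_map = {0x9EBC: '么', 0x9EB5: '面'}

def simplify(text):
    # str.translate() maps each codepoint through the dict; casefold()
    # mirrors the normalization done by simplify_chinese().
    return text.translate(simplified_map).casefold()

print(simplify('炒麵'))   # -> '炒面'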
Binary file not shown.