load the Chinese character mapping from a .msgpack.gz file

Rob Speer 2015-09-22 16:31:50 -04:00
parent 06f8b29971
commit 6cf4210187
5 changed files with 25 additions and 3294 deletions

View File

@@ -1,20 +1,21 @@
 """
-Generate a Python file, _chinese_mapping.py, that maps Traditional Chinese
-characters to their Simplified Chinese equivalents.
+Generate a msgpack file, _chinese_mapping.msgpack.gz, that maps Traditional
+Chinese characters to their Simplified Chinese equivalents.
 
 This is meant to be a normalization of text, somewhat like case-folding -- not
 an actual translator, a task for which this method would be unsuitable. We
-store word frequencies using Simplified Chinese characters so that, in the large
-number of cases where a Traditional Chinese word has an obvious Simplified Chinese
-mapping, we can get a frequency for it that's the same in Simplified and Traditional
-Chinese.
+store word frequencies using Simplified Chinese characters so that, in the
+large number of cases where a Traditional Chinese word has an obvious
+Simplified Chinese mapping, we can get a frequency for it that's the same in
+Simplified and Traditional Chinese.
 
 Generating this mapping requires the external Chinese conversion tool OpenCC.
 """
 
 import unicodedata
 import itertools
 import os
-import pprint
+import msgpack
+import gzip
 
 def make_hanzi_table(filename):
@@ -25,7 +26,7 @@ def make_hanzi_table(filename):
             print('%5X\t%s' % (codept, char), file=out)
 
 
-def make_hanzi_converter(table_in, python_out):
+def make_hanzi_converter(table_in, msgpack_out):
     table = {}
     with open(table_in, encoding='utf-8') as infile:
         for line in infile:
@@ -34,15 +35,14 @@ def make_hanzi_converter(table_in, python_out):
             assert len(char) == 1
             if chr(codept) != char:
                 table[codept] = char
-    with open(python_out, 'w', encoding='utf-8') as outfile:
-        print('SIMPLIFIED_MAP = ', end='', file=outfile)
-        pprint.pprint(table, stream=outfile)
+    with gzip.open(msgpack_out, 'wb') as outfile:
+        msgpack.dump(table, outfile, encoding='utf-8')
 
 
 def build():
     make_hanzi_table('/tmp/han_in.txt')
     os.system('opencc -c zht2zhs.ini < /tmp/han_in.txt > /tmp/han_out.txt')
-    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.py')
+    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.msgpack.gz')
 
 
 if __name__ == '__main__':
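
A minimal sketch (not part of this commit) of the round-trip the script above now performs: the {codepoint: character} table is msgpack-serialized inside a gzip stream, and the loaded dict is exactly the kind of mapping str.translate() accepts. The keyword arguments use_bin_type, raw, and strict_map_key are the modern msgpack API, standing in for the encoding='utf-8' parameter the 2015 library took; the file path is made up for the demo.

import gzip
import msgpack

# Tiny stand-in for the real OpenCC-derived table: codepoint -> simplified char.
table = {ord('萬'): '万', ord('體'): '体'}

with gzip.open('/tmp/mapping_demo.msgpack.gz', 'wb') as outfile:
    msgpack.dump(table, outfile, use_bin_type=True)

with gzip.open('/tmp/mapping_demo.msgpack.gz', 'rb') as infile:
    # strict_map_key=False lets msgpack >= 1.0 deserialize integer dict keys.
    loaded = msgpack.load(infile, raw=False, strict_map_key=False)

print('萬體'.translate(loaded))  # -> '万体'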

View File

@@ -85,10 +85,11 @@ def available_languages(wordlist='combined'):
     """
     available = {}
     for path in DATA_PATH.glob('*.msgpack.gz'):
-        list_name = path.name.split('.')[0]
-        name, lang = list_name.split('_')
-        if name == wordlist:
-            available[lang] = str(path)
+        if not path.name.startswith('_'):
+            list_name = path.name.split('.')[0]
+            name, lang = list_name.split('_')
+            if name == wordlist:
+                available[lang] = str(path)
     return available
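
A quick illustration (not in the commit) of why the new startswith('_') guard is needed: wordlist files are named <wordlist>_<lang>.msgpack.gz, and the underscore-prefixed mapping file this commit adds to the data directory would otherwise break the tuple unpacking.

def parse_wordlist_name(filename):
    # Mirrors the parsing above: 'combined_en.msgpack.gz' -> ('combined', 'en')
    list_name = filename.split('.')[0]
    name, lang = list_name.split('_')
    return name, lang

print(parse_wordlist_name('combined_en.msgpack.gz'))  # ('combined', 'en')

# Without the guard: '_chinese_mapping'.split('_') == ['', 'chinese', 'mapping'],
# so the two-name unpacking raises "ValueError: too many values to unpack".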

File diff suppressed because it is too large

View File

@@ -1,14 +1,19 @@
 from pkg_resources import resource_filename
-from wordfreq._chinese_mapping import SIMPLIFIED_MAP
 import jieba
+import msgpack
+import gzip
 
 
 jieba_tokenizer = None
+simplified_map = None
 DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
+SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
 
 
 def simplify_chinese(text):
-    return text.translate(SIMPLIFIED_MAP).casefold()
+    global simplified_map
+    if simplified_map is None:
+        simplified_map = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8')
+    return text.translate(simplified_map).casefold()
 
 
 def jieba_tokenize(text):
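
A usage sketch, assuming this hunk is wordfreq/chinese.py (the resource_filename() calls suggest so): the first call pays the gzip + msgpack cost once, and every later call reuses the cached module-level dict.

from wordfreq.chinese import simplify_chinese

print(simplify_chinese('這是繁體字'))  # expected: '这是繁体字'
print(simplify_chinese('WORD頻率'))    # expected: 'word频率' (casefold applies too)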

Binary file not shown.