# wordfreq/scripts/make_chinese_mapping.py
"""
Generate a msgpack file, _chinese_mapping.msgpack.gz, that maps Traditional
Chinese characters to their Simplified Chinese equivalents.
This is meant to be a normalization of text, somewhat like case-folding -- not
an actual translator, a task for which this method would be unsuitable. We
store word frequencies using Simplified Chinese characters so that, in the
large number of cases where a Traditional Chinese word has an obvious
Simplified Chinese mapping, we can get a frequency for it that's the same in
Simplified and Traditional Chinese.
Generating this mapping requires the external Chinese conversion tool OpenCC.
"""
import gzip
import itertools
import os
import subprocess
import unicodedata

import msgpack
def make_hanzi_table(filename):
    """Write a tab-separated table of assigned codepoints for OpenCC to convert.

    Each output line holds the codepoint as 5-character hex and the character
    itself.  The ranges cover the main CJK area, the compatibility ideographs,
    and the supplementary ideographic planes; unassigned codepoints (Unicode
    category 'Cn') are skipped.
    """
    code_ranges = (
        range(0x3400, 0xa000),
        range(0xf900, 0xfb00),
        range(0x20000, 0x30000),
    )
    with open(filename, 'w', encoding='utf-8') as out:
        for code in itertools.chain.from_iterable(code_ranges):
            ch = chr(code)
            if unicodedata.category(ch) == 'Cn':
                continue
            print('%5X\t%s' % (code, ch), file=out)
def make_hanzi_converter(table_in, msgpack_out):
    """Build the Traditional -> Simplified mapping and save it as msgpack.

    `table_in` is the tab-separated (hex codepoint, converted character) file
    that OpenCC produced from the table written by `make_hanzi_table`.  Only
    entries that OpenCC actually changed are stored, keyed by the original
    codepoint as an int.  The mapping is written gzip-compressed to
    `msgpack_out`.

    Raises ValueError if a conversion result is not exactly one character,
    which would mean OpenCC translated instead of normalizing.
    """
    table = {}
    with open(table_in, encoding='utf-8') as infile:
        for line in infile:
            hexcode, char = line.rstrip('\n').split('\t')
            codept = int(hexcode, 16)
            # Validate explicitly: `assert` would be stripped under `python -O`.
            if len(char) != 1:
                raise ValueError(
                    'expected a single-character mapping for %s, got %r'
                    % (hexcode, char))
            # Keep only characters that the conversion actually changed.
            if chr(codept) != char:
                table[codept] = char
    with gzip.open(msgpack_out, 'wb') as outfile:
        # msgpack >= 1.0 removed the `encoding` keyword argument; str values
        # are packed as UTF-8 by default, so no extra argument is needed.
        msgpack.dump(table, outfile)
def build():
    """Generate _chinese_mapping.msgpack.gz in the current directory.

    Requires the external OpenCC conversion tool (`opencc`) to be on the
    PATH, along with its `zht2zhs.ini` configuration.

    Raises subprocess.CalledProcessError if OpenCC exits with a nonzero
    status, and OSError if the `opencc` binary cannot be found.
    """
    make_hanzi_table('/tmp/han_in.txt')
    # Run OpenCC without a shell and fail loudly on error -- os.system()
    # silently ignored both a missing binary and a nonzero exit status.
    with open('/tmp/han_in.txt', 'rb') as cc_in, \
         open('/tmp/han_out.txt', 'wb') as cc_out:
        subprocess.check_call(['opencc', '-c', 'zht2zhs.ini'],
                              stdin=cc_in, stdout=cc_out)
    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.msgpack.gz')
# Allow running this module directly as a script to regenerate the mapping.
if __name__ == '__main__':
    build()