wordfreq/scripts/make_chinese_mapping.py

"""
Generate a msgpack file, _chinese_mapping.msgpack.gz, that maps Traditional
Chinese characters to their Simplified Chinese equivalents.
This is meant to be a normalization of text, somewhat like case-folding -- not
an actual translator, a task for which this method would be unsuitable. We
store word frequencies using Simplified Chinese characters so that, in the
large number of cases where a Traditional Chinese word has an obvious
Simplified Chinese mapping, we can get a frequency for it that's the same in
Simplified and Traditional Chinese.
Generating this mapping requires the external Chinese conversion tool OpenCC.
"""
import unicodedata
import itertools
import os
import msgpack
import gzip


def make_hanzi_table(filename):
    # Write every assigned Han character as a '<hex codepoint>\t<character>'
    # line, covering CJK Extension A and the main CJK block, the compatibility
    # ideographs, and the Supplementary Ideographic Plane.
    with open(filename, 'w', encoding='utf-8') as out:
        for codept in itertools.chain(range(0x3400, 0xa000), range(0xf900, 0xfb00), range(0x20000, 0x30000)):
            char = chr(codept)
            # Skip codepoints that are unassigned (general category 'Cn').
            if unicodedata.category(char) != 'Cn':
                print('%5X\t%s' % (codept, char), file=out)


def make_hanzi_converter(table_in, msgpack_out):
    # Read back the table that OpenCC converted, keeping only the characters
    # that actually changed, keyed by their original codepoint.
    table = {}
    with open(table_in, encoding='utf-8') as infile:
        for line in infile:
            hexcode, char = line.rstrip('\n').split('\t')
            codept = int(hexcode, 16)
            assert len(char) == 1
            if chr(codept) != char:
                table[codept] = char
    with gzip.open(msgpack_out, 'wb') as outfile:
        # use_bin_type=True packs the single-character replacement strings
        # as UTF-8 text.
        msgpack.dump(table, outfile, use_bin_type=True)
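

# Illustrative sketch, not part of the build: once build() has produced
# _chinese_mapping.msgpack.gz, a table of this shape can be applied directly
# with str.translate, since it maps integer codepoints to replacement strings.
# The function name and the strict_map_key argument (needed for integer keys
# under msgpack >= 1.0) are assumptions for this example.
def example_simplify(text, mapping_path='_chinese_mapping.msgpack.gz'):
    with gzip.open(mapping_path, 'rb') as infile:
        mapping = msgpack.load(infile, raw=False, strict_map_key=False)
    return text.translate(mapping)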


def build():
    make_hanzi_table('/tmp/han_in.txt')
    # Convert Traditional to Simplified with OpenCC (zht2zhs.ini is the
    # Traditional-to-Simplified configuration from older OpenCC releases).
    os.system('opencc -c zht2zhs.ini < /tmp/han_in.txt > /tmp/han_out.txt')
    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.msgpack.gz')


if __name__ == '__main__':
    build()