wordfreq/scripts/make_chinese_mapping.py

import unicodedata
import itertools
import os
import pprint


def make_hanzi_table(filename):
    """
    Write a table of assigned code points from the Han ranges to `filename`.

    The file is UTF-8 text with one line per code point, in the form
    ``<hex>\\t<char>``, where ``<hex>`` is the code point in uppercase
    hexadecimal, space-padded on the left to at least 5 columns.

    Three ranges are scanned, in this order: U+3400..U+9FFF,
    U+F900..U+FAFF and U+20000..U+2FFFF. Code points whose Unicode general
    category is 'Cn' (unassigned) in the running Python's Unicode database
    are left out.
    """
    scanned_ranges = (
        range(0x3400, 0xa000),
        range(0xf900, 0xfb00),
        range(0x20000, 0x30000),
    )
    with open(filename, 'w', encoding='utf-8') as out:
        for codept in itertools.chain.from_iterable(scanned_ranges):
            char = chr(codept)
            if unicodedata.category(char) == 'Cn':
                # Unassigned code point: nothing to convert.
                continue
            out.write('%5X\t%s\n' % (codept, char))


def make_hanzi_converter(table_in, python_out):
    """
    Turn a converted character table into a Python source file.

    `table_in` is a UTF-8 text file whose lines look like ``<hex>\\t<char>``:
    a hexadecimal code point, a tab, and exactly one character. Every line
    where the character differs from the one the code point itself denotes
    becomes an entry ``{code point (int): character}``.

    The resulting dict is written to `python_out` as the assignment
    ``SIMPLIFIED_MAP = {...}``, pretty-printed by `pprint`.

    Raises ValueError if a line does not have exactly two tab-separated
    fields or the first field is not valid hexadecimal, and AssertionError
    if the second field is not a single character.
    """
    mapping = {}
    with open(table_in, encoding='utf-8') as infile:
        for row in infile:
            fields = row.rstrip('\n').split('\t')
            hex_field, converted = fields
            source = int(hex_field, 16)
            assert len(converted) == 1
            # Only record characters that the conversion actually changed.
            if converted != chr(source):
                mapping[source] = converted

    rendered = 'SIMPLIFIED_MAP = ' + pprint.pformat(mapping) + '\n'
    with open(python_out, 'w', encoding='utf-8') as outfile:
        outfile.write(rendered)


def build():
    """
    Regenerate `_chinese_mapping.py` in the current working directory.

    Steps:

    1. Write the table of Han characters to `/tmp/han_in.txt`.
    2. Pipe that table through the external `opencc` command with the
       `zht2zhs.ini` configuration (presumably Traditional-to-Simplified;
       confirm against the installed OpenCC), producing `/tmp/han_out.txt`.
    3. Convert the result into the `SIMPLIFIED_MAP` dict literal.

    Raises RuntimeError if the `opencc` command exits with a non-zero
    status (for example because it is not installed). Without this check
    the last step would read an empty or partial file and silently write
    a wrong mapping.
    """
    make_hanzi_table('/tmp/han_in.txt')
    status = os.system('opencc -c zht2zhs.ini < /tmp/han_in.txt > /tmp/han_out.txt')
    if status != 0:
        raise RuntimeError(
            'opencc failed (os.system returned %r); '
            'not writing _chinese_mapping.py' % (status,)
        )
    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.py')


# Run the build only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == '__main__':
    build()
WIP: Traditional Chinese
Former-commit-id: 7906a671ea7d2f5d9a49d519aac456329622d184
2015-09-04 20:59:11 +00:00
			`import unicodedata`
			`import itertools`
			`import os`
			`import pprint`


			`def make_hanzi_table(filename):`
			`with open(filename, 'w', encoding='utf-8') as out:`
			`for codept in itertools.chain(range(0x3400, 0xa000), range(0xf900, 0xfb00), range(0x20000, 0x30000)):`
			`char = chr(codept)`
			`if unicodedata.category(char) != 'Cn':`
			`print('%5X\t%s' % (codept, char), file=out)`


			`def make_hanzi_converter(table_in, python_out):`
			`table = {}`
			`with open(table_in, encoding='utf-8') as infile:`
			`for line in infile:`
			`hexcode, char = line.rstrip('\n').split('\t')`
			`codept = int(hexcode, 16)`
			`assert len(char) == 1`
			`if chr(codept) != char:`
			`table[codept] = char`
			`with open(python_out, 'w', encoding='utf-8') as outfile:`
			`print('SIMPLIFIED_MAP = ', end='', file=outfile)`
			`pprint.pprint(table, stream=outfile)`


			`def build():`
			`make_hanzi_table('/tmp/han_in.txt')`
			`os.system('opencc -c zht2zhs.ini < /tmp/han_in.txt > /tmp/han_out.txt')`
			`make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.py')`


			`if __name__ == '__main__':`
			`build()`