mirror of https://github.com/rspeer/wordfreq.git
load the Chinese character mapping from a .msgpack.gz file
parent 06f8b29971
commit 6cf4210187
@@ -1,20 +1,21 @@
 """
-Generate a Python file, _chinese_mapping.py, that maps Traditional Chinese
-characters to their Simplified Chinese equivalents.
+Generate a msgpack file, _chinese_mapping.msgpack.gz, that maps Traditional
+Chinese characters to their Simplified Chinese equivalents.
 
 This is meant to be a normalization of text, somewhat like case-folding -- not
 an actual translator, a task for which this method would be unsuitable. We
-store word frequencies using Simplified Chinese characters so that, in the large
-number of cases where a Traditional Chinese word has an obvious Simplified Chinese
-mapping, we can get a frequency for it that's the same in Simplified and Traditional
-Chinese.
+store word frequencies using Simplified Chinese characters so that, in the
+large number of cases where a Traditional Chinese word has an obvious
+Simplified Chinese mapping, we can get a frequency for it that's the same in
+Simplified and Traditional Chinese.
 
 Generating this mapping requires the external Chinese conversion tool OpenCC.
 """
 import unicodedata
 import itertools
 import os
-import pprint
+import msgpack
+import gzip
 
 
 def make_hanzi_table(filename):
@@ -25,7 +26,7 @@ def make_hanzi_table(filename):
             print('%5X\t%s' % (codept, char), file=out)
 
 
-def make_hanzi_converter(table_in, python_out):
+def make_hanzi_converter(table_in, msgpack_out):
     table = {}
     with open(table_in, encoding='utf-8') as infile:
         for line in infile:
@@ -34,15 +35,14 @@ def make_hanzi_converter(table_in, python_out):
             assert len(char) == 1
             if chr(codept) != char:
                 table[codept] = char
-    with open(python_out, 'w', encoding='utf-8') as outfile:
-        print('SIMPLIFIED_MAP = ', end='', file=outfile)
-        pprint.pprint(table, stream=outfile)
+    with gzip.open(msgpack_out, 'wb') as outfile:
+        msgpack.dump(table, outfile, encoding='utf-8')
 
 
 def build():
     make_hanzi_table('/tmp/han_in.txt')
     os.system('opencc -c zht2zhs.ini < /tmp/han_in.txt > /tmp/han_out.txt')
-    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.py')
+    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.msgpack.gz')
 
 
 if __name__ == '__main__':
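The hunks above change the mapping-generation script so that, instead of pretty-printing the table into an importable Python module, it serializes the dict of codepoints to msgpack and gzips it, which avoids parsing a huge dict literal at import time. Below is a minimal round-trip sketch of that storage format. It is written against the current msgpack-python API (1.0+): the encoding='utf-8' argument seen in the diff belongs to the older API the commit was written for and no longer exists, and integer map keys now need strict_map_key=False when unpacking. The file path and the single table entry are illustrative.

    import gzip
    import msgpack

    # One illustrative Traditional -> Simplified entry, keyed by codepoint:
    # 萬 (U+842C) maps to 万 (U+4E07).
    table = {0x842C: '\u4e07'}

    # Write the table the way the new make_hanzi_converter does, but with
    # the modern API: pack to msgpack, then let gzip compress the stream.
    with gzip.open('/tmp/_chinese_mapping.msgpack.gz', 'wb') as outfile:
        msgpack.pack(table, outfile)

    # Read it back; strict_map_key=False permits the integer keys that
    # msgpack-python >= 1.0 rejects by default.
    with gzip.open('/tmp/_chinese_mapping.msgpack.gz', 'rb') as infile:
        loaded = msgpack.unpack(infile, strict_map_key=False)

    assert loaded == table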
@@ -85,10 +85,11 @@ def available_languages(wordlist='combined'):
     """
     available = {}
     for path in DATA_PATH.glob('*.msgpack.gz'):
-        list_name = path.name.split('.')[0]
-        name, lang = list_name.split('_')
-        if name == wordlist:
-            available[lang] = str(path)
+        if not path.name.startswith('_'):
+            list_name = path.name.split('.')[0]
+            name, lang = list_name.split('_')
+            if name == wordlist:
+                available[lang] = str(path)
     return available
 
 
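The new startswith('_') guard in available_languages is what keeps the function from tripping over the data file this commit adds: _chinese_mapping.msgpack.gz sits in the same directory as the wordlists but does not follow their name_lang naming scheme. A quick demonstration of the failure the guard prevents:

    # Wordlist files follow a <name>_<lang> convention, so the old parsing
    # worked for them:
    name, lang = 'combined_en.msgpack.gz'.split('.')[0].split('_')
    assert (name, lang) == ('combined', 'en')

    # The new data file starts with an underscore, so the same split yields
    # three pieces, and unpacking them into two names would raise
    # ValueError: too many values to unpack.
    pieces = '_chinese_mapping.msgpack.gz'.split('.')[0].split('_')
    assert pieces == ['', 'chinese', 'mapping']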
|
File diff suppressed because it is too large
Load Diff
@@ -1,14 +1,19 @@
 from pkg_resources import resource_filename
-from wordfreq._chinese_mapping import SIMPLIFIED_MAP
 import jieba
+import msgpack
+import gzip
 
-
 jieba_tokenizer = None
+simplified_map = None
 DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
+SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
 
 
 def simplify_chinese(text):
-    return text.translate(SIMPLIFIED_MAP).casefold()
+    global simplified_map
+    if simplified_map is None:
+        simplified_map = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8')
+    return text.translate(simplified_map).casefold()
 
 
 def jieba_tokenize(text):
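In the rewritten simplify_chinese, the table is loaded lazily on first use and cached in a module-level variable, so importing wordfreq no longer pays for the mapping up front. The loaded dict also works with str.translate directly, because translate expects a mapping from Unicode codepoints (integers) to replacement strings, which is exactly the shape the build script stores. A small illustration with a made-up one-entry table:

    # str.translate looks up each character's codepoint in the mapping;
    # characters without an entry pass through unchanged.
    simplified_map = {0x842C: '\u4e07'}  # illustrative: 萬 -> 万

    assert '萬一'.translate(simplified_map) == '万一'
    # casefold() then normalizes any non-Chinese text in the same pass:
    assert '萬A'.translate(simplified_map).casefold() == '万a'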
BIN  wordfreq/data/_chinese_mapping.msgpack.gz  (new file; binary file not shown)