WIP: Traditional Chinese

Rob Speer 2015-09-04 16:59:11 -04:00
parent 3c3371a9ff
commit 7906a671ea
8 changed files with 3367 additions and 12 deletions


@@ -117,32 +117,32 @@ of word usage on different topics at different levels of formality. The sources
- **GBooks**: Google Books Ngrams 2013
- **LeedsIC**: The Leeds Internet Corpus
- **OpenSub**: OpenSubtitles
- **SUBTLEX**: The SUBTLEX word frequency lists
- **OpenSub**: Data derived from OpenSubtitles but not from SUBTLEX
- **Twitter**: Messages sampled from Twitter's public stream
- **Wikipedia**: The full text of Wikipedia in 2015
The following 16 languages are well-supported, with reasonable tokenization and
at least 3 different sources of word frequencies:
Language     Code   GBooks    SUBTLEX   LeedsIC   OpenSub   Twitter   Wikipedia
Language     Code   GBooks    SUBTLEX   OpenSub   LeedsIC   Twitter   Wikipedia
──────────────────┼────────────────────────────────────────────────────────────
Arabic      ar    │ -         -         Yes       Yes       Yes       Yes
German      de    │ -         Yes       Yes       Yes       Yes[1]    Yes
German      de    │ -         Yes       -         Yes       Yes[1]    Yes
Greek       el    │ -         -         Yes       Yes       Yes       Yes
English     en    │ Yes       Yes       Yes       Yes       Yes       Yes
Spanish     es    │ -         -         Yes       Yes       Yes       Yes
French      fr    │ -         -         Yes       Yes       Yes       Yes
Indonesian  id    │ -         -         -         Yes       Yes       Yes
Indonesian  id    │ -         -         Yes       -         Yes       Yes
Italian     it    │ -         -         Yes       Yes       Yes       Yes
Japanese    ja    │ -         -         Yes       -         Yes       Yes
Malay       ms    │ -         -         -         Yes       Yes       Yes
Dutch       nl    │ -         Yes       -         Yes       Yes       Yes
Polish      pl    │ -         -         -         Yes       Yes       Yes
Japanese    ja    │ -         -         -         Yes       Yes       Yes
Malay       ms    │ -         -         Yes       -         Yes       Yes
Dutch       nl    │ -         Yes       Yes       -         Yes       Yes
Polish      pl    │ -         -         Yes       -         Yes       Yes
Portuguese  pt    │ -         -         Yes       Yes       Yes       Yes
Russian     ru    │ -         -         Yes       Yes       Yes       Yes
Swedish     sv    │ -         -         -         Yes       Yes       Yes
Turkish     tr    │ -         -         -         Yes       Yes       Yes
Swedish     sv    │ -         -         Yes       -         Yes       Yes
Turkish     tr    │ -         -         Yes       -         Yes       Yes
These languages are only marginally supported so far. We have too few data
sources in Korean (feel free to suggest some), and we are lacking


@@ -0,0 +1,37 @@
import unicodedata
import itertools
import os
import pprint


def make_hanzi_table(filename):
    # Write every assigned character in the CJK ideograph ranges (Extension A,
    # the main URO block, compatibility ideographs, and plane 2) as lines of
    # "hex codepoint<TAB>character".
    with open(filename, 'w', encoding='utf-8') as out:
        for codept in itertools.chain(range(0x3400, 0xa000), range(0xf900, 0xfb00), range(0x20000, 0x30000)):
            char = chr(codept)
            if unicodedata.category(char) != 'Cn':
                print('%5X\t%s' % (codept, char), file=out)


def make_hanzi_converter(table_in, python_out):
    # Read the table back after OpenCC has converted it, keep only the
    # characters that actually changed, and write the result as a dict
    # suitable for str.translate.
    table = {}
    with open(table_in, encoding='utf-8') as infile:
        for line in infile:
            hexcode, char = line.rstrip('\n').split('\t')
            codept = int(hexcode, 16)
            assert len(char) == 1
            if chr(codept) != char:
                table[codept] = char
    with open(python_out, 'w', encoding='utf-8') as outfile:
        print('SIMPLIFIED_MAP = ', end='', file=outfile)
        pprint.pprint(table, stream=outfile)


def build():
    make_hanzi_table('/tmp/han_in.txt')
    os.system('opencc -c zht2zhs.ini < /tmp/han_in.txt > /tmp/han_out.txt')
    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.py')


if __name__ == '__main__':
    build()
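
For intuition, SIMPLIFIED_MAP is exactly the kind of table that str.translate
accepts: Unicode code points mapped to replacement strings, with characters that
are identical in both scripts simply absent. A minimal sketch with two hand-picked
entries (illustrative only, not copied from the generated file):

    # Hypothetical sample of the kind of mapping the builder emits: 麼 -> 么, 檔 -> 档.
    # Code points without an entry pass through str.translate unchanged.
    SAMPLE_MAP = {0x9EBC: '么', 0x6A94: '档'}
    print('什麼檔案'.translate(SAMPLE_MAP))   # -> 什么档案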

wordfreq/_chinese_mapping.py (new file, 3275 lines)

File diff suppressed because it is too large.

wordfreq/chinese.py (new file, 19 lines)

@@ -0,0 +1,19 @@
from pkg_resources import resource_filename
from wordfreq._chinese_mapping import SIMPLIFIED_MAP
import jieba

jieba_initialized = False


def simplify_chinese(text):
    # Map Traditional characters to their Simplified equivalents and case-fold,
    # so the text matches the form used in the word frequency data.
    return text.translate(SIMPLIFIED_MAP).casefold()


def jieba_tokenize(text):
    # Lazily point jieba at the dictionary bundled with wordfreq before the
    # first use, then segment the simplified text.
    global jieba_initialized
    if not jieba_initialized:
        jieba.set_dictionary(resource_filename('wordfreq', 'data/jieba.txt'))
        jieba_initialized = True
    return list(jieba.cut(simplify_chinese(text)))
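
A rough usage sketch of this module (the exact segmentation depends on the
bundled jieba dictionary, so the token list below is only an example):

    from wordfreq.chinese import simplify_chinese, jieba_tokenize

    simplify_chinese('我們在吃蘋果')   # -> '我们在吃苹果'
    jieba_tokenize('我們在吃蘋果')     # e.g. ['我们', '在', '吃', '苹果']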


@@ -1,5 +1,6 @@
import regex
import unicodedata
from pkg_resources import resource_filename
TOKEN_RE = regex.compile(r"""
@@ -87,6 +88,7 @@ def remove_arabic_marks(text):
mecab_tokenize = None
jieba_tokenize = None
def tokenize(text, lang):
"""
Tokenize this text in a way that's relatively simple but appropriate for
@@ -115,9 +117,15 @@ def tokenize(text, lang):
    if lang == 'ja':
        global mecab_tokenize
        if mecab_tokenize is None:
            from wordfreq.mecab import mecab_tokenize
            from wordfreq.japanese import mecab_tokenize
        return mecab_tokenize(text)

    if lang == 'zh':
        global jieba_tokenize
        if jieba_tokenize is None:
            from wordfreq.chinese import jieba_tokenize
        return jieba_tokenize(text)

    if lang == 'tr':
        return turkish_tokenize(text)
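
A quick sketch of how this dispatch behaves from the caller's side, assuming
tokenize is re-exported from the top-level wordfreq package as in released
versions:

    from wordfreq import tokenize

    tokenize('我們在吃蘋果', 'zh')      # imports jieba and loads its dictionary on first call
    tokenize('これはテストです', 'ja')  # takes the MeCab path, also imported lazily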


@@ -32,10 +32,15 @@ rule wiki2text
  command = bunzip2 -c $in | wiki2text > $out

# To tokenize Japanese, we run it through Mecab and take the first column.
# We don't have a plan for tokenizing Chinese yet.
rule tokenize_japanese
  command = mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out

# Process Chinese by converting all Traditional Chinese characters to
# Simplified equivalents -- not because that's a good way to get readable
# text, but because that's how we're going to look them up.
rule simplify_chinese
  command = python -m wordfreq_builder.cli.simplify_chinese < $in > $out

# Tokenizing text from Twitter requires us to language-detect and tokenize
# in the same step.
rule tokenize_twitter
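
The point of the simplify_chinese rule above is that running the same
simplification at build time and at lookup time collapses Traditional and
Simplified spellings of a word onto a single key. A small sanity check, assuming
the OpenCC-derived map covers these characters (as standard conversion tables do):

    from wordfreq.chinese import simplify_chinese

    simplify_chinese('統計') == simplify_chinese('统计')   # True: both map to the key '统计'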


@@ -0,0 +1,11 @@
from wordfreq.chinese import simplify_chinese
import sys


def main():
    # Filter stdin to stdout, converting each line to Simplified Chinese
    # (and case-folding it) with the same function the lookup path uses.
    for line in sys.stdin:
        sys.stdout.write(simplify_chinese(line))


if __name__ == '__main__':
    main()