WIP: Traditional Chinese

Former-commit-id: 7906a671ea
Robyn Speer 2015-09-04 16:59:11 -04:00
parent e77c2dbca8
commit 7d1c2e72e4
8 changed files with 3367 additions and 12 deletions


@@ -117,32 +117,32 @@ of word usage on different topics at different levels of formality. The sources
 - **GBooks**: Google Books Ngrams 2013
 - **LeedsIC**: The Leeds Internet Corpus
-- **OpenSub**: OpenSubtitles
 - **SUBTLEX**: The SUBTLEX word frequency lists
+- **OpenSub**: Data derived from OpenSubtitles but not from SUBTLEX
 - **Twitter**: Messages sampled from Twitter's public stream
 - **Wikipedia**: The full text of Wikipedia in 2015

 The following 12 languages are well-supported, with reasonable tokenization and
 at least 3 different sources of word frequencies:

-Language   Code     GBooks   SUBTLEX  LeedsIC  OpenSub  Twitter  Wikipedia
+Language   Code     GBooks   SUBTLEX  OpenSub  LeedsIC  Twitter  Wikipedia
 ──────────────────┼──────────────────────────────────────────────────
 Arabic     ar     │ -        -        Yes      Yes      Yes      Yes
-German     de     │ -        Yes      Yes      Yes      Yes[1]   Yes
+German     de     │ -        Yes      -        Yes      Yes[1]   Yes
 Greek      el     │ -        -        Yes      Yes      Yes      Yes
 English    en     │ Yes      Yes      Yes      Yes      Yes      Yes
 Spanish    es     │ -        -        Yes      Yes      Yes      Yes
 French     fr     │ -        -        Yes      Yes      Yes      Yes
-Indonesian id     │ -        -        -        Yes      Yes      Yes
+Indonesian id     │ -        -        Yes      -        Yes      Yes
 Italian    it     │ -        -        Yes      Yes      Yes      Yes
-Japanese   ja     │ -        -        Yes      -        Yes      Yes
+Japanese   ja     │ -        -        -        Yes      Yes      Yes
-Malay      ms     │ -        -        -        Yes      Yes      Yes
+Malay      ms     │ -        -        Yes      -        Yes      Yes
-Dutch      nl     │ -        Yes      -        Yes      Yes      Yes
+Dutch      nl     │ -        Yes      Yes      -        Yes      Yes
-Polish     pl     │ -        -        -        Yes      Yes      Yes
+Polish     pl     │ -        -        Yes      -        Yes      Yes
 Portuguese pt     │ -        -        Yes      Yes      Yes      Yes
 Russian    ru     │ -        -        Yes      Yes      Yes      Yes
-Swedish    sv     │ -        -        -        Yes      Yes      Yes
+Swedish    sv     │ -        -        Yes      -        Yes      Yes
-Turkish    tr     │ -        -        -        Yes      Yes      Yes
+Turkish    tr     │ -        -        Yes      -        Yes      Yes

 These languages are only marginally supported so far. We have too few data
 sources for Korean (feel free to suggest some), and we are lacking


@@ -0,0 +1,37 @@
import unicodedata
import itertools
import os
import pprint


def make_hanzi_table(filename):
    # Dump every assigned character in the main CJK ranges (the BMP ideograph
    # blocks, the compatibility ideographs, and the Plane 2 extensions) as
    # "hexcode<TAB>character" lines.
    with open(filename, 'w', encoding='utf-8') as out:
        for codept in itertools.chain(range(0x3400, 0xa000), range(0xf900, 0xfb00), range(0x20000, 0x30000)):
            char = chr(codept)
            if unicodedata.category(char) != 'Cn':
                print('%5X\t%s' % (codept, char), file=out)


def make_hanzi_converter(table_in, python_out):
    # Read the table back after OpenCC has converted it to Simplified Chinese,
    # keep only the characters that actually changed, and write the result out
    # as a Python module containing a dict usable with str.translate().
    table = {}
    with open(table_in, encoding='utf-8') as infile:
        for line in infile:
            hexcode, char = line.rstrip('\n').split('\t')
            codept = int(hexcode, 16)
            assert len(char) == 1
            if chr(codept) != char:
                table[codept] = char
    with open(python_out, 'w', encoding='utf-8') as outfile:
        print('SIMPLIFIED_MAP = ', end='', file=outfile)
        pprint.pprint(table, stream=outfile)


def build():
    make_hanzi_table('/tmp/han_in.txt')
    os.system('opencc -c zht2zhs.ini < /tmp/han_in.txt > /tmp/han_out.txt')
    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.py')


if __name__ == '__main__':
    build()
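
The generated module is just a large dict literal mapping Traditional Chinese codepoints (as integers) to Simplified characters, which is the form str.translate() expects. Here is a minimal sketch of what the output looks like and how it gets used; the two entries below are hand-picked illustrations, not copied from the generated table:

    # Illustrative stand-in for the generated _chinese_mapping.py; the real
    # SIMPLIFIED_MAP has thousands of entries produced by the opencc pass above.
    SIMPLIFIED_MAP = {
        0x842C: '万',   # 萬 (U+842C) -> 万
        0x9F8D: '龙',   # 龍 (U+9F8D) -> 龙
    }

    # str.translate() takes a dict keyed by integer codepoints and applies the
    # whole table in one pass; characters without an entry pass through as-is.
    print('一萬條龍'.translate(SIMPLIFIED_MAP))
    # prints '一万條龙' (條 is untouched here only because this toy map omits it)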

wordfreq/_chinese_mapping.py (new file, 3275 lines)

File diff suppressed because it is too large.

wordfreq/chinese.py (new file, 19 lines)

@@ -0,0 +1,19 @@
from pkg_resources import resource_filename
from wordfreq._chinese_mapping import SIMPLIFIED_MAP
import jieba

jieba_initialized = False


def simplify_chinese(text):
    # Convert Traditional Chinese characters to their Simplified equivalents
    # and case-fold, so the text matches the normalized form used for lookups.
    return text.translate(SIMPLIFIED_MAP).casefold()


def jieba_tokenize(text):
    # On first use, point jieba at the dictionary bundled with wordfreq, then
    # segment the simplified text and return the tokens as a list.
    global jieba_initialized
    if not jieba_initialized:
        jieba.set_dictionary(resource_filename('wordfreq', 'data/jieba.txt'))
        jieba_initialized = True
    return list(jieba.cut(simplify_chinese(text)))
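
Used directly, the module behaves roughly as follows. The example strings are mine, and the exact segmentation depends on the jieba dictionary that wordfreq bundles, so treat the token boundaries as a guess rather than a promised result:

    from wordfreq.chinese import simplify_chinese, jieba_tokenize

    # Traditional text is normalized to Simplified (and case-folded) first.
    print(simplify_chinese('漢字'))      # -> '汉字'

    # Tokenization simplifies the text, then lets jieba segment it; the first
    # call also loads the bundled dictionary.
    print(jieba_tokenize('谢谢你'))      # plausibly ['谢谢', '你']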


@@ -1,5 +1,6 @@
 import regex
 import unicodedata
+from pkg_resources import resource_filename


 TOKEN_RE = regex.compile(r"""
@@ -87,6 +88,7 @@ def remove_arabic_marks(text):
 mecab_tokenize = None
+jieba_tokenize = None


 def tokenize(text, lang):
     """
     Tokenize this text in a way that's relatively simple but appropriate for
@@ -115,9 +117,15 @@ def tokenize(text, lang):
     if lang == 'ja':
         global mecab_tokenize
         if mecab_tokenize is None:
-            from wordfreq.mecab import mecab_tokenize
+            from wordfreq.japanese import mecab_tokenize
         return mecab_tokenize(text)

+    if lang == 'zh':
+        global jieba_tokenize
+        if jieba_tokenize is None:
+            from wordfreq.chinese import jieba_tokenize
+        return jieba_tokenize(text)
+
     if lang == 'tr':
         return turkish_tokenize(text)
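
As with Japanese, the 'zh' branch relies on a lazy import: the module-level jieba_tokenize starts out as None, and the in-function import under the global declaration rebinds that name on the first Chinese call, so jieba is only loaded when Chinese text actually shows up. A rough usage sketch, assuming tokenize is re-exported at the top level of the wordfreq package as in released versions, with illustrative output:

    from wordfreq import tokenize   # assumed top-level re-export of this function

    print(tokenize('Hello, world!', 'en'))   # e.g. ['hello', 'world']
    print(tokenize('谢谢你', 'zh'))           # first 'zh' call imports jieba;
                                              # plausibly ['谢谢', '你']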


@@ -32,10 +32,15 @@ rule wiki2text
   command = bunzip2 -c $in | wiki2text > $out

 # To tokenize Japanese, we run it through Mecab and take the first column.
-# We don't have a plan for tokenizing Chinese yet.
 rule tokenize_japanese
   command = mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out

+# Process Chinese by converting all Traditional Chinese characters to
+# Simplified equivalents -- not because that's a good way to get readable
+# text, but because that's how we're going to look them up.
+rule simplify_chinese
+  command = python -m wordfreq_builder.cli.simplify_chinese < $in > $out
+
 # Tokenizing text from Twitter requires us to language-detect and tokenize
 # in the same step.
 rule tokenize_twitter


@@ -0,0 +1,11 @@
from wordfreq.chinese import simplify_chinese
import sys


def main():
    # Stream stdin to stdout, converting each line to Simplified Chinese
    # (and case-folding it) along the way.
    for line in sys.stdin:
        sys.stdout.write(simplify_chinese(line))


if __name__ == '__main__':
    main()