WIP: Traditional Chinese

Former-commit-id: 7906a671ea
Robyn Speer 2015-09-04 16:59:11 -04:00
parent e77c2dbca8
commit 7d1c2e72e4
8 changed files with 3367 additions and 12 deletions


@@ -117,32 +117,32 @@ of word usage on different topics at different levels of formality. The sources
 - **GBooks**: Google Books Ngrams 2013
 - **LeedsIC**: The Leeds Internet Corpus
-- **OpenSub**: OpenSubtitles
 - **SUBTLEX**: The SUBTLEX word frequency lists
+- **OpenSub**: Data derived from OpenSubtitles but not from SUBTLEX
 - **Twitter**: Messages sampled from Twitter's public stream
 - **Wikipedia**: The full text of Wikipedia in 2015

 The following 12 languages are well-supported, with reasonable tokenization and
 at least 3 different sources of word frequencies:

-Language   Code     GBooks   SUBTLEX  LeedsIC  OpenSub  Twitter  Wikipedia
+Language   Code     GBooks   SUBTLEX  OpenSub  LeedsIC  Twitter  Wikipedia
 ──────────────────┼──────────────────────────────────────────────────
 Arabic     ar     │ -        -        Yes      Yes      Yes      Yes
-German     de     │ -        Yes      Yes      Yes      Yes[1]   Yes
+German     de     │ -        Yes      -        Yes      Yes[1]   Yes
 Greek      el     │ -        -        Yes      Yes      Yes      Yes
 English    en     │ Yes      Yes      Yes      Yes      Yes      Yes
 Spanish    es     │ -        -        Yes      Yes      Yes      Yes
 French     fr     │ -        -        Yes      Yes      Yes      Yes
-Indonesian id     │ -        -        -        Yes      Yes      Yes
+Indonesian id     │ -        -        Yes      -        Yes      Yes
 Italian    it     │ -        -        Yes      Yes      Yes      Yes
-Japanese   ja     │ -        -        Yes      -        Yes      Yes
+Japanese   ja     │ -        -        -        Yes      Yes      Yes
-Malay      ms     │ -        -        -        Yes      Yes      Yes
+Malay      ms     │ -        -        Yes      -        Yes      Yes
-Dutch      nl     │ -        Yes      -        Yes      Yes      Yes
+Dutch      nl     │ -        Yes      Yes      -        Yes      Yes
-Polish     pl     │ -        -        -        Yes      Yes      Yes
+Polish     pl     │ -        -        Yes      -        Yes      Yes
 Portuguese pt     │ -        -        Yes      Yes      Yes      Yes
 Russian    ru     │ -        -        Yes      Yes      Yes      Yes
-Swedish    sv     │ -        -        -        Yes      Yes      Yes
+Swedish    sv     │ -        -        Yes      -        Yes      Yes
-Turkish    tr     │ -        -        -        Yes      Yes      Yes
+Turkish    tr     │ -        -        Yes      -        Yes      Yes

 These languages are only marginally supported so far. We have too few data
 sources for Korean (feel free to suggest some), and we are lacking


@@ -0,0 +1,37 @@
import unicodedata
import itertools
import os
import pprint


def make_hanzi_table(filename):
    # Dump every assigned character in the main CJK ranges (the BMP ideograph
    # blocks, the compatibility ideographs, and the Plane 2 extensions) as
    # "hexcode<TAB>character" lines.
    with open(filename, 'w', encoding='utf-8') as out:
        for codept in itertools.chain(range(0x3400, 0xa000), range(0xf900, 0xfb00), range(0x20000, 0x30000)):
            char = chr(codept)
            if unicodedata.category(char) != 'Cn':
                print('%5X\t%s' % (codept, char), file=out)


def make_hanzi_converter(table_in, python_out):
    # Read the table back after OpenCC has converted it to Simplified Chinese,
    # keep only the characters that actually changed, and write the result out
    # as a Python module containing a dict usable with str.translate().
    table = {}
    with open(table_in, encoding='utf-8') as infile:
        for line in infile:
            hexcode, char = line.rstrip('\n').split('\t')
            codept = int(hexcode, 16)
            assert len(char) == 1
            if chr(codept) != char:
                table[codept] = char
    with open(python_out, 'w', encoding='utf-8') as outfile:
        print('SIMPLIFIED_MAP = ', end='', file=outfile)
        pprint.pprint(table, stream=outfile)


def build():
    make_hanzi_table('/tmp/han_in.txt')
    os.system('opencc -c zht2zhs.ini < /tmp/han_in.txt > /tmp/han_out.txt')
    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.py')


if __name__ == '__main__':
    build()
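
The generated module is just a large dict literal mapping Traditional Chinese codepoints (as integers) to Simplified characters, which is the form str.translate() expects. Here is a minimal sketch of what the output looks like and how it gets used; the two entries below are hand-picked illustrations, not copied from the generated table:

    # Illustrative stand-in for the generated _chinese_mapping.py; the real
    # SIMPLIFIED_MAP has thousands of entries produced by the opencc pass above.
    SIMPLIFIED_MAP = {
        0x842C: '万',   # 萬 (U+842C) -> 万
        0x9F8D: '龙',   # 龍 (U+9F8D) -> 龙
    }

    # str.translate() takes a dict keyed by integer codepoints and applies the
    # whole table in one pass; characters without an entry pass through as-is.
    print('一萬條龍'.translate(SIMPLIFIED_MAP))
    # prints '一万條龙' (條 is untouched here only because this toy map omits it)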

wordfreq/_chinese_mapping.py (new file, 3275 lines)

File diff suppressed because it is too large.

wordfreq/chinese.py (new file, 19 lines)

@@ -0,0 +1,19 @@
from pkg_resources import resource_filename
from wordfreq._chinese_mapping import SIMPLIFIED_MAP
import jieba

jieba_initialized = False


def simplify_chinese(text):
    # Convert Traditional Chinese characters to their Simplified equivalents
    # and case-fold, so the text matches the normalized form used for lookups.
    return text.translate(SIMPLIFIED_MAP).casefold()


def jieba_tokenize(text):
    # On first use, point jieba at the dictionary bundled with wordfreq, then
    # segment the simplified text and return the tokens as a list.
    global jieba_initialized
    if not jieba_initialized:
        jieba.set_dictionary(resource_filename('wordfreq', 'data/jieba.txt'))
        jieba_initialized = True
    return list(jieba.cut(simplify_chinese(text)))
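
Used directly, the module behaves roughly as follows. The example strings are mine, and the exact segmentation depends on the jieba dictionary that wordfreq bundles, so treat the token boundaries as a guess rather than a promised result:

    from wordfreq.chinese import simplify_chinese, jieba_tokenize

    # Traditional text is normalized to Simplified (and case-folded) first.
    print(simplify_chinese('漢字'))      # -> '汉字'

    # Tokenization simplifies the text, then lets jieba segment it; the first
    # call also loads the bundled dictionary.
    print(jieba_tokenize('谢谢你'))      # plausibly ['谢谢', '你']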


@@ -1,5 +1,6 @@
 import regex
 import unicodedata
+from pkg_resources import resource_filename


 TOKEN_RE = regex.compile(r"""
@@ -87,6 +88,7 @@ def remove_arabic_marks(text):
 mecab_tokenize = None
+jieba_tokenize = None


 def tokenize(text, lang):
     """
     Tokenize this text in a way that's relatively simple but appropriate for
@@ -115,9 +117,15 @@ def tokenize(text, lang):
     if lang == 'ja':
         global mecab_tokenize
         if mecab_tokenize is None:
-            from wordfreq.mecab import mecab_tokenize
+            from wordfreq.japanese import mecab_tokenize
         return mecab_tokenize(text)

+    if lang == 'zh':
+        global jieba_tokenize
+        if jieba_tokenize is None:
+            from wordfreq.chinese import jieba_tokenize
+        return jieba_tokenize(text)
+
     if lang == 'tr':
         return turkish_tokenize(text)
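
As with Japanese, the 'zh' branch relies on a lazy import: the module-level jieba_tokenize starts out as None, and the in-function import under the global declaration rebinds that name on the first Chinese call, so jieba is only loaded when Chinese text actually shows up. A rough usage sketch, assuming tokenize is re-exported at the top level of the wordfreq package as in released versions, with illustrative output:

    from wordfreq import tokenize   # assumed top-level re-export of this function

    print(tokenize('Hello, world!', 'en'))   # e.g. ['hello', 'world']
    print(tokenize('谢谢你', 'zh'))           # first 'zh' call imports jieba;
                                              # plausibly ['谢谢', '你']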


@@ -32,10 +32,15 @@ rule wiki2text
   command = bunzip2 -c $in | wiki2text > $out

 # To tokenize Japanese, we run it through Mecab and take the first column.
-# We don't have a plan for tokenizing Chinese yet.
 rule tokenize_japanese
   command = mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out

+# Process Chinese by converting all Traditional Chinese characters to
+# Simplified equivalents -- not because that's a good way to get readable
+# text, but because that's how we're going to look them up.
+rule simplify_chinese
+  command = python -m wordfreq_builder.cli.simplify_chinese < $in > $out
+
 # Tokenizing text from Twitter requires us to language-detect and tokenize
 # in the same step.
 rule tokenize_twitter


@@ -0,0 +1,11 @@
from wordfreq.chinese import simplify_chinese
import sys


def main():
    # Stream stdin to stdout, converting each line to Simplified Chinese
    # (and case-folding it) along the way.
    for line in sys.stdin:
        sys.stdout.write(simplify_chinese(line))


if __name__ == '__main__':
    main()