WIP: Traditional Chinese

2024-12-23 09:21:37 +00:00 · 2015-09-04 16:59:11 -04:00 · 2015-09-04 16:59:11 -04:00 · 7906a671ea
commit 7906a671ea
parent 3c3371a9ff
8 changed files with 3367 additions and 12 deletions
--- a/README.md
+++ b/README.md
@ -117,32 +117,32 @@ of word usage on different topics at different levels of formality. The sources

 - **GBooks**: Google Books Ngrams 2013
 - **LeedsIC**: The Leeds Internet Corpus
- **OpenSub**: OpenSubtitles
 - **SUBTLEX**: The SUBTLEX word frequency lists
+- **OpenSub**: Data derived from OpenSubtitles but not from SUBTLEX
 - **Twitter**: Messages sampled from Twitter's public stream
 - **Wikipedia**: The full text of Wikipedia in 2015

 The following 16 languages are well-supported, with reasonable tokenization and
 at least 3 different sources of word frequencies:

-    Language    Code    GBooks  SUBTLEX LeedsIC OpenSub Twitter Wikipedia
+    Language    Code    GBooks  SUBTLEX OpenSub LeedsIC Twitter Wikipedia
    ──────────────────┼──────────────────────────────────────────────────
    Arabic      ar    │ -       -       Yes     Yes     Yes     Yes
-    German      de    │ -       Yes     Yes     Yes     Yes[1]  Yes
+    German      de    │ -       Yes     -       Yes     Yes[1]  Yes
    Greek       el    │ -       -       Yes     Yes     Yes     Yes
    English     en    │ Yes     Yes     Yes     Yes     Yes     Yes
    Spanish     es    │ -       -       Yes     Yes     Yes     Yes
    French      fr    │ -       -       Yes     Yes     Yes     Yes
-    Indonesian  id    │ -       -       -       Yes     Yes     Yes
+    Indonesian  id    │ -       -       Yes     -       Yes     Yes
    Italian     it    │ -       -       Yes     Yes     Yes     Yes
-    Japanese    ja    │ -       -       Yes     -       Yes     Yes
-    Malay       ms    │ -       -       -       Yes     Yes     Yes
-    Dutch       nl    │ -       Yes     -       Yes     Yes     Yes
-    Polish      pl    │ -       -       -       Yes     Yes     Yes
+    Japanese    ja    │ -       -       -       Yes     Yes     Yes
+    Malay       ms    │ -       -       Yes     -       Yes     Yes
+    Dutch       nl    │ -       Yes     Yes     -       Yes     Yes
+    Polish      pl    │ -       -       Yes     -       Yes     Yes
    Portuguese  pt    │ -       -       Yes     Yes     Yes     Yes
    Russian     ru    │ -       -       Yes     Yes     Yes     Yes
-    Swedish     sv    │ -       -       -       Yes     Yes     Yes
-    Turkish     tr    │ -       -       -       Yes     Yes     Yes
+    Swedish     sv    │ -       -       Yes     -       Yes     Yes
+    Turkish     tr    │ -       -       Yes     -       Yes     Yes

 These languages are only marginally supported so far. We have too few data
 sources so far in Korean (feel free to suggest some), and we are lacking
--- a/scripts/make_chinese_mapping.py
+++ b/scripts/make_chinese_mapping.py
@ -0,0 +1,37 @@
+import unicodedata
+import itertools
+import os
+import pprint
+
+
+def make_hanzi_table(filename):
+    """
+    Write a table of assigned code points to `filename`, one per line.
+
+    Each line holds the code point in uppercase hexadecimal (space-padded to
+    five columns), a tab, and the character itself. Only the three code point
+    ranges listed below are scanned, and code points whose Unicode general
+    category is 'Cn' (unassigned) are left out.
+    """
+    candidates = itertools.chain(
+        range(0x3400, 0xa000),
+        range(0xf900, 0xfb00),
+        range(0x20000, 0x30000),
+    )
+    with open(filename, 'w', encoding='utf-8') as out:
+        for codept in candidates:
+            char = chr(codept)
+            if unicodedata.category(char) == 'Cn':
+                # Unassigned in this Python's Unicode database; skip it.
+                continue
+            print('%5X\t%s' % (codept, char), file=out)
+
+
+def make_hanzi_converter(table_in, python_out):
+    """
+    Turn a converted character table into a Python module defining
+    `SIMPLIFIED_MAP`.
+
+    `table_in` is a UTF-8 file whose lines are a hexadecimal code point, a
+    tab, and a single character. Whenever that character differs from the
+    character the code point itself encodes, the pair is recorded as
+    `{code point: character}`. The resulting dictionary is pretty-printed to
+    `python_out` as the assignment `SIMPLIFIED_MAP = {...}`, in a form that
+    `str.translate` accepts.
+
+    Raises ValueError if the second column of a line is not exactly one
+    character, because such an entry could not have come from a
+    one-to-one character conversion.
+    """
+    table = {}
+    with open(table_in, encoding='utf-8') as infile:
+        for line in infile:
+            hexcode, char = line.rstrip('\n').split('\t')
+            # int() tolerates the space padding that '%5X' adds to the code.
+            codept = int(hexcode, 16)
+            # Explicit check instead of `assert`, which `python -O` removes.
+            if len(char) != 1:
+                raise ValueError(
+                    'Expected one character for U+%04X, got %r' % (codept, char)
+                )
+            if chr(codept) != char:
+                table[codept] = char
+    with open(python_out, 'w', encoding='utf-8') as outfile:
+        print('SIMPLIFIED_MAP = ', end='', file=outfile)
+        pprint.pprint(table, stream=outfile)
+
+
+def build():
+    """
+    Regenerate `_chinese_mapping.py` in the current working directory.
+
+    Steps:
+
+    1. Write the character table to /tmp/han_in.txt.
+    2. Run it through the `opencc` command-line tool with the `zht2zhs.ini`
+       configuration (Traditional-to-Simplified, judging by its name),
+       producing /tmp/han_out.txt.
+    3. Convert the result into Python source defining `SIMPLIFIED_MAP`.
+
+    Raises subprocess.CalledProcessError if opencc exits with a non-zero
+    status, and OSError if it cannot be started, instead of silently
+    generating a mapping from an empty or partial output file.
+    """
+    # Local import: only this function shells out.
+    import subprocess
+
+    make_hanzi_table('/tmp/han_in.txt')
+    # Pass an argument list and wire up stdin/stdout directly rather than
+    # building a shell command, and check the exit status.
+    with open('/tmp/han_in.txt', 'rb') as han_in, \
+            open('/tmp/han_out.txt', 'wb') as han_out:
+        subprocess.check_call(
+            ['opencc', '-c', 'zht2zhs.ini'], stdin=han_in, stdout=han_out
+        )
+    make_hanzi_converter('/tmp/han_out.txt', '_chinese_mapping.py')
+
+
+if __name__ == '__main__':
+    build()
+
--- a/wordfreq/_chinese_mapping.py
+++ b/wordfreq/_chinese_mapping.py
--- a/wordfreq/chinese.py
+++ b/wordfreq/chinese.py
@ -0,0 +1,19 @@
+from pkg_resources import resource_filename
+from wordfreq._chinese_mapping import SIMPLIFIED_MAP
+import jieba
+
+
+jieba_initialized = False
+
+
+def simplify_chinese(text):
+    """
+    Return `text` with every character that has an entry in `SIMPLIFIED_MAP`
+    replaced by its mapped character, and the result case-folded.
+    """
+    mapped = text.translate(SIMPLIFIED_MAP)
+    return mapped.casefold()
+
+
+def chinese_tokenize(text):
+    """
+    Split `text` into a list of tokens using jieba.
+
+    The text is first passed through `simplify_chinese`. On the first call,
+    jieba is pointed at the dictionary file `data/jieba.txt` that is
+    packaged with wordfreq; later calls reuse that setup.
+    """
+    global jieba_initialized
+    if not jieba_initialized:
+        jieba.set_dictionary(resource_filename('wordfreq', 'data/jieba.txt'))
+        jieba_initialized = True
+    return list(jieba.cut(simplify_chinese(text)))
+
+
+# wordfreq.tokens imports this function under the name `jieba_tokenize`;
+# provide that name as well so the import there succeeds.
+jieba_tokenize = chinese_tokenize
+
--- a/wordfreq/japanese.py
+++ b/wordfreq/japanese.py
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@ -1,5 +1,6 @@
 import regex
 import unicodedata
+from pkg_resources import resource_filename


 TOKEN_RE = regex.compile(r"""
@ -87,6 +88,7 @@ def remove_arabic_marks(text):


 mecab_tokenize = None
+jieba_tokenize = None
 def tokenize(text, lang):
    """
    Tokenize this text in a way that's relatively simple but appropriate for
@ -115,9 +117,15 @@ def tokenize(text, lang):
    if lang == 'ja':
        global mecab_tokenize
        if mecab_tokenize is None:
-            from wordfreq.mecab import mecab_tokenize
+            from wordfreq.japanese import mecab_tokenize
        return mecab_tokenize(text)

+    if lang == 'zh':
+        global jieba_tokenize
+        if jieba_tokenize is None:
+            from wordfreq.chinese import jieba_tokenize
+        return jieba_tokenize(text)
+
    if lang == 'tr':
        return turkish_tokenize(text)

--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@ -32,10 +32,15 @@ rule wiki2text
  command = bunzip2 -c $in | wiki2text > $out

 # To tokenize Japanese, we run it through Mecab and take the first column.
-# We don't have a plan for tokenizing Chinese yet.
 rule tokenize_japanese
  command = mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out

+# Process Chinese by converting all Traditional Chinese characters to
+# Simplified equivalents -- not because that's a good way to get readable
+# text, but because that's how we're going to look them up.
+rule simplify_chinese
+  command = python -m wordfreq_builder.cli.simplify_chinese < $in > $out
+
 # Tokenizing text from Twitter requires us to language-detect and tokenize
 # in the same step.
 rule tokenize_twitter
--- a/wordfreq_builder/wordfreq_builder/cli/simplify_chinese.py
+++ b/wordfreq_builder/wordfreq_builder/cli/simplify_chinese.py
@ -0,0 +1,11 @@
+from wordfreq.chinese import simplify_chinese
+import sys
+
+
+def main():
+    """
+    Filter standard input to standard output, applying `simplify_chinese`
+    to each line as it is read.
+    """
+    sys.stdout.writelines(simplify_chinese(line) for line in sys.stdin)
+
+
+if __name__ == '__main__':
+    main()