tokenize Chinese using jieba and our own frequencies

Rob Speer 2015-09-05 03:16:56 -04:00
parent 7906a671ea
commit 2327f2e4d6
12 changed files with 32088 additions and 40 deletions
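The user-visible effect of this change is that tokenize() and word_frequency() now handle Chinese ('zh') by running jieba with a frequency dictionary built from wordfreq's own data. A minimal sketch of the intended behavior, with expected results taken from the new tests added below (requires the optional jieba dependency):

from wordfreq import tokenize, word_frequency

# Chinese text is segmented by jieba instead of being left as one chunk.
print(tokenize('加勒特·霍巴特', 'zh'))
# expected, per tests/test_chinese.py: ['加', '勒', '特', '霍', '巴特']

# Chinese word frequencies are now available as well.
print(word_frequency('谢谢', 'zh') > 0)   # True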


@@ -33,7 +33,7 @@ if sys.version_info < (3, 4):
setup(
name="wordfreq",
version='1.1',
version='1.2',
maintainer='Luminoso Technologies, Inc.',
maintainer_email='info@luminoso.com',
url='http://github.com/LuminosoInsight/wordfreq/',
@@ -50,8 +50,11 @@ setup(
# turn, it depends on libmecab-dev being installed on the system. It's not
# listed under 'install_requires' because wordfreq should be usable in
# other languages without it.
#
# Similarly, jieba is required for Chinese word frequencies.
extras_require={
'mecab': 'mecab-python3'
'mecab': 'mecab-python3',
'jieba': 'jieba'
},
tests_require=['mecab-python3'],
tests_require=['mecab-python3', 'jieba'],
)
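Because jieba is listed under extras_require rather than install_requires, Chinese support stays opt-in: a plain install of wordfreq works without it, and users who want Chinese tokenization can pull it in with the standard extras syntax, pip install wordfreq[jieba].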


@@ -162,8 +162,8 @@ def test_ar():
def test_ideographic_fallback():
# Try tokenizing Chinese text -- it should remain stuck together.
eq_(tokenize('中国文字', 'zh'), ['中国文字'])
# Try tokenizing Chinese text as English -- it should remain stuck together.
eq_(tokenize('中国文字', 'en'), ['中国文字'])
# When Japanese is tagged with the wrong language, it will be split
# at script boundaries.

tests/test_chinese.py (new file, 48 lines)

@@ -0,0 +1,48 @@
from nose.tools import eq_, assert_almost_equal, assert_greater
from wordfreq import tokenize, word_frequency
def test_tokens():
# Let's test on some Chinese text that has unusual combinations of
# syllables, because it is about an American vice-president.
#
# (He was the Chinese Wikipedia's featured article of the day when I
# wrote this test.)
hobart = '加勒特·霍巴特' # Garret Hobart, or "jiā lè tè huò bā tè".
# He was the sixth American vice president to die in office.
fact_simplified = '他是历史上第六位在任期内去世的美国副总统。'
fact_traditional = '他是歷史上第六位在任期內去世的美國副總統。'
# His name breaks into five pieces, with the only piece staying together
# being the one that means 'Bart'. The dot is not included as a token.
eq_(
tokenize(hobart, 'zh'),
['加', '勒', '特', '霍', '巴特']
)
eq_(
tokenize(fact_simplified, 'zh'),
[
# he / is / in history / #6 / counter for people
'他', '是', '历史上', '第六', '位',
# during / term of office / in / die
'在', '任期', '内', '去世',
# of / U.S. / deputy / president
'的', '美国', '副', '总统'
]
)
# You get the same tokens if you look up the Traditional Chinese version.
eq_(tokenize(fact_simplified, 'zh'), tokenize(fact_traditional, 'zh'))
assert_greater(word_frequency(fact_traditional, 'zh'), 0)
def test_combination():
xiexie_freq = word_frequency('谢谢', 'zh') # "Thanks"
assert_almost_equal(
word_frequency('谢谢谢谢', 'zh'),
xiexie_freq / 2
)
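The factor of 2 in this test follows from how wordfreq combines frequencies for multi-token input: as I understand the combination rule in this version, the frequency of a phrase is the reciprocal of the sum of the reciprocals of its token frequencies, so a phrase made of the same token twice comes out at exactly half that token's frequency. A hedged illustration of that rule (the frequency value is made up):

# Assumed combination rule; not a literal copy of wordfreq's implementation.
def combined_freq(token_freqs):
    return 1.0 / sum(1.0 / f for f in token_freqs)

xiexie = 5.0e-4                          # made-up frequency for '谢谢'
print(combined_freq([xiexie, xiexie]))   # 0.00025, i.e. xiexie / 2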


@@ -3,17 +3,17 @@ from wordfreq._chinese_mapping import SIMPLIFIED_MAP
import jieba
jieba_initialized = False
jieba_tokenizer = None
DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
def simplify_chinese(text):
return text.translate(SIMPLIFIED_MAP).casefold()
def chinese_tokenize(text):
global jieba_initialized
if not jieba_initialized:
jieba.set_dictionary(resource_filename('wordfreq', 'data/jieba.txt'))
jieba_initialized = True
return list(jieba.cut(simplify_chinese(text)))
def jieba_tokenize(text):
global jieba_tokenizer
if jieba_tokenizer is None:
jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
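The new jieba_tokenize keeps a module-level Tokenizer so the dictionary (wordfreq's own jieba_zh.txt, generated by the build steps below) is loaded only on first use, and HMM=False stops jieba from guessing words that aren't in that dictionary. A rough usage sketch, with output based on the tokens the new tests expect:

from wordfreq.chinese import jieba_tokenize

# Traditional characters are mapped to Simplified before segmentation.
print(jieba_tokenize('他是美國副總統'))
# roughly: ['他', '是', '美国', '副', '总统']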

wordfreq/data/jieba_zh.txt (new file, 31915 lines)
File diff suppressed because it is too large.


@@ -118,13 +118,16 @@ def tokenize(text, lang):
global mecab_tokenize
if mecab_tokenize is None:
from wordfreq.japanese import mecab_tokenize
return mecab_tokenize(text)
tokens = mecab_tokenize(text)
return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
if lang == 'zh':
global jieba_tokenize
if jieba_tokenize is None:
from wordfreq.chinese import jieba_tokenize
return jieba_tokenize(text)
tokens = jieba_tokenize(text)
return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
if lang == 'tr':
return turkish_tokenize(text)
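Both CJK branches now post-process the external tokenizer's output the same way: casefold each token and keep only tokens that TOKEN_RE recognizes as word-like, which is presumably why the '·' separator in the Hobart test never shows up as a token. A small sketch of that filtering step, using a simplified stand-in pattern rather than the real TOKEN_RE:

import regex

# Hypothetical stand-in; the real TOKEN_RE in wordfreq/__init__.py is more involved.
WORDISH_RE = regex.compile(r'\p{L}+')

raw_tokens = ['加', '勒', '特', '·', '霍', '巴特']
tokens = [t.casefold() for t in raw_tokens if WORDISH_RE.match(t)]
print(tokens)   # ['加', '勒', '特', '霍', '巴特']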

Binary file not shown. Size: 1.9 MiB before, 1.9 MiB after.


@@ -67,6 +67,13 @@ rule convert_opensubtitles
rule convert_subtlex
command = cut -f $textcol,$freqcol $in | tail -n +$startrow | ftfy | tr ' ",' ', ' | grep -v 'â,' > $out
rule convert_jieba
command = cut -d ' ' -f 1,2 $in | grep -v '[,"]' | tr ' ' ',' > $out
rule counts_to_jieba
command = python -m wordfreq_builder.cli.counts_to_jieba $in $out
# Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
# the input files, keep only the single words and their counts, and only keep
# lines with counts of 100 or more.
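The new convert_jieba rule reshapes jieba's source dictionary, whose lines look like 'word count part-of-speech', into the 'word,count' form the rest of the builder expects, skipping any line that contains a comma or a double quote. Roughly the same transformation in Python, with illustrative file names:

# Rough equivalent of:  cut -d ' ' -f 1,2 $in | grep -v '[,"]' | tr ' ' ','
with open('dict.txt.big', encoding='utf-8') as infile, \
        open('jieba_zh.counts.txt', 'w', encoding='utf-8') as outfile:
    for line in infile:
        first_two = ' '.join(line.rstrip('\n').split(' ')[:2])   # cut -d ' ' -f 1,2
        if ',' in first_two or '"' in first_two:                 # grep -v '[,"]'
            continue
        outfile.write(first_two.replace(' ', ',') + '\n')        # tr ' ' ','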


@@ -0,0 +1,15 @@
from wordfreq_builder.word_counts import read_values, write_jieba
import argparse
def handle_counts(filename_in, filename_out):
freqs, total = read_values(filename_in, cutoff=1e-6)
write_jieba(freqs, filename_out)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('filename_in', help='name of input wordlist')
parser.add_argument('filename_out', help='name of output Jieba-compatible wordlist')
args = parser.parse_args()
handle_counts(args.filename_in, args.filename_out)
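This is the script the counts_to_jieba ninja rule above invokes, effectively python -m wordfreq_builder.cli.counts_to_jieba <combined zh wordlist> <jieba-format output>; the cutoff of 1e-6 presumably trims the exported dictionary to words with a frequency of at least one in a million.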


@@ -41,6 +41,7 @@ CONFIG = {
],
'subtlex-en': ['en'],
'subtlex-other': ['de', 'nl', 'zh'],
'jieba': ['zh']
},
# Subtlex languages that need to be pre-processed
'wordlist_paths': {
@@ -51,9 +52,11 @@ CONFIG = {
'google-books': 'generated/google-books/google_books_{lang}.{ext}',
'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}',
'jieba': 'generated/jieba/jieba_{lang}.{ext}',
'combined': 'generated/combined/combined_{lang}.{ext}',
'combined-dist': 'dist/combined_{lang}.{ext}',
'twitter-dist': 'dist/twitter_{lang}.{ext}'
'twitter-dist': 'dist/twitter_{lang}.{ext}',
'jieba-dist': 'dist/jieba_{lang}.{ext}'
},
'min_sources': 2
}
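These entries follow the existing convention in wordlist_paths: each value is a template filled in with the language code and a file extension. A quick illustration of how the new jieba templates would expand, assuming straightforward str.format substitution:

# Illustrative only; the extensions are whatever the build steps request.
print('generated/jieba/jieba_{lang}.{ext}'.format(lang='zh', ext='counts.txt'))
# generated/jieba/jieba_zh.counts.txt
print('dist/jieba_{lang}.{ext}'.format(lang='zh', ext='txt'))
# dist/jieba_zh.txt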


@@ -3,6 +3,7 @@ from wordfreq_builder.config import (
)
import sys
import pathlib
import itertools
HEADER = """# This file is automatically generated. Do not edit it.
# You can change its behavior by editing wordfreq_builder/ninja.py,
@@ -45,51 +46,43 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
# The first dependency is to make sure the build file is up to date.
add_dep(lines, 'build_deps', 'rules.ninja', 'build.ninja',
extra='wordfreq_builder/ninja.py')
lines.extend(
lines.extend(itertools.chain(
twitter_deps(
data_filename('raw-input/twitter/all-2014.txt'),
slice_prefix=data_filename('slices/twitter/tweets-2014'),
combined_prefix=data_filename('generated/twitter/tweets-2014'),
slices=40,
languages=CONFIG['sources']['twitter']
)
)
lines.extend(
),
wikipedia_deps(
data_filename('raw-input/wikipedia'),
CONFIG['sources']['wikipedia']
)
)
lines.extend(
),
google_books_deps(
data_filename('raw-input/google-books')
)
)
lines.extend(
),
leeds_deps(
data_filename('source-lists/leeds'),
CONFIG['sources']['leeds']
)
)
lines.extend(
),
opensubtitles_deps(
data_filename('source-lists/opensubtitles'),
CONFIG['sources']['opensubtitles']
)
)
lines.extend(
),
subtlex_en_deps(
data_filename('source-lists/subtlex'),
CONFIG['sources']['subtlex-en']
)
)
lines.extend(
),
subtlex_other_deps(
data_filename('source-lists/subtlex'),
CONFIG['sources']['subtlex-other']
)
)
lines.extend(combine_lists(all_languages()))
),
jieba_deps(
data_filename('source-lists/jieba'),
CONFIG['sources']['jieba']
),
combine_lists(all_languages())
))
print('\n'.join(lines), file=out)
@@ -189,8 +182,14 @@ def leeds_deps(dirname_in, languages):
input_file = '{prefix}/internet-{lang}-forms.num'.format(
prefix=dirname_in, lang=language
)
if language == 'zh':
step2_file = wordlist_filename('leeds', 'zh-Hans', 'converted.txt')
add_dep(lines, 'simplify_chinese', input_file, step2_file)
else:
step2_file = input_file
reformatted_file = wordlist_filename('leeds', language, 'counts.txt')
add_dep(lines, 'convert_leeds', input_file, reformatted_file)
add_dep(lines, 'convert_leeds', step2_file, reformatted_file)
return lines
@@ -201,14 +200,37 @@ def opensubtitles_deps(dirname_in, languages):
input_file = '{prefix}/{lang}.txt'.format(
prefix=dirname_in, lang=language
)
if language == 'zh':
step2_file = wordlist_filename('opensubtitles', 'zh-Hans', 'converted.txt')
add_dep(lines, 'simplify_chinese', input_file, step2_file)
else:
step2_file = input_file
reformatted_file = wordlist_filename(
'opensubtitles', language, 'counts.txt'
)
add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file)
add_dep(lines, 'convert_opensubtitles', step2_file, reformatted_file)
return lines
def jieba_deps(dirname_in, languages):
lines = []
# Either jieba is turned off, or it's just in Chinese
if not languages:
return lines
assert languages == ['zh']
input_file = '{prefix}/dict.txt.big'.format(prefix=dirname_in)
transformed_file = wordlist_filename(
'jieba', 'zh-Hans', 'converted.txt'
)
reformatted_file = wordlist_filename(
'jieba', 'zh', 'counts.txt'
)
add_dep(lines, 'simplify_chinese', input_file, transformed_file)
add_dep(lines, 'convert_jieba', transformed_file, reformatted_file)
return lines
# Which columns of the SUBTLEX data files do the word and its frequency appear
# in?
SUBTLEX_COLUMN_MAP = {
@@ -222,6 +244,9 @@ SUBTLEX_COLUMN_MAP = {
def subtlex_en_deps(dirname_in, languages):
lines = []
# Either subtlex_en is turned off, or it's just in English
if not languages:
return lines
assert languages == ['en']
regions = ['en-US', 'en-GB']
processed_files = []
@@ -259,8 +284,14 @@ def subtlex_other_deps(dirname_in, languages):
else:
startrow = 2
if language == 'zh':
step2_file = wordlist_filename('subtlex-other', 'zh-Hans', 'converted.txt')
add_dep(lines, 'simplify_chinese', input_file, step2_file)
else:
step2_file = input_file
add_dep(
lines, 'convert_subtlex', input_file, processed_file,
lines, 'convert_subtlex', step2_file, processed_file,
params={'textcol': textcol, 'freqcol': freqcol, 'startrow': startrow}
)
add_dep(
@@ -301,6 +332,12 @@ def combine_lists(languages):
lines.append('default {}'.format(output_cBpack))
# Write a Jieba-compatible frequency file for Chinese tokenization
chinese_combined = wordlist_filename('combined', 'zh')
jieba_output = wordlist_filename('jieba-dist', 'zh')
add_dep(lines, 'counts_to_jieba', chinese_combined, jieba_output,
extra=['wordfreq_builder/word_counts.py', 'wordfreq_builder/cli/counts_to_jieba.py'])
lines.append('default {}'.format(jieba_output))
return lines
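Taken together, the Chinese additions form two build chains: source-lists/jieba/dict.txt.big → simplify_chinese → convert_jieba → jieba counts, which feed into the combined zh wordlist alongside the other sources; and combined zh wordlist → counts_to_jieba → dist/jieba_zh.txt, which presumably becomes the wordfreq/data/jieba_zh.txt dictionary that wordfreq/chinese.py loads at runtime.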


@@ -12,6 +12,7 @@ import regex
# Match common cases of URLs: the scheme http:// or https:// followed by
# non-whitespace characters.
URL_RE = regex.compile(r'https?://(?:\S)+')
HAN_RE = regex.compile(r'[\p{Script=Han}]+')
def count_tokens(filename):
@@ -162,3 +163,19 @@ def write_wordlist(freqs, filename, cutoff=1e-8):
break
if not ('"' in word or ',' in word):
writer.writerow([word, str(freq)])
def write_jieba(freqs, filename):
"""
Write a dictionary of frequencies in a format that can be used for Jieba
tokenization of Chinese.
"""
with open(filename, 'w', encoding='utf-8', newline='\n') as outfile:
items = sorted(freqs.items(), key=itemgetter(1), reverse=True)
for word, freq in items:
if HAN_RE.search(word):
# Only store this word as a token if it contains at least one
# Han character.
fake_count = round(freq * 1e9)
print('%s %d' % (word, fake_count), file=outfile)
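write_jieba produces jieba's plain 'word pseudo-count' dictionary format, scaling the frequencies up to integers and dropping anything without a Han character. A small example of what it would write (the frequencies are made up):

# Made-up frequencies; 'ok' is skipped because it contains no Han character.
write_jieba({'谢谢': 5e-4, '美国': 2e-4, 'ok': 3e-4}, 'jieba_example.txt')
# jieba_example.txt then contains, most frequent first:
#   谢谢 500000
#   美国 200000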