Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 17:31:41 +00:00

Merge pull request #28 from LuminosoInsight/chinese-external-wordlist

Add some tokenizer options

This commit is contained in commit ca00dfa1d9.

Changed files shown below include README.md (19 lines changed).
@@ -15,13 +15,26 @@ or by getting the repository and running its setup.py:
 
     python3 setup.py install
 
-To handle word frequency lookups in Japanese, you need to additionally install
-mecab-python3, which itself depends on libmecab-dev. These commands will
-install them on Ubuntu:
+Japanese and Chinese have additional external dependencies so that they can be
+tokenized correctly.
+
+To be able to look up word frequencies in Japanese, you need to additionally
+install mecab-python3, which itself depends on libmecab-dev and its dictionary.
+These commands will install them on Ubuntu:
 
     sudo apt-get install mecab-ipadic-utf8 libmecab-dev
     pip3 install mecab-python3
 
+To be able to look up word frequencies in Chinese, you need Jieba, a
+pure-Python Chinese tokenizer:
+
+    pip3 install jieba
+
+These dependencies can also be requested as options when installing wordfreq.
+For example:
+
+    pip3 install wordfreq[mecab,jieba]
+
 ## Usage
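Once wordfreq is installed with the optional MeCab and Jieba dependencies described above, the Japanese and Chinese lookups should work through the ordinary API. A minimal sketch, assuming the `[mecab,jieba]` extras are installed; the printed checks mirror what the test suite asserts, and exact frequency values will vary:

    # Quick sanity check of the optional Japanese/Chinese support.
    from wordfreq import word_frequency

    # '笑' is the character the tests in this commit look up for both languages;
    # a frequency greater than zero means the tokenizer dependencies loaded.
    print(word_frequency('笑', 'ja') > 0)   # expected: True
    print(word_frequency('笑', 'zh') > 0)   # expected: True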
@@ -21,17 +21,19 @@ def test_languages():
     avail = available_languages()
     assert_greater(len(avail), 15)
 
-    # Laughter is the universal language
+    # Laughter is the universal language. Look up either 'lol' or '笑' in each
+    # language and make sure it has a non-zero frequency.
     for lang in avail:
-        if lang not in {'zh', 'ja'}:
-            # we do not have enough Chinese data
-            # Japanese people do not lol
-            assert_greater(word_frequency('lol', lang), 0)
+        if lang in {'zh', 'ja'}:
+            text = '笑'
+        else:
+            text = 'lol'
+        assert_greater(word_frequency(text, lang), 0)
 
         # Make up a weirdly verbose language code and make sure
         # we still get it
         new_lang_code = '%s-001-x-fake-extension' % lang.upper()
-        assert_greater(word_frequency('lol', new_lang_code), 0)
+        assert_greater(word_frequency(text, new_lang_code), 0)
 
 
 def test_twitter():
@@ -99,6 +101,9 @@ def test_tokenization():
     eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
         ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
 
+    eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
+        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
+
     # Certain punctuation does not inherently split a word.
     eq_(tokenize("Anything is possible at zombo.com", 'en'),
         ['anything', 'is', 'possible', 'at', 'zombo.com'])
@@ -108,6 +113,9 @@ def test_tokenization():
 
     eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])
 
+    eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
+        ['this', 'text', 'has', '...', 'punctuation', ':)'])
+
 
 def test_casefolding():
     eq_(tokenize('WEISS', 'de'), ['weiss'])
@@ -34,6 +34,27 @@ def test_tokens():
         ]
     )
 
+    # Jieba's original tokenizer knows a lot of names, it seems.
+    eq_(
+        tokenize(hobart, 'zh', external_wordlist=True),
+        ['加勒特', '霍巴特']
+    )
+
+    # We get almost the same tokens from the sentence using Jieba's own
+    # wordlist, but it tokenizes "in history" as two words and
+    # "sixth person" as one.
+    eq_(
+        tokenize(fact_simplified, 'zh', external_wordlist=True),
+        [
+            # he / is / history / in / sixth person
+            '他', '是', '历史', '上', '第六位',
+            # during / term of office / in / die
+            '在', '任期', '内', '去世',
+            # of / U.S. / deputy / president
+            '的', '美国', '副', '总统'
+        ]
+    )
+
     # You match the same tokens if you look it up in Traditional Chinese.
     eq_(tokenize(fact_simplified, 'zh'), tokenize(fact_traditional, 'zh'))
     assert_greater(word_frequency(fact_traditional, 'zh'), 0)
@@ -4,9 +4,11 @@ import msgpack
 import gzip
 
 DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
+ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')
 SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
 SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8')
 jieba_tokenizer = None
+jieba_orig_tokenizer = None
 
 
 def simplify_chinese(text):
@@ -23,17 +25,28 @@ def simplify_chinese(text):
     return text.translate(SIMPLIFIED_MAP).casefold()
 
 
-def jieba_tokenize(text):
+def jieba_tokenize(text, external_wordlist=False):
     """
     Tokenize the given text into tokens whose word frequencies can probably
     be looked up. This uses Jieba, a word-frequency-based tokenizer.
 
-    We tell Jieba to default to using wordfreq's own Chinese wordlist, and not
-    to infer unknown words using a hidden Markov model. This ensures that the
-    multi-character tokens that it outputs will be ones whose word frequencies
-    we can look up.
+    If `external_wordlist` is False, we tell Jieba to default to using
+    wordfreq's own Chinese wordlist, and not to infer unknown words using a
+    hidden Markov model. This ensures that the multi-character tokens that it
+    outputs will be ones whose word frequencies we can look up.
+
+    If `external_wordlist` is True, this will use the largest version of
+    Jieba's original dictionary, with HMM enabled, so its results will be
+    independent of the data in wordfreq. These results will be better optimized
+    for purposes that aren't looking up word frequencies, such as general-
+    purpose tokenization, or collecting word frequencies in the first place.
     """
-    global jieba_tokenizer
+    global jieba_tokenizer, jieba_orig_tokenizer
+    if external_wordlist:
+        if jieba_orig_tokenizer is None:
+            jieba_orig_tokenizer = jieba.Tokenizer(dictionary=ORIG_DICT_FILENAME)
+        return jieba_orig_tokenizer.lcut(text)
+    else:
         if jieba_tokenizer is None:
             jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
         return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
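A hedged sketch of the two code paths in `jieba_tokenize`. The sample sentence is assembled from the tokens listed in the Chinese test above; the exact output of the default path isn't asserted here, only described:

    # Requires the jieba dependency; wordfreq.chinese is the module in this diff.
    from wordfreq.chinese import jieba_tokenize

    fact = '他是历史上第六位在任期内去世的美国副总统'

    # Default path: wordfreq's own dictionary, HMM disabled, text simplified
    # first, so every multi-character token should be in wordfreq's wordlist.
    print(jieba_tokenize(fact))

    # External path: Jieba's original, larger dictionary with HMM enabled.
    # The test above expects tokens such as '历史', '上' and '第六位' here.
    print(jieba_tokenize(fact, external_wordlist=True))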
wordfreq/data/jieba_zh_orig.txt (new file, 349046 lines added)
File diff suppressed because it is too large.
@@ -38,10 +38,16 @@ TOKEN_RE = regex.compile(r"""
     (?:\B\S)*
 """, regex.V1 | regex.WORD | regex.VERBOSE)
 
+TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
+    [\p{IsIdeo}\p{Script=Hiragana}]+ |
+    [\p{punct}]+ |
+    \S(?:\B\S)*
+""", regex.V1 | regex.WORD | regex.VERBOSE)
+
 ARABIC_MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
 
 
-def simple_tokenize(text):
+def simple_tokenize(text, include_punctuation=False):
     """
     Tokenize the given text using a straightforward, Unicode-aware token
     expression.
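The new `TOKEN_RE_WITH_PUNCTUATION` pattern is selected by the `include_punctuation` flag added to `simple_tokenize` in the next hunk. A small sketch, assuming these names live in the `wordfreq.tokens` module shown in this diff; the expected output with punctuation comes from the test above, while the default output is left unasserted:

    from wordfreq.tokens import simple_tokenize

    s = 'this text has... punctuation :)'

    # Default: TOKEN_RE filters out punctuation-only tokens.
    print(simple_tokenize(s))

    # include_punctuation=True switches to TOKEN_RE_WITH_PUNCTUATION;
    # the test expects ['this', 'text', 'has', '...', 'punctuation', ':)'].
    print(simple_tokenize(s, include_punctuation=True))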
@@ -56,22 +62,46 @@ def simple_tokenize(text):
     ideograms and hiragana) relatively untokenized, instead of splitting each
     character into its own token.
 
-    - It outputs only the tokens that start with a word-like character, or
-      miscellaneous symbols such as emoji.
+    - If `include_punctuation` is False (the default), it outputs only the
+      tokens that start with a word-like character, or miscellaneous symbols
+      such as emoji. If `include_punctuation` is True, it outputs all non-space
+      tokens.
 
     - It breaks on all spaces, even the "non-breaking" ones.
     """
     text = unicodedata.normalize('NFC', text)
-    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.strip("'").casefold() for token in token_expr.findall(text)]
 
 
-def turkish_tokenize(text):
+def turkish_tokenize(text, include_punctuation=False):
     """
     Like `simple_tokenize`, but modifies i's so that they case-fold correctly
     in Turkish.
     """
     text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
-    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.strip("'").casefold() for token in token_expr.findall(text)]
+
+
+mecab_tokenize = None
+def japanese_tokenize(text, include_punctuation=False):
+    global mecab_tokenize
+    if mecab_tokenize is None:
+        from wordfreq.japanese import mecab_tokenize
+    tokens = mecab_tokenize(text)
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.casefold() for token in tokens if token_expr.match(token)]
+
+
+jieba_tokenize = None
+def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
+    global jieba_tokenize
+    if jieba_tokenize is None:
+        from wordfreq.chinese import jieba_tokenize
+    tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.casefold() for token in tokens if token_expr.match(token)]
 
 
 def remove_arabic_marks(text):
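For the Turkish path, the docstring above says capital İ and I are remapped before case folding. A sketch under that assumption; 'İstanbul' is a hypothetical input chosen to exercise the mapping, not taken from the tests in this diff:

    from wordfreq.tokens import turkish_tokenize

    # 'İ' is replaced with 'i' before casefolding, so no stray combining
    # dot ends up in the token, unlike what a plain str.casefold() would leave.
    print(turkish_tokenize('İstanbul'))   # expected: ['istanbul']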
@@ -86,53 +116,53 @@ def remove_arabic_marks(text):
     return ARABIC_MARK_RE.sub('', text)
 
 
-mecab_tokenize = None
-jieba_tokenize = None
-
-def tokenize(text, lang):
+def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     """
     Tokenize this text in a way that's relatively simple but appropriate for
-    the language.
+    the language. Strings that are looked up in wordfreq will be run through
+    this function first, so that they can be expected to match the data.
 
-    So far, this means:
+    Here is what the tokenizer will do, depending on the language:
+
+    - Chinese will be mapped to Simplified Chinese characters and tokenized
+      using the Jieba tokenizer, trained on a custom word list of words that
+      can be looked up in wordfreq.
+
+    - Japanese will be delegated to the external mecab-python module. It will
+      be NFKC normalized, which is stronger than NFC normalization.
 
-    - Chinese is presumed to already be tokenized. (Sorry. It's hard.)
-    - Japanese will be delegated to the external mecab-python module.
     - Chinese or Japanese texts that aren't identified as the appropriate
       language will only split on punctuation and script boundaries, giving
      you untokenized globs of characters that probably represent many words.
+
+    - Arabic will be NFKC normalized, and will have Arabic-specific combining
+      marks and tatweels removed.
+
+    - Languages written in cased alphabets will be case-folded to lowercase.
+
     - Turkish will use a different case-folding procedure, so that capital
       I and İ map to ı and i respectively.
-    - All other languages will be tokenized using a regex that mostly
-      implements the Word Segmentation section of Unicode Annex #29.
-      See `simple_tokenize` for details.
 
-    Additionally, the text will be case-folded to lowercase, and text marked
-    as Arabic will be normalized more strongly and have combining marks and
-    tatweels removed.
+    - Languages besides Japanese and Chinese will be tokenized using a regex
+      that mostly implements the Word Segmentation section of Unicode Annex
+      #29. See `simple_tokenize` for details.
 
-    Strings that are looked up in wordfreq will be run through this function
-    first, so that they can be expected to match the data.
+    The `external_wordlist` option only affects Chinese tokenization. If it's
+    True, then wordfreq will not use its own Chinese wordlist for tokenization.
+    Instead, it will use the large wordlist packaged with the Jieba tokenizer,
+    and it will leave Traditional Chinese characters as is. This will probably
+    give more accurate tokenization, but the resulting tokens won't necessarily
+    have word frequencies that can be looked up.
     """
     if lang == 'ja':
-        global mecab_tokenize
-        if mecab_tokenize is None:
-            from wordfreq.japanese import mecab_tokenize
-        tokens = mecab_tokenize(text)
-        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
-
-    if lang == 'zh':
-        global jieba_tokenize
-        if jieba_tokenize is None:
-            from wordfreq.chinese import jieba_tokenize
-        tokens = jieba_tokenize(text)
-        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
-
-
-    if lang == 'tr':
-        return turkish_tokenize(text)
-
-    if lang == 'ar':
+        return japanese_tokenize(text, include_punctuation)
+    elif lang == 'zh':
+        return chinese_tokenize(text, include_punctuation, external_wordlist)
+    elif lang == 'tr':
+        return turkish_tokenize(text, include_punctuation)
+    elif lang == 'ar':
         text = remove_arabic_marks(unicodedata.normalize('NFKC', text))
-
-    return simple_tokenize(text)
+        return simple_tokenize(text, include_punctuation)
+    else:
+        return simple_tokenize(text, include_punctuation)
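Pulling the pieces together, the top-level `tokenize` function now threads both new options through to the per-language tokenizers. A short sketch using the public API, with expectations taken from the tests earlier in this commit:

    from wordfreq import tokenize

    # Cased alphabets are case-folded (see test_casefolding above).
    print(tokenize('WEISS', 'de'))   # ['weiss']

    # include_punctuation is passed through to the English tokenizer.
    print(tokenize("I don't split at apostrophes, you see.", 'en',
                   include_punctuation=True))
    # -> ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.']

    # external_wordlist only affects Chinese: it uses Jieba's own dictionary
    # and leaves Traditional Chinese characters as they are.
    print(tokenize('他是历史上第六位在任期内去世的美国副总统', 'zh',
                   external_wordlist=True))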
@@ -63,7 +63,7 @@ rule convert_opensubtitles
 
 # To convert SUBTLEX, we take the 1st and Nth columns, strip the header,
 # run it through ftfy, convert tabs to commas and spurious CSV formatting to
-# and remove lines with unfixable half-mojibake.
+# spaces, and remove lines with unfixable half-mojibake.
 rule convert_subtlex
   command = cut -f $textcol,$freqcol $in | tail -n +$startrow | ftfy | tr ' ",' ', ' | grep -v 'â,' > $out
 