Merge pull request #28 from LuminosoInsight/chinese-external-wordlist

Add some tokenizer options
2024-12-23 09:21:37 +00:00 · 2015-10-19 18:21:52 -04:00 · 2015-10-19 18:21:52 -04:00 · ca00dfa1d9
commit ca00dfa1d9
parent 15d99be21b a6b6aa07e7
7 changed files with 349200 additions and 69 deletions
--- a/README.md
+++ b/README.md
@@ -15,13 +15,26 @@ or by getting the repository and running its setup.py:

    python3 setup.py install

-To handle word frequency lookups in Japanese, you need to additionally install
-mecab-python3, which itself depends on libmecab-dev. These commands will
-install them on Ubuntu:
+Japanese and Chinese have additional external dependencies so that they can be
+tokenized correctly.
+
+To be able to look up word frequencies in Japanese, you need to additionally
+install mecab-python3, which itself depends on libmecab-dev and its dictionary.
+These commands will install them on Ubuntu:

    sudo apt-get install mecab-ipadic-utf8 libmecab-dev
    pip3 install mecab-python3

+To be able to look up word frequencies in Chinese, you need Jieba, a
+pure-Python Chinese tokenizer:
+
+    pip3 install jieba
+
+These dependencies can also be requested as options when installing wordfreq.
+For example:
+
+    pip3 install wordfreq[mecab,jieba]
+

 ## Usage

--- a/tests/test.py
+++ b/tests/test.py
@@ -21,17 +21,19 @@ def test_languages():
    avail = available_languages()
    assert_greater(len(avail), 15)

-    # Laughter is the universal language
+    # Laughter is the universal language. Look up either 'lol' or '笑' in each
+    # language and make sure it has a non-zero frequency.
    for lang in avail:
-        if lang not in {'zh', 'ja'}:
-            # we do not have enough Chinese data
-            # Japanese people do not lol
-            assert_greater(word_frequency('lol', lang), 0)
+        if lang in {'zh', 'ja'}:
+            text = '笑'
+        else:
+            text = 'lol'
+        assert_greater(word_frequency(text, lang), 0)

        # Make up a weirdly verbose language code and make sure
        # we still get it
        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
-            assert_greater(word_frequency('lol', new_lang_code), 0)
+        assert_greater(word_frequency(text, new_lang_code), 0)


 def test_twitter():
@@ -99,6 +101,9 @@ def test_tokenization():
    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
    
+    eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
+        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
+
    # Certain punctuation does not inherently split a word.
    eq_(tokenize("Anything is possible at zombo.com", 'en'),
        ['anything', 'is', 'possible', 'at', 'zombo.com'])
@@ -108,6 +113,9 @@ def test_tokenization():

    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])

+    eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
+        ['this', 'text', 'has', '...', 'punctuation', ':)'])
+

 def test_casefolding():
    eq_(tokenize('WEISS', 'de'), ['weiss'])
--- a/tests/test_chinese.py
+++ b/tests/test_chinese.py
@@ -34,6 +34,27 @@ def test_tokens():
        ]
    )

+    # Jieba's original tokenizer knows a lot of names, it seems.
+    eq_(
+        tokenize(hobart, 'zh', external_wordlist=True),
+        ['加勒特', '霍巴特']
+    )
+
+    # We get almost the same tokens from the sentence using Jieba's own
+    # wordlist, but it tokenizes "in history" as two words and
+    # "sixth person" as one.
+    eq_(
+        tokenize(fact_simplified, 'zh', external_wordlist=True),
+        [
+            # he / is / history / in / sixth person
+            '他', '是', '历史', '上', '第六位',
+            # during / term of office / in / die
+            '在', '任期', '内', '去世',
+            # of / U.S. / deputy / president
+            '的', '美国', '副', '总统'
+        ]
+    )
+
    # You match the same tokens if you look it up in Traditional Chinese.
    eq_(tokenize(fact_simplified, 'zh'), tokenize(fact_traditional, 'zh'))
    assert_greater(word_frequency(fact_traditional, 'zh'), 0)
--- a/wordfreq/chinese.py
+++ b/wordfreq/chinese.py
@@ -4,9 +4,11 @@ import msgpack
 import gzip

 DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
+ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')
 SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
 SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), encoding='utf-8')
 jieba_tokenizer = None
+jieba_orig_tokenizer = None


 def simplify_chinese(text):
@@ -23,17 +25,28 @@ def simplify_chinese(text):
    return text.translate(SIMPLIFIED_MAP).casefold()


-def jieba_tokenize(text):
+def jieba_tokenize(text, external_wordlist=False):
    """
    Tokenize the given text into tokens whose word frequencies can probably
    be looked up. This uses Jieba, a word-frequency-based tokenizer.

-    We tell Jieba to default to using wordfreq's own Chinese wordlist, and not
-    to infer unknown words using a hidden Markov model. This ensures that the
-    multi-character tokens that it outputs will be ones whose word frequencies
-    we can look up.
+    If `external_wordlist` is False, we tell Jieba to default to using
+    wordfreq's own Chinese wordlist, and not to infer unknown words using a
+    hidden Markov model. This ensures that the multi-character tokens that it
+    outputs will be ones whose word frequencies we can look up.
+
+    If `external_wordlist` is True, this will use the largest version of
+    Jieba's original dictionary, with HMM enabled, so its results will be
+    independent of the data in wordfreq. These results will be better optimized
+    for purposes that aren't looking up word frequencies, such as general-
+    purpose tokenization, or collecting word frequencies in the first place.
    """
-    global jieba_tokenizer
+    global jieba_tokenizer, jieba_orig_tokenizer
+    if external_wordlist:
+        if jieba_orig_tokenizer is None:
+            jieba_orig_tokenizer = jieba.Tokenizer(dictionary=ORIG_DICT_FILENAME)
+        return jieba_orig_tokenizer.lcut(text)
+    else:
        if jieba_tokenizer is None:
            jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
        return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
--- a/wordfreq/data/jieba_zh_orig.txt
+++ b/wordfreq/data/jieba_zh_orig.txt
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -38,10 +38,16 @@ TOKEN_RE = regex.compile(r"""
    (?:\B\S)*
 """, regex.V1 | regex.WORD | regex.VERBOSE)

+TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
+    [\p{IsIdeo}\p{Script=Hiragana}]+ |
+    [\p{punct}]+ |
+    \S(?:\B\S)*
+""", regex.V1 | regex.WORD | regex.VERBOSE)
+
 ARABIC_MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)


-def simple_tokenize(text):
+def simple_tokenize(text, include_punctuation=False):
    """
    Tokenize the given text using a straightforward, Unicode-aware token
    expression.
@@ -56,22 +62,46 @@ def simple_tokenize(text):
      ideograms and hiragana) relatively untokenized, instead of splitting each
      character into its own token.

-    - It outputs only the tokens that start with a word-like character, or
-      miscellaneous symbols such as emoji.
+    - If `include_punctuation` is False (the default), it outputs only the
+      tokens that start with a word-like character, or miscellaneous symbols
+      such as emoji. If `include_punctuation` is True, it outputs all non-space
+      tokens.

    - It breaks on all spaces, even the "non-breaking" ones.
    """
    text = unicodedata.normalize('NFC', text)
-    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.strip("'").casefold() for token in token_expr.findall(text)]


-def turkish_tokenize(text):
+def turkish_tokenize(text, include_punctuation=False):
    """
    Like `simple_tokenize`, but modifies i's so that they case-fold correctly
    in Turkish.
    """
    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
-    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.strip("'").casefold() for token in token_expr.findall(text)]
+
+
+mecab_tokenize = None
+def japanese_tokenize(text, include_punctuation=False):
+    global mecab_tokenize
+    if mecab_tokenize is None:
+        from wordfreq.japanese import mecab_tokenize
+    tokens = mecab_tokenize(text)
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.casefold() for token in tokens if token_expr.match(token)]
+
+
+jieba_tokenize = None
+def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
+    global jieba_tokenize
+    if jieba_tokenize is None:
+        from wordfreq.chinese import jieba_tokenize
+    tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
+    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
+    return [token.casefold() for token in tokens if token_expr.match(token)]


 def remove_arabic_marks(text):
@@ -86,53 +116,53 @@ def remove_arabic_marks(text):
    return ARABIC_MARK_RE.sub('', text)


-mecab_tokenize = None
-jieba_tokenize = None
-def tokenize(text, lang):
+def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
    """
    Tokenize this text in a way that's relatively simple but appropriate for
-    the language.
+    the language. Strings that are looked up in wordfreq will be run through
+    this function first, so that they can be expected to match the data.

-    So far, this means:
+    Here is what the tokenizer will do, depending on the language:
+
+    - Chinese will be mapped to Simplified Chinese characters and tokenized
+      using the Jieba tokenizer, configured with a custom list of words that
+      can be looked up in wordfreq.
+
+    - Japanese will be delegated to the external mecab-python module. It will
+      be NFKC normalized, which is stronger than NFC normalization.

-    - Chinese is presumed to already be tokenized. (Sorry. It's hard.)
-    - Japanese will be delegated to the external mecab-python module.
    - Chinese or Japanese texts that aren't identified as the appropriate
      language will only split on punctuation and script boundaries, giving
      you untokenized globs of characters that probably represent many words.
+
+    - Arabic will be NFKC normalized, and will have Arabic-specific combining
+      marks and tatweels removed.
+
+    - Languages written in cased alphabets will be case-folded to lowercase.
+
    - Turkish will use a different case-folding procedure, so that capital
      I and İ map to ı and i respectively.
-    - All other languages will be tokenized using a regex that mostly
-      implements the Word Segmentation section of Unicode Annex #29.
-      See `simple_tokenize` for details.

-    Additionally, the text will be case-folded to lowercase, and text marked
-    as Arabic will be normalized more strongly and have combining marks and
-    tatweels removed.
+    - Languages besides Japanese and Chinese will be tokenized using a regex
+      that mostly implements the Word Segmentation section of Unicode Annex
+      #29. See `simple_tokenize` for details.

-    Strings that are looked up in wordfreq will be run through this function
-    first, so that they can be expected to match the data.
+    The `external_wordlist` option only affects Chinese tokenization.  If it's
+    True, then wordfreq will not use its own Chinese wordlist for tokenization.
+    Instead, it will use the large wordlist packaged with the Jieba tokenizer,
+    and it will leave Traditional Chinese characters as is. This will probably
+    give more accurate tokenization, but the resulting tokens won't necessarily
+    have word frequencies that can be looked up.
    """
    if lang == 'ja':
-        global mecab_tokenize
-        if mecab_tokenize is None:
-            from wordfreq.japanese import mecab_tokenize
-        tokens = mecab_tokenize(text)
-        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
-
-    if lang == 'zh':
-        global jieba_tokenize
-        if jieba_tokenize is None:
-            from wordfreq.chinese import jieba_tokenize
-        tokens = jieba_tokenize(text)
-        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]
-
-
-    if lang == 'tr':
-        return turkish_tokenize(text)
-
-    if lang == 'ar':
+        return japanese_tokenize(text, include_punctuation)
+    elif lang == 'zh':
+        return chinese_tokenize(text, include_punctuation, external_wordlist)
+    elif lang == 'tr':
+        return turkish_tokenize(text, include_punctuation)
+    elif lang == 'ar':
        text = remove_arabic_marks(unicodedata.normalize('NFKC', text))
-
-    return simple_tokenize(text)
+        return simple_tokenize(text, include_punctuation)
+    else:
+        return simple_tokenize(text, include_punctuation)

--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -63,7 +63,7 @@ rule convert_opensubtitles

 # To convert SUBTLEX, we take the 1st and Nth columns, strip the header,
 # run it through ftfy, convert tabs to commas and spurious CSV formatting to
-# and remove lines with unfixable half-mojibake.
+# spaces, and remove lines with unfixable half-mojibake.
 rule convert_subtlex
  command = cut -f $textcol,$freqcol $in | tail -n +$startrow | ftfy | tr '	",' ',  ' | grep -v 'â,' > $out