add external_wordlist option to tokenize

Rob Speer 2015-09-10 18:09:41 -04:00
parent 3cb3061e06
commit 669bd16c13
4 changed files with 349108 additions and 15 deletions

View File

@@ -25,12 +25,33 @@ def test_tokens():
     eq_(
         tokenize(fact_simplified, 'zh'),
         [
             # he / is / in history / #6 / counter for people
             '他', '是', '历史上', '第六', '位',
             # during / term of office / in / die
             '在', '任期', '内', '去世',
             # of / U.S. / deputy / president
             '的', '美国', '副', '总统'
         ]
     )
+
+    # Jieba's original tokenizer knows a lot of names, it seems.
+    eq_(
+        tokenize(hobart, 'zh', external_wordlist=True),
+        ['加勒特', '霍巴特']
+    )
+
+    # We get almost the same tokens from the sentence using Jieba's own
+    # wordlist, but it tokenizes "in history" as two words and
+    # "sixth person" as one.
+    eq_(
+        tokenize(fact_simplified, 'zh', external_wordlist=True),
+        [
+            # he / is / history / in / sixth person
+            '他', '是', '历史', '上', '第六位',
+            # during / term of office / in / die
+            '在', '任期', '内', '去世',
+            # of / U.S. / deputy / president
+            '的', '美国', '副', '总统'
+        ]
+    )

View File

@@ -4,16 +4,34 @@ import jieba
 jieba_tokenizer = None
+jieba_orig_tokenizer = None
 DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
+ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')


 def simplify_chinese(text):
     return text.translate(SIMPLIFIED_MAP).casefold()


-def jieba_tokenize(text):
-    global jieba_tokenizer
-    if jieba_tokenizer is None:
-        jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
-    return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
+def jieba_tokenize(text, external_wordlist=False):
+    """
+    If `external_wordlist` is False, this will tokenize the given text with our
+    custom Jieba dictionary, which contains only the strings that have
+    frequencies in wordfreq.
+
+    This is perhaps suboptimal as a general-purpose Chinese tokenizer, but for
+    the purpose of looking up frequencies, it's ideal.
+
+    If `external_wordlist` is True, this will use the largest version of
+    Jieba's original dictionary, so its results will be independent of the
+    data in wordfreq.
+    """
+    global jieba_tokenizer, jieba_orig_tokenizer
+    if external_wordlist:
+        if jieba_orig_tokenizer is None:
+            jieba_orig_tokenizer = jieba.Tokenizer(dictionary=ORIG_DICT_FILENAME)
+        return jieba_orig_tokenizer.lcut(text)
+    else:
+        if jieba_tokenizer is None:
+            jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
+        return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
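
A quick usage sketch, not part of the commit: the two code paths of jieba_tokenize can be compared directly. The sample string is arbitrary, and the exact tokens printed depend on the dictionaries bundled with wordfreq and Jieba.

    # Illustrative sketch of the two modes of jieba_tokenize().
    from wordfreq.chinese import jieba_tokenize

    sample = '他在任期内去世'  # arbitrary sample sentence

    # Custom wordfreq dictionary: the text is simplified first and HMM-based
    # discovery of unknown words is disabled, so tokens are drawn from
    # wordfreq's own wordlist.
    print(jieba_tokenize(sample))

    # Jieba's original, much larger dictionary: the text is passed through
    # unchanged, so Traditional Chinese characters are left as is.
    print(jieba_tokenize(sample, external_wordlist=True))

Both Tokenizer instances are created lazily on first use and cached in module-level globals, so importing wordfreq.chinese stays cheap until Chinese text is actually tokenized.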

wordfreq/data/jieba_zh_orig.txt (new file, 349046 lines added)

File diff suppressed because it is too large
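
The suppressed file is a dictionary in Jieba's plain-text format: one entry per line, giving a word followed by an optional frequency count and part-of-speech tag, separated by spaces. A small sketch of how one could peek at it once wordfreq is installed (illustrative, not part of the commit):

    # Print the first few entries of the bundled Jieba dictionary.
    from pkg_resources import resource_filename

    path = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')
    with open(path, encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= 5:
                break
            print(line.rstrip())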

View File

@@ -89,14 +89,16 @@ def remove_arabic_marks(text):
 mecab_tokenize = None
 jieba_tokenize = None

-def tokenize(text, lang):
+def tokenize(text, lang, external_wordlist=False):
     """
     Tokenize this text in a way that's relatively simple but appropriate for
     the language.

     So far, this means:

-    - Chinese is presumed to already be tokenized. (Sorry. It's hard.)
+    - Chinese will be mapped to Simplified Chinese characters and tokenized
+      using the jieba tokenizer, on a custom word list of words that can be
+      looked up in wordfreq.
     - Japanese will be delegated to the external mecab-python module.
     - Chinese or Japanese texts that aren't identified as the appropriate
       language will only split on punctuation and script boundaries, giving
@@ -111,6 +113,13 @@ def tokenize(text, lang):
     as Arabic will be normalized more strongly and have combining marks and
     tatweels removed.

+    If `external_wordlist` is True, then the Chinese wordlist in wordfreq will
+    not be used for tokenization. Instead, it will use the large wordlist
+    packaged with the Jieba tokenizer, and it will leave Traditional Chinese
+    characters as is. This will probably give more accurate tokenization, but
+    the resulting tokens won't necessarily have word frequencies that can be
+    looked up.
+
     Strings that are looked up in wordfreq will be run through this function
     first, so that they can be expected to match the data.
     """
@@ -125,10 +134,9 @@ def tokenize(text, lang):
         global jieba_tokenize
         if jieba_tokenize is None:
            from wordfreq.chinese import jieba_tokenize
-        tokens = jieba_tokenize(text)
+        tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
         return [token.casefold() for token in tokens if TOKEN_RE.match(token)]

     if lang == 'tr':
         return turkish_tokenize(text)
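
For context, a minimal sketch of how the new keyword argument is meant to be used through the public API, assuming tokenize and word_frequency are importable from the top-level wordfreq package as in released versions; the sentence is only an example, and the printed frequencies depend on the installed data:

    from wordfreq import tokenize, word_frequency

    sentence = '他在任期内去世的美国副总统'  # example text

    # Default: tokens come from wordfreq's own Jieba wordlist, so each one
    # can be looked up with word_frequency().
    for token in tokenize(sentence, 'zh'):
        print(token, word_frequency(token, 'zh'))

    # external_wordlist=True: Jieba's full dictionary, likely better
    # segmentation, but some tokens may have no frequency in wordfreq.
    for token in tokenize(sentence, 'zh', external_wordlist=True):
        print(token, word_frequency(token, 'zh'))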