add external_wordlist option to tokenize

Former-commit-id: 669bd16c13
This commit is contained in:
Robyn Speer 2015-09-10 18:09:41 -04:00
parent f2be213933
commit 1adbb1aaf1
4 changed files with 349108 additions and 15 deletions


@ -25,12 +25,33 @@ def test_tokens():
    eq_(
        tokenize(fact_simplified, 'zh'),
        [
            # he / is / in history / #6 / counter for people
            '他', '是', '历史上', '第六', '位',
            # during / term of office / in / die
            '在', '任期', '内', '去世',
            # of / U.S. / deputy / president
            '的', '美国', '副', '总统'
        ]
    )

    # Jieba's original tokenizer knows a lot of names, it seems.
    eq_(
        tokenize(hobart, 'zh', external_wordlist=True),
        ['加勒特', '霍巴特']
    )

    # We get almost the same tokens from the sentence using Jieba's own
    # wordlist, but it tokenizes "in history" as two words and
    # "sixth person" as one.
    eq_(
        tokenize(fact_simplified, 'zh', external_wordlist=True),
        [
            # he / is / history / in / sixth person
            '他', '是', '历史', '上', '第六位',
            # during / term of office / in / die
            '在', '任期', '内', '去世',
            # of / U.S. / deputy / president
            '的', '美国', '副', '总统'
        ]
    )
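
For a quick feel for what the new option changes, here is a hedged usage sketch (not part of this commit; it assumes `tokenize` is importable from the top-level `wordfreq` package, as the tests use it, and takes Garret Hobart's name as sample input):

    from wordfreq import tokenize

    name = '加勒特·霍巴特'  # Garret Hobart, in Chinese transcription

    # Default: wordfreq's custom Jieba dictionary, limited to words that have
    # known frequencies, so an unfamiliar name may be over-segmented.
    print(tokenize(name, 'zh'))

    # external_wordlist=True: Jieba's own, much larger dictionary, which is
    # more likely to keep known names together.
    print(tokenize(name, 'zh', external_wordlist=True))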


@ -4,16 +4,34 @@ import jieba
jieba_tokenizer = None
jieba_orig_tokenizer = None
DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')


def simplify_chinese(text):
    return text.translate(SIMPLIFIED_MAP).casefold()

def jieba_tokenize(text):
    global jieba_tokenizer
    if jieba_tokenizer is None:
        jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
    return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)

def jieba_tokenize(text, external_wordlist=False):
    """
    If `external_wordlist` is False, this will tokenize the given text with our
    custom Jieba dictionary, which contains only the strings that have
    frequencies in wordfreq.

    This is perhaps suboptimal as a general-purpose Chinese tokenizer, but for
    the purpose of looking up frequencies, it's ideal.

    If `external_wordlist` is True, this will use the largest version of
    Jieba's original dictionary, so its results will be independent of the
    data in wordfreq.
    """
    global jieba_tokenizer, jieba_orig_tokenizer
    if external_wordlist:
        if jieba_orig_tokenizer is None:
            jieba_orig_tokenizer = jieba.Tokenizer(dictionary=ORIG_DICT_FILENAME)
        return jieba_orig_tokenizer.lcut(text)
    else:
        if jieba_tokenizer is None:
            jieba_tokenizer = jieba.Tokenizer(dictionary=DICT_FILENAME)
        return jieba_tokenizer.lcut(simplify_chinese(text), HMM=False)
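
To illustrate the two code paths above, a minimal sketch of calling this function directly (the exact segmentation depends on the packaged dictionaries, so no specific tokens are claimed):

    from wordfreq.chinese import jieba_tokenize

    text = '歷史上第六位'  # Traditional Chinese input

    # Default path: the text is converted to Simplified Chinese first and cut
    # with wordfreq's custom dictionary, with Jieba's HMM for unknown words
    # turned off.
    print(jieba_tokenize(text))

    # External path: Jieba's bundled dictionary is used, and the Traditional
    # characters are left as they are.
    print(jieba_tokenize(text, external_wordlist=True))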

wordfreq/data/jieba_zh_orig.txt (new file, 349046 lines)

File diff suppressed because it is too large


@ -89,14 +89,16 @@ def remove_arabic_marks(text):
mecab_tokenize = None
jieba_tokenize = None
def tokenize(text, lang):
def tokenize(text, lang, external_wordlist=False):
"""
Tokenize this text in a way that's relatively simple but appropriate for
the language.
So far, this means:
- Chinese is presumed to already be tokenized. (Sorry. It's hard.)
- Chinese will be mapped to Simplified Chinese characters and tokenized
using the jieba tokenizer, on a custom word list of words that can be
looked up in wordfreq.
- Japanese will be delegated to the external mecab-python module.
- Chinese or Japanese texts that aren't identified as the appropriate
language will only split on punctuation and script boundaries, giving
@ -111,6 +113,13 @@ def tokenize(text, lang):
    as Arabic will be normalized more strongly and have combining marks and
    tatweels removed.

    If `external_wordlist` is True, then the Chinese wordlist in wordfreq will
    not be used for tokenization. Instead, it will use the large wordlist
    packaged with the Jieba tokenizer, and it will leave Traditional Chinese
    characters as is. This will probably give more accurate tokenization, but
    the resulting tokens won't necessarily have word frequencies that can be
    looked up.

    Strings that are looked up in wordfreq will be run through this function
    first, so that they can be expected to match the data.
    """
@ -125,10 +134,9 @@ def tokenize(text, lang):
        global jieba_tokenize
        if jieba_tokenize is None:
            from wordfreq.chinese import jieba_tokenize
        tokens = jieba_tokenize(text)
        tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
        return [token.casefold() for token in tokens if TOKEN_RE.match(token)]

    if lang == 'tr':
        return turkish_tokenize(text)
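
The `jieba_tokenize = None` sentinel and in-function import above follow the same lazy-loading approach suggested by the `mecab_tokenize = None` sentinel: the Jieba-backed module (and its large dictionary) is only loaded the first time Chinese text is tokenized, then cached in a module-level global. A standalone sketch of that pattern, with hypothetical names rather than wordfreq's own:

    _zh_tokenize = None

    def tokenize_zh(text, external_wordlist=False):
        # Import the heavy tokenizer on first use only, then reuse the
        # module-level cached reference on later calls.
        global _zh_tokenize
        if _zh_tokenize is None:
            from wordfreq.chinese import jieba_tokenize as _zh_tokenize
        return _zh_tokenize(text, external_wordlist=external_wordlist)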