Work on making Japanese tokenization use MeCab consistently

Former-commit-id: 05cf94d1fd
commit 5db3c4ef9e
parent c66b55d8dd
Author: Robyn Speer
Date:   2015-05-27 18:10:25 -04:00

3 changed files with 52 additions and 5 deletions


@@ -18,15 +18,41 @@ DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 CACHE_SIZE = 100000


-def tokenize(text):
+def simple_tokenize(text):
     """
-    A simple tokenizer that can be applied to most languages. Strings that
-    are looked up in wordfreq will be run through this tokenizer first,
-    so that they can be expected to match the data.
+    A simple tokenizer that can be applied to most languages.
+
+    It considers a word to be made of a sequence of 'token characters', an
+    overly inclusive range that includes letters, Han characters, emoji, and a
+    bunch of miscellaneous whatnot, but excludes most punctuation and
+    whitespace.
+
+    The single complication for the sake of English is that apostrophes are not
+    considered part of the token if they appear on the edge of the character
+    sequence, but they are if they appear internally. "cats'" is not a token,
+    but "cat's" is.
     """
     return [token.lower() for token in TOKEN_RE.findall(text)]


+def tokenize(text, lang):
+    """
+    Tokenize this text in a way that's straightforward but appropriate for
+    the language.
+
+    So far, this means that Japanese is handled by mecab_tokenize, and
+    everything else is handled by simple_tokenize.
+
+    Strings that are looked up in wordfreq will be run through this function
+    first, so that they can be expected to match the data.
+    """
+    if lang == 'ja':
+        from wordfreq.mecab import mecab_tokenize
+        return mecab_tokenize(text)
+    else:
+        return simple_tokenize(text)
+
+
 def read_dBpack(filename):
     """
     Read a file from an idiosyncratic format that we use for storing
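
A rough sketch of how the two tokenizers behave, based on the docstrings above. The import path and the exact Japanese segmentation are assumptions: MeCab's output depends on which dictionary is installed.

    # Assuming simple_tokenize and tokenize are importable from the wordfreq package.
    from wordfreq import simple_tokenize, tokenize

    # simple_tokenize lowercases and keeps internal apostrophes but not edge ones.
    simple_tokenize("The cat's pajamas")   # -> ['the', "cat's", 'pajamas']

    # tokenize() dispatches on the language code: 'ja' goes through MeCab,
    # everything else falls through to simple_tokenize.
    tokenize('This is a test', 'en')       # -> ['this', 'is', 'a', 'test']
    tokenize('これはテストです', 'ja')     # e.g. ['これ', 'は', 'テスト', 'です']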
@@ -163,7 +189,7 @@ def word_frequency(word, lang, wordlist='combined', default=0.):
     """
     freqs = get_frequency_dict(lang, wordlist)
     combined_value = None
-    for token in tokenize(word):
+    for token in tokenize(word, lang):
         if token not in freqs:
             # If any word is missing, just return the default value
             return default
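
This change threads the language code through to the tokenizer, so a Japanese lookup string is now split by MeCab before each token is checked against the wordlist. A hedged usage sketch (the phrase and the resulting value are illustrative only):

    from wordfreq import word_frequency

    # The text is tokenized with lang='ja'; if any resulting token is missing
    # from the wordlist, the default value (0.) is returned instead.
    print(word_frequency('猫が好き', 'ja'))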

Binary file not shown.

wordfreq/mecab.py (new file, 21 lines)

@@ -0,0 +1,21 @@
+import MeCab
+
+
+# Instantiate the MeCab analyzer, which the mecab-python3 interface calls a
+# Tagger.
+MECAB_ANALYZER = MeCab.Tagger()
+
+
+def mecab_tokenize(text):
+    """
+    Use the mecab-python3 package to tokenize the given Japanese text.
+
+    The simplest output from mecab-python3 is the single-string form, which
+    contains the same table that the command-line version of MeCab would output.
+    We find the tokens in the first column of this table.
+    """
+    parsed_str = MECAB_ANALYZER.parse(text.strip())
+    lines = [line for line in parsed_str.split('\n')
+             if line != '' and line != 'EOS']
+    tokens = [line.split('\t')[0] for line in lines]
+    return tokens
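
For context, a sketch of the single-string output that mecab_tokenize() parses. The feature columns after the first tab vary with the MeCab dictionary in use; this example assumes an ipadic-style setup.

    import MeCab

    tagger = MeCab.Tagger()
    print(tagger.parse('これはテストです'))
    # Typical output: one tab-separated line per token, terminated by 'EOS', e.g.
    #   これ      名詞,代名詞,一般,...
    #   は        助詞,係助詞,...
    #   テスト    名詞,サ変接続,...
    #   です      助動詞,...
    #   EOS
    # mecab_tokenize() drops blank lines and 'EOS', then keeps the surface form
    # before the first tab on each remaining line.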