Work on making Japanese tokenization use MeCab consistently

Former-commit-id: 05cf94d1fd
commit 5db3c4ef9e
parent c66b55d8dd
Author: Robyn Speer
Date:   2015-05-27 18:10:25 -04:00

3 changed files with 52 additions and 5 deletions


@@ -18,15 +18,41 @@ DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 CACHE_SIZE = 100000


-def tokenize(text):
+def simple_tokenize(text):
     """
-    A simple tokenizer that can be applied to most languages. Strings that
-    are looked up in wordfreq will be run through this tokenizer first,
-    so that they can be expected to match the data.
+    A simple tokenizer that can be applied to most languages.
+
+    It considers a word to be made of a sequence of 'token characters', an
+    overly inclusive range that includes letters, Han characters, emoji, and a
+    bunch of miscellaneous whatnot, but excludes most punctuation and
+    whitespace.
+
+    The single complication for the sake of English is that apostrophes are not
+    considered part of the token if they appear on the edge of the character
+    sequence, but they are if they appear internally. "cats'" is not a token,
+    but "cat's" is.
     """
     return [token.lower() for token in TOKEN_RE.findall(text)]


+def tokenize(text, lang):
+    """
+    Tokenize this text in a way that's straightforward but appropriate for
+    the language.
+
+    So far, this means that Japanese is handled by mecab_tokenize, and
+    everything else is handled by simple_tokenize.
+
+    Strings that are looked up in wordfreq will be run through this function
+    first, so that they can be expected to match the data.
+    """
+    if lang == 'ja':
+        from wordfreq.mecab import mecab_tokenize
+        return mecab_tokenize(text)
+    else:
+        return simple_tokenize(text)
+
+
 def read_dBpack(filename):
     """
     Read a file from an idiosyncratic format that we use for storing
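
A rough sketch of how the two tokenizers behave, based on the docstrings above. The import path and the exact Japanese segmentation are assumptions: MeCab's output depends on which dictionary is installed.

    # Assuming simple_tokenize and tokenize are importable from the wordfreq package.
    from wordfreq import simple_tokenize, tokenize

    # simple_tokenize lowercases and keeps internal apostrophes but not edge ones.
    simple_tokenize("The cat's pajamas")   # -> ['the', "cat's", 'pajamas']

    # tokenize() dispatches on the language code: 'ja' goes through MeCab,
    # everything else falls through to simple_tokenize.
    tokenize('This is a test', 'en')       # -> ['this', 'is', 'a', 'test']
    tokenize('これはテストです', 'ja')     # e.g. ['これ', 'は', 'テスト', 'です']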
@@ -163,7 +189,7 @@ def word_frequency(word, lang, wordlist='combined', default=0.):
     """
     freqs = get_frequency_dict(lang, wordlist)
     combined_value = None
-    for token in tokenize(word):
+    for token in tokenize(word, lang):
         if token not in freqs:
             # If any word is missing, just return the default value
             return default
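
This change threads the language code through to the tokenizer, so a Japanese lookup string is now split by MeCab before each token is checked against the wordlist. A hedged usage sketch (the phrase and the resulting value are illustrative only):

    from wordfreq import word_frequency

    # The text is tokenized with lang='ja'; if any resulting token is missing
    # from the wordlist, the default value (0.) is returned instead.
    print(word_frequency('猫が好き', 'ja'))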

Binary file not shown.

wordfreq/mecab.py (new file, 21 lines)

@@ -0,0 +1,21 @@
+import MeCab
+
+
+# Instantiate the MeCab analyzer, which the mecab-python3 interface calls a
+# Tagger.
+MECAB_ANALYZER = MeCab.Tagger()
+
+
+def mecab_tokenize(text):
+    """
+    Use the mecab-python3 package to tokenize the given Japanese text.
+
+    The simplest output from mecab-python3 is the single-string form, which
+    contains the same table that the command-line version of MeCab would output.
+    We find the tokens in the first column of this table.
+    """
+    parsed_str = MECAB_ANALYZER.parse(text.strip())
+    lines = [line for line in parsed_str.split('\n')
+             if line != '' and line != 'EOS']
+    tokens = [line.split('\t')[0] for line in lines]
+    return tokens
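
For context, a sketch of the single-string output that mecab_tokenize() parses. The feature columns after the first tab vary with the MeCab dictionary in use; this example assumes an ipadic-style setup.

    import MeCab

    tagger = MeCab.Tagger()
    print(tagger.parse('これはテストです'))
    # Typical output: one tab-separated line per token, terminated by 'EOS', e.g.
    #   これ      名詞,代名詞,一般,...
    #   は        助詞,係助詞,...
    #   テスト    名詞,サ変接続,...
    #   です      助動詞,...
    #   EOS
    # mecab_tokenize() drops blank lines and 'EOS', then keeps the surface form
    # before the first tab on each remaining line.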