Mirror of https://github.com/rspeer/wordfreq.git

Work on making Japanese tokenization use MeCab consistently

Former-commit-id: 05cf94d1fd
commit 5db3c4ef9e (parent c66b55d8dd)
@@ -18,15 +18,41 @@ DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 CACHE_SIZE = 100000
 
 
-def tokenize(text):
+def simple_tokenize(text):
     """
-    A simple tokenizer that can be applied to most languages. Strings that
-    are looked up in wordfreq will be run through this tokenizer first,
-    so that they can be expected to match the data.
+    A simple tokenizer that can be applied to most languages.
+
+    It considers a word to be made of a sequence of 'token characters', an
+    overly inclusive range that includes letters, Han characters, emoji, and a
+    bunch of miscellaneous whatnot, but excludes most punctuation and
+    whitespace.
+
+    The single complication for the sake of English is that apostrophes are not
+    considered part of the token if they appear on the edge of the character
+    sequence, but they are if they appear internally. "cats'" is not a token,
+    but "cat's" is.
     """
     return [token.lower() for token in TOKEN_RE.findall(text)]
 
 
+def tokenize(text, lang):
+    """
+    Tokenize this text in a way that's straightforward but appropriate for
+    the language.
+
+    So far, this means that Japanese is handled by mecab_tokenize, and
+    everything else is handled by simple_tokenize.
+
+    Strings that are looked up in wordfreq will be run through this function
+    first, so that they can be expected to match the data.
+    """
+    if lang == 'ja':
+        from wordfreq.mecab import mecab_tokenize
+        return mecab_tokenize(text)
+    else:
+        return simple_tokenize(text)
+
+
 def read_dBpack(filename):
     """
     Read a file from an idiosyncratic format that we use for storing
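Not part of the commit: a small usage sketch of the two functions above, assuming they are exported from the top-level wordfreq package as in released versions. The expected outputs in the comments are approximate and follow the docstrings (lowercasing, punctuation dropped, apostrophes kept only when internal, Japanese dispatched to MeCab).

from wordfreq import simple_tokenize, tokenize

# simple_tokenize lowercases and drops punctuation; an apostrophe survives
# only when it is internal to a token, as described in the docstring.
simple_tokenize("The cats' toys are the cat's problem.")
# -> ['the', 'cats', 'toys', 'are', 'the', "cat's", 'problem']

# tokenize() dispatches on the language code: 'ja' goes through MeCab,
# every other language falls back to simple_tokenize.
tokenize('prefecture of Tokyo', 'en')   # simple_tokenize path
tokenize('東京都に住んでいます', 'ja')   # mecab_tokenize path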
@@ -163,7 +189,7 @@ def word_frequency(word, lang, wordlist='combined', default=0.):
     """
     freqs = get_frequency_dict(lang, wordlist)
     combined_value = None
-    for token in tokenize(word):
+    for token in tokenize(word, lang):
         if token not in freqs:
             # If any word is missing, just return the default value
             return default
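For context, a hedged sketch of how the changed call site behaves from the caller's side. Return values depend entirely on the bundled wordlists, so none are shown; the point is only that the language code now controls how the input is split before lookup.

from wordfreq import word_frequency

# English input is split by simple_tokenize before lookup, so a spaced
# phrase combines the frequencies of its individual tokens.
word_frequency('New York', 'en')

# Japanese input is now split by MeCab first, so an unspaced phrase no
# longer has to match the wordlist as one long string.
word_frequency('東京都に住んでいます', 'ja')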
Binary file not shown.
wordfreq/mecab.py (new file, 21 lines)
@@ -0,0 +1,21 @@
+import MeCab
+
+
+# Instantiate the MeCab analyzer, which the mecab-python3 interface calls a
+# Tagger.
+MECAB_ANALYZER = MeCab.Tagger()
+
+
+def mecab_tokenize(text):
+    """
+    Use the mecab-python3 package to tokenize the given Japanese text.
+
+    The simplest output from mecab-python3 is the single-string form, which
+    contains the same table that the command-line version of MeCab would output.
+    We find the tokens in the first column of this table.
+    """
+    parsed_str = MECAB_ANALYZER.parse(text.strip())
+    lines = [line for line in parsed_str.split('\n')
+             if line != '' and line != 'EOS']
+    tokens = [line.split('\t')[0] for line in lines]
+    return tokens
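To make the table parsing concrete, here is an illustration (not from the commit) of MeCab's single-string output and the resulting token list. The feature columns and the exact segmentation depend on the dictionary MeCab was installed with (for example IPADIC), so treat the values below as typical rather than guaranteed.

from wordfreq.mecab import mecab_tokenize

# MECAB_ANALYZER.parse() returns one line per token, with the surface form
# before the first tab and a comma-separated feature string after it,
# terminated by an 'EOS' line. For 'すもももももももものうち' the table
# looks roughly like:
#
#   すもも\t名詞,一般,*,*,*,*,すもも,スモモ,スモモ
#   も\t助詞,係助詞,*,*,*,*,も,モ,モ
#   ...
#   EOS
#
# mecab_tokenize() keeps only the first column of each non-EOS line:
mecab_tokenize('すもももももももものうち')
# -> ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']  (with IPADIC)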