mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-25 02:05:24 +00:00)
Work on making Japanese tokenization use MeCab consistently
Former-commit-id: 05cf94d1fd
parent c66b55d8dd
commit 5db3c4ef9e
@@ -18,15 +18,41 @@ DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 CACHE_SIZE = 100000


-def tokenize(text):
+def simple_tokenize(text):
     """
-    A simple tokenizer that can be applied to most languages. Strings that
-    are looked up in wordfreq will be run through this tokenizer first,
-    so that they can be expected to match the data.
+    A simple tokenizer that can be applied to most languages.
+
+    It considers a word to be made of a sequence of 'token characters', an
+    overly inclusive range that includes letters, Han characters, emoji, and a
+    bunch of miscellaneous whatnot, but excludes most punctuation and
+    whitespace.
+
+    The single complication for the sake of English is that apostrophes are not
+    considered part of the token if they appear on the edge of the character
+    sequence, but they are if they appear internally. "cats'" is not a token,
+    but "cat's" is.
     """
     return [token.lower() for token in TOKEN_RE.findall(text)]


+def tokenize(text, lang):
+    """
+    Tokenize this text in a way that's straightforward but appropriate for
+    the language.
+
+    So far, this means that Japanese is handled by mecab_tokenize, and
+    everything else is handled by simple_tokenize.
+
+    Strings that are looked up in wordfreq will be run through this function
+    first, so that they can be expected to match the data.
+    """
+    if lang == 'ja':
+        from wordfreq.mecab import mecab_tokenize
+        return mecab_tokenize(text)
+    else:
+        return simple_tokenize(text)
+
+
 def read_dBpack(filename):
     """
     Read a file from an idiosyncratic format that we use for storing
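A minimal usage sketch of the two functions above, assuming wordfreq is importable at this commit and mecab-python3 is installed for the Japanese branch; the expected results follow from the docstrings rather than from running the diff itself:

from wordfreq import simple_tokenize, tokenize

# Per the docstring, edge apostrophes are dropped, internal ones are kept,
# and every token is lowercased, so this should give
# ['the', 'cats', 'toy', 'is', 'the', "cat's", 'toy'].
print(simple_tokenize("The cats' toy is the cat's toy"))

# tokenize() dispatches on the language code: 'ja' is routed through MeCab,
# everything else falls back to simple_tokenize.
print(tokenize("El gato está aquí", 'es'))   # simple_tokenize path
print(tokenize('猫がいます', 'ja'))           # mecab_tokenize path (needs MeCab)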
@@ -163,7 +189,7 @@ def word_frequency(word, lang, wordlist='combined', default=0.):
     """
     freqs = get_frequency_dict(lang, wordlist)
     combined_value = None
-    for token in tokenize(word):
+    for token in tokenize(word, lang):
         if token not in freqs:
             # If any word is missing, just return the default value
             return default
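The practical effect of this one-line change is that lookups are split with the language-aware tokenizer before per-token frequencies are combined. A hedged sketch of the call path, with the Japanese example assuming MeCab is installed:

from wordfreq import word_frequency

# English still goes through simple_tokenize, so this behaves as before.
print(word_frequency("cat's", 'en'))

# Japanese text is now segmented by MeCab before each token is looked up,
# so an unsegmented phrase is no longer treated as one giant token that may
# simply be missing from the list and fall through to the default.
print(word_frequency('おはようございます', 'ja', default=0.))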
Binary file not shown.
wordfreq/mecab.py (new file, 21 lines)
@@ -0,0 +1,21 @@
+import MeCab
+
+
+# Instantiate the MeCab analyzer, which the mecab-python3 interface calls a
+# Tagger.
+MECAB_ANALYZER = MeCab.Tagger()
+
+
+def mecab_tokenize(text):
+    """
+    Use the mecab-python3 package to tokenize the given Japanese text.
+
+    The simplest output from mecab-python3 is the single-string form, which
+    contains the same table that the command-line version of MeCab would output.
+    We find the tokens in the first column of this table.
+    """
+    parsed_str = MECAB_ANALYZER.parse(text.strip())
+    lines = [line for line in parsed_str.split('\n')
+             if line != '' and line != 'EOS']
+    tokens = [line.split('\t')[0] for line in lines]
+    return tokens
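To make the parsing step concrete, here is a small self-contained sketch that runs the same splitting logic on a hand-written stand-in for MeCab's single-string output (one token per line, surface form in the first tab-separated column, terminated by an 'EOS' line), so it works without MeCab installed:

# Hand-written stand-in for what MeCab.Tagger().parse() returns.
sample = "猫\t名詞,一般,*,*,*,*,猫,ネコ,ネコ\nが\t助詞,格助詞,一般,*,*,*,が,ガ,ガ\nEOS\n"

# Same logic as mecab_tokenize: drop blank lines and the 'EOS' terminator,
# then keep the first tab-separated column (the surface form of each token).
lines = [line for line in sample.split('\n') if line != '' and line != 'EOS']
print([line.split('\t')[0] for line in lines])  # ['猫', 'が']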