from __future__ import annotations

import unicodedata

import MeCab

def make_mecab_analyzer(lang: str) -> MeCab.Tagger:
    """
    Get a MeCab analyzer object, given the language code of the language to
    analyze.
    """
    if lang == "ko":
        import mecab_ko_dic

        return MeCab.Tagger(mecab_ko_dic.MECAB_ARGS)
    elif lang == "ja":
        import ipadic

        return MeCab.Tagger(ipadic.MECAB_ARGS)
    else:
        raise ValueError(f"Can't run MeCab on language {lang}")
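

# Usage sketch (illustrative, not executed by this module; it assumes the
# optional `ipadic` dictionary package is installed):
#
#     tagger = make_mecab_analyzer("ja")
#     output = tagger.parse("猫がいる")
#
# `parse` returns the same tab-separated table that the command-line `mecab`
# tool would print.
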
# The constructed analyzers will go in this dictionary.
MECAB_ANALYZERS: dict[str, MeCab.Tagger] = {}

def mecab_tokenize(text: str, lang: str) -> list[str]:
    """
    Use the mecab-python3 package to tokenize the given text. The `lang`
    must be 'ja' for Japanese or 'ko' for Korean.

    The simplest output from mecab-python3 is the single-string form, which
    contains the same table that the command-line version of MeCab would output.
    We find the tokens in the first column of this table.
    """
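    # For reference, the single-string output has one line per token: the
    # surface form, then a tab, then dictionary-specific feature columns. The
    # table ends with an "EOS" line. With ipadic, for example (the exact
    # feature columns vary by dictionary):
    #
    #     猫	名詞,一般,*,*,*,*,猫,ネコ,ネコ
    #     EOS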
    if lang not in MECAB_ANALYZERS:
        MECAB_ANALYZERS[lang] = make_mecab_analyzer(lang)

    analyzer = MECAB_ANALYZERS[lang]
    text = unicodedata.normalize("NFKC", text.strip())
    analyzed = analyzer.parse(text)
    if not analyzed:
        return []
    return [line.split("\t")[0] for line in analyzed.split("\n") if line != "" and line != "EOS"]
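

# Example (illustrative; the exact segmentation depends on the dictionary
# that mecab-python3 loads):
#
#     >>> mecab_tokenize("すもももももももものうち", "ja")
#     ['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち']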