wordfreq/tests/test_japanese.py

55 lines
2.4 KiB
Python
Raw Normal View History

from wordfreq import tokenize, simple_tokenize, word_frequency
2018-06-01 20:40:51 +00:00
import pytest
def test_tokens():
assert tokenize("おはようございます", "ja") == ["おはよう", "ござい", "ます"]
def test_simple_tokenize():
# When Japanese is run through simple_tokenize -- either because it's
# tagged with the wrong language, or because we want to pass through
# Japanese text without getting MeCab involved -- it will be split at
# boundaries between Japanese and non-Japanese scripts, but all Japanese
# scripts will be stuck together. Here the switch between hiragana
# (ひらがな) and katakana (カタカナ) is not a boundary, but the switch
# between katakana and romaji is.
#
# We used to try to infer word boundaries between hiragana and katakana,
# but this leads to edge cases that are unsolvable without a dictionary.
ja_text = "ひらがなカタカナromaji"
assert simple_tokenize(ja_text) == ["ひらがなカタカナ", "romaji"]
# An example that would be multiple tokens if tokenized as 'ja' via MeCab,
# but sticks together in simple_tokenize
assert simple_tokenize("おはようございます") == ["おはようございます"]
# Names that use the weird possessive marker ヶ, which is technically a
# katakana even though it's being used like a kanji, stay together as one
# token
2018-06-01 20:40:51 +00:00
assert simple_tokenize("犬ヶ島") == ["犬ヶ島"]
# The word in ConceptNet that made me notice that simple_tokenize used
# to have a problem with the character 々
2018-06-01 20:40:51 +00:00
assert simple_tokenize("晴々しい") == ["晴々しい"]
# Explicit word separators are still token boundaries, such as the dot
# between "toner" and "cartridge" in "toner cartridge"
2018-06-01 20:40:51 +00:00
assert simple_tokenize("トナー・カートリッジ") == ["トナー", "カートリッジ"]
# This word has multiple weird characters that aren't quite kanji in it,
# and is in the dictionary
2018-06-01 20:40:51 +00:00
assert simple_tokenize("見ヶ〆料") == ["見ヶ〆料"]
def test_combination():
ohayou_freq = word_frequency("おはよう", "ja")
gozai_freq = word_frequency("ござい", "ja")
masu_freq = word_frequency("ます", "ja")
assert word_frequency("おはようおはよう", "ja") == pytest.approx(ohayou_freq / 2, rel=0.01)
assert 1.0 / word_frequency("おはようございます", "ja") == pytest.approx(
1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq, rel=0.01
)