wordfreq/tests/test_korean.py
Robyn Speer 0a2bfb2710 Tokenization in Korean, plus abjad languages (#38)
* Remove marks from more languages

* Add Korean tokenization, and include MeCab files in data

* add a Hebrew tokenization test

* fix terminology in docstrings about abjad scripts

* combine Japanese and Korean tokenization into the same function


Former-commit-id: fec6eddcc3
2016-07-15 15:10:25 -04:00

from nose.tools import eq_, assert_almost_equal
from wordfreq import tokenize, word_frequency


def test_tokens():
    # Korean tokenization (backed by MeCab) splits this phrase into
    # its two component words.
    eq_(tokenize('감사합니다', 'ko'),
        ['감사', '합니다'])


def test_combination():
    gamsa_freq = word_frequency('감사', 'ko')
    habnida_freq = word_frequency('합니다', 'ko')

    # Repeating a token halves the combined frequency.
    assert_almost_equal(
        word_frequency('감사감사', 'ko'),
        gamsa_freq / 2
    )
    # The phrase frequency combines the token frequencies by
    # summing their reciprocals.
    assert_almost_equal(
        1.0 / word_frequency('감사합니다', 'ko'),
        1.0 / gamsa_freq + 1.0 / habnida_freq
    )
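
Both assertions encode the same combination rule: the frequency of a multi-token phrase is the reciprocal of the summed reciprocal token frequencies, 1/f(phrase) = sum(1/f(token)). Below is a minimal sketch of that rule using wordfreq's public tokenize and word_frequency functions; expected_phrase_frequency is a hypothetical helper named here for illustration, not part of wordfreq's API.

from wordfreq import tokenize, word_frequency

def expected_phrase_frequency(text, lang):
    # Hypothetical helper: combine per-token frequencies by summing
    # reciprocals, so 1/f(phrase) = sum(1/f(token)). This matches both
    # assertions above: two equal tokens of frequency g give g / 2,
    # and '감사합니다' gives 1 / (1/g + 1/h).
    tokens = tokenize(text, lang)
    return 1.0 / sum(1.0 / word_frequency(token, lang) for token in tokens)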