Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-25 10:15:23 +00:00
Commit a0893af82e
* Remove marks from more languages
* Add Korean tokenization, and include MeCab files in data
* Add a Hebrew tokenization test
* Fix terminology in docstrings about abjad scripts
* Combine Japanese and Korean tokenization into the same function
Former-commit-id: fec6eddcc3
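Per the commit message, Japanese and Korean tokenization now share one MeCab-backed code path. A minimal usage sketch, not taken from this repo's tests; the exact token boundaries depend on the bundled MeCab dictionaries:

from wordfreq import tokenize

# Both languages route through the same tokenization function;
# CJK segmentation is delegated to MeCab.
print(tokenize('감사합니다', 'ko'))         # ['감사', '합니다'], per the test below
print(tokenize('おはようございます', 'ja'))  # boundaries depend on the ja dictionary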
23 lines · 552 B · Python
from nose.tools import eq_, assert_almost_equal

from wordfreq import tokenize, word_frequency


def test_tokens():
    eq_(tokenize('감사합니다', 'ko'),
        ['감사', '합니다'])


def test_combination():
    gamsa_freq = word_frequency('감사', 'ko')
    habnida_freq = word_frequency('합니다', 'ko')

    assert_almost_equal(
        word_frequency('감사감사', 'ko'),
        gamsa_freq / 2
    )
    assert_almost_equal(
        1.0 / word_frequency('감사합니다', 'ko'),
        1.0 / gamsa_freq + 1.0 / habnida_freq
    )
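The second test pins down the combination rule these assertions imply: the reciprocal of a multi-token phrase's frequency is the sum of its tokens' reciprocal frequencies, so repeating one word yields half that word's frequency. A minimal sketch of that rule, with combined_frequency as a hypothetical helper name rather than wordfreq API:

def combined_frequency(token_freqs):
    # Reciprocal-sum combination, matching the assertions above:
    # 1 / f_phrase = sum over tokens of 1 / f_token.
    return 1.0 / sum(1.0 / f for f in token_freqs)

# Two equal frequencies f combine to f / 2, which is what the
# '감사감사' assertion checks.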