mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
update Japanese data; test Japanese and token combining
This commit is contained in:
parent
05cf94d1fd
commit
611a6a35de
26
tests/test_japanese.py
Normal file
26
tests/test_japanese.py
Normal file
@ -0,0 +1,26 @@
|
||||
from nose.tools import eq_, assert_almost_equal
|
||||
from wordfreq import tokenize, word_frequency, half_harmonic_mean
|
||||
|
||||
|
||||
def test_tokens():
    """Check that a Japanese greeting is split into the expected tokens."""
    expected_tokens = ['おはよう', 'ござい', 'ます']
    actual_tokens = tokenize('おはようございます', 'ja')
    eq_(actual_tokens, expected_tokens)
|
||||
|
||||
|
||||
def test_combination():
    """
    Check how frequencies of multi-token Japanese phrases are combined.

    A phrase made of the same token twice is expected to have half that
    token's frequency, and a three-token phrase is expected to match the
    half-harmonic mean folded over its tokens' individual frequencies.
    """
    # Frequencies of the individual tokens, looked up in the same order as
    # they appear in the phrase below.
    ohayou_freq, gozai_freq, masu_freq = [
        word_frequency(token, 'ja')
        for token in ('おはよう', 'ござい', 'ます')
    ]

    # half_harmonic_mean(x, x) == x / 2, so doubling a token halves it.
    doubled_freq = word_frequency('おはようおはよう', 'ja')
    assert_almost_equal(doubled_freq, ohayou_freq / 2)

    # The full greeting combines all three tokens, left to right.
    phrase_freq = word_frequency('おはようございます', 'ja')
    first_two = half_harmonic_mean(ohayou_freq, gozai_freq)
    expected_freq = half_harmonic_mean(first_two, masu_freq)
    assert_almost_equal(phrase_freq, expected_freq)
|
||||
|
30
tests/test_math.py
Normal file
30
tests/test_math.py
Normal file
@ -0,0 +1,30 @@
|
||||
from nose.tools import assert_less_equal, assert_almost_equal
|
||||
from wordfreq import half_harmonic_mean
|
||||
from functools import reduce
|
||||
import random
|
||||
|
||||
|
||||
def check_hm_properties(inputs):
    """
    Check the claimed algebraic properties of half_harmonic_mean on `inputs`.

    The half-harmonic-mean formula is asserted to be associative,
    commutative, monotonic, and less than or equal to its inputs (strictly
    less when the inputs are strictly positive). This helper tests those
    claims on one list of numbers.

    `inputs` is a non-empty list of numbers; the caller in this file passes
    strictly positive values. The list is not modified.
    """
    # Work on a copy so the caller's list is neither reordered nor rescaled.
    values = list(inputs)

    # Upper bound: folding the values together can never exceed the
    # smallest of them.
    hm1 = reduce(half_harmonic_mean, values)
    assert_less_equal(hm1, min(values))

    # Associativity + commutativity: folding the same values in a different
    # order must give (almost) the same result.
    random.shuffle(values)
    hm2 = reduce(half_harmonic_mean, values)
    assert_almost_equal(hm1, hm2)

    # Monotonicity: increasing one input must not decrease the result.
    values[0] *= 2
    hm3 = reduce(half_harmonic_mean, values)
    assert_less_equal(hm2, hm3)
|
||||
|
||||
|
||||
def test_half_harmonic_mean():
    """
    Generate property checks for half_harmonic_mean on random inputs.

    Yields (check_hm_properties, inputs) pairs: ten random lists for each
    list length from 2 through 5.
    """
    for size in (2, 3, 4, 5):
        for _ in range(10):
            # Exponentially distributed samples are strictly positive.
            samples = []
            while len(samples) < size:
                samples.append(random.expovariate(0.01))
            yield check_hm_properties, samples
|
||||
|
@ -164,6 +164,17 @@ def iter_wordlist(lang, wordlist='combined'):
|
||||
yield word
|
||||
|
||||
|
||||
def half_harmonic_mean(a, b):
    """
    An associative, commutative, monotonic function that returns a value
    less than or equal to both a and b (for non-negative inputs).

    Computed as (a * b) / (a + b), which is half of the harmonic mean of
    a and b.

    Used for estimating the frequency of terms made of multiple tokens, given
    the assumption that the tokens very frequently appear together.

    When both inputs are zero the formula would divide by zero; the result
    is defined as 0.0 in that case, which is the limit of the formula as
    non-negative inputs approach zero and keeps the "no larger than either
    input" property.
    """
    if a == 0 and b == 0:
        # 0 * 0 / (0 + 0) is undefined; the natural value here is zero.
        return 0.0
    return (a * b) / (a + b)
|
||||
|
||||
|
||||
@lru_cache(maxsize=CACHE_SIZE)
|
||||
def word_frequency(word, lang, wordlist='combined', default=0.):
|
||||
"""
|
||||
@ -199,7 +210,7 @@ def word_frequency(word, lang, wordlist='combined', default=0.):
|
||||
else:
|
||||
# Combine word values using the half-harmonic-mean formula,
|
||||
# (a * b) / (a + b). This operation is associative.
|
||||
combined_value = (combined_value * value) / (combined_value + value)
|
||||
combined_value = half_harmonic_mean(combined_value, value)
|
||||
return combined_value
|
||||
|
||||
|
||||
|
Binary file not shown.
Loading…
Reference in New Issue
Block a user