update Japanese data; test Japanese and token combining

Former-commit-id: 611a6a35de
This commit is contained in:
Robyn Speer 2015-05-28 14:01:11 -04:00
parent 5db3c4ef9e
commit 860e929bf8
5 changed files with 68 additions and 1 deletions

26
tests/test_japanese.py Normal file
View File

@ -0,0 +1,26 @@
from nose.tools import eq_, assert_almost_equal
from wordfreq import tokenize, word_frequency, half_harmonic_mean
def test_tokens():
    # The Japanese greeting should be segmented into three tokens.
    expected_tokens = ['おはよう', 'ござい', 'ます']
    actual_tokens = tokenize('おはようございます', 'ja')
    eq_(actual_tokens, expected_tokens)
def test_combination():
    # Frequencies of the individual tokens that the phrases below are
    # built from.
    freq_ohayou = word_frequency('おはよう', 'ja')
    freq_gozai = word_frequency('ござい', 'ja')
    freq_masu = word_frequency('ます', 'ja')

    # A token repeated twice combines with itself: f * f / (f + f) == f / 2.
    doubled = word_frequency('おはようおはよう', 'ja')
    assert_almost_equal(doubled, freq_ohayou / 2)

    # A three-token phrase is the left-to-right fold of the token
    # frequencies under half_harmonic_mean.
    phrase = word_frequency('おはようございます', 'ja')
    first_two = half_harmonic_mean(freq_ohayou, freq_gozai)
    expected = half_harmonic_mean(first_two, freq_masu)
    assert_almost_equal(phrase, expected)

30
tests/test_math.py Normal file
View File

@ -0,0 +1,30 @@
from nose.tools import assert_less_equal, assert_almost_equal
from wordfreq import half_harmonic_mean
from functools import reduce
import random
def check_hm_properties(inputs):
    # The half-harmonic-mean formula is claimed to be associative,
    # commutative, monotonic, and less than or equal to its inputs.
    # (Less if its inputs are strictly positive, in fact.)
    #
    # Check each of those claims on the given list of numbers.

    # Work on a private copy so the caller's list is neither shuffled nor
    # scaled; the arguments reported for a failing generated test then
    # still match what was generated.
    values = list(inputs)

    hm1 = reduce(half_harmonic_mean, values)

    # Bounded above by every input.
    for value in values:
        assert_less_equal(hm1, value)

    # Associative and commutative: folding in a different order gives the
    # same result, up to floating-point rounding.
    random.shuffle(values)
    hm2 = reduce(half_harmonic_mean, values)
    assert_almost_equal(hm1, hm2)

    # Monotonic: increasing one input must not decrease the result.
    values[0] *= 2
    hm3 = reduce(half_harmonic_mean, values)
    assert_less_equal(hm2, hm3)
def test_half_harmonic_mean():
    # Generator-style test: yields one property check per random sample,
    # ten samples for each list length from 2 to 5.
    for size in range(2, 6):
        for _ in range(10):
            # Strictly positive arbitrary numbers.
            samples = []
            while len(samples) < size:
                samples.append(random.expovariate(0.01))
            yield check_hm_properties, samples

View File

@ -164,6 +164,17 @@ def iter_wordlist(lang, wordlist='combined'):
yield word
def half_harmonic_mean(a, b):
    """
    Return half the harmonic mean of `a` and `b`: (a * b) / (a + b).

    An associative, commutative, monotonic function that returns a value
    less than or equal to both a and b.

    Used for estimating the frequency of terms made of multiple tokens, given
    the assumption that the tokens very frequently appear together.

    If both inputs are zero the formula would be 0 / 0; in that case the
    result is defined as 0.0, which keeps the "less than or equal to both
    inputs" property instead of raising ZeroDivisionError.
    """
    if a == 0 and b == 0:
        # Avoid 0 / 0: combining two zero frequencies gives zero.
        return 0.0
    return (a * b) / (a + b)
@lru_cache(maxsize=CACHE_SIZE)
def word_frequency(word, lang, wordlist='combined', default=0.):
"""
@ -199,7 +210,7 @@ def word_frequency(word, lang, wordlist='combined', default=0.):
else:
# Combine word values using the half-harmonic-mean formula,
# (a * b) / (a + b). This operation is associative.
combined_value = (combined_value * value) / (combined_value + value)
combined_value = half_harmonic_mean(combined_value, value)
return combined_value

Binary file not shown.