mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
update Japanese data; test Japanese and token combining
This commit is contained in:
parent
05cf94d1fd
commit
611a6a35de
26
tests/test_japanese.py
Normal file
26
tests/test_japanese.py
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
from nose.tools import eq_, assert_almost_equal
|
||||||
|
from wordfreq import tokenize, word_frequency, half_harmonic_mean
|
||||||
|
|
||||||
|
|
||||||
|
def test_tokens():
    # The Japanese tokenizer should split this polite greeting into
    # its three component tokens.
    tokens = tokenize('おはようございます', 'ja')
    eq_(tokens, ['おはよう', 'ござい', 'ます'])
|
||||||
|
|
||||||
|
|
||||||
|
def test_combination():
    # Frequencies of the individual tokens that make up the phrases below.
    ohayou_freq = word_frequency('おはよう', 'ja')
    gozai_freq = word_frequency('ござい', 'ja')
    masu_freq = word_frequency('ます', 'ja')

    # A word repeated twice combines with itself:
    # (f * f) / (f + f) == f / 2.
    assert_almost_equal(
        word_frequency('おはようおはよう', 'ja'),
        ohayou_freq / 2
    )

    # A three-token phrase folds its token frequencies together with the
    # half-harmonic-mean formula; the fold order doesn't matter because
    # the operation is associative.
    expected = half_harmonic_mean(
        half_harmonic_mean(ohayou_freq, gozai_freq),
        masu_freq
    )
    assert_almost_equal(
        word_frequency('おはようございます', 'ja'),
        expected
    )
|
||||||
|
|
30
tests/test_math.py
Normal file
30
tests/test_math.py
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
from nose.tools import assert_less_equal, assert_almost_equal
|
||||||
|
from wordfreq import half_harmonic_mean
|
||||||
|
from functools import reduce
|
||||||
|
import random
|
||||||
|
|
||||||
|
|
||||||
|
def check_hm_properties(inputs):
    """
    Check the claimed algebraic properties of the half-harmonic-mean
    formula on one list of inputs: folding it over the list is
    order-independent (commutative and associative), and the result is
    monotonic in each input.
    """
    folded = reduce(half_harmonic_mean, inputs)

    # Reordering the inputs must not change the folded result.
    random.shuffle(inputs)
    reshuffled = reduce(half_harmonic_mean, inputs)
    assert_almost_equal(folded, reshuffled)

    # Increasing one input must not decrease the result (monotonicity).
    inputs[0] *= 2
    increased = reduce(half_harmonic_mean, inputs)
    assert_less_equal(reshuffled, increased)
|
||||||
|
|
||||||
|
|
||||||
|
def test_half_harmonic_mean():
    # Nose-style generator test: yield one property check per random
    # input list, for list lengths 2 through 5, ten trials each.
    for size in range(2, 6):
        for _trial in range(10):
            # Strictly positive arbitrary numbers (exponential draws).
            values = [random.expovariate(0.01) for _ in range(size)]
            yield check_hm_properties, values
|
||||||
|
|
@ -164,6 +164,17 @@ def iter_wordlist(lang, wordlist='combined'):
|
|||||||
yield word
|
yield word
|
||||||
|
|
||||||
|
|
||||||
|
def half_harmonic_mean(a, b):
    """
    An associative, commutative, monotonic function that returns a value
    less than or equal to both a and b.

    Used for estimating the frequency of terms made of multiple tokens, given
    the assumption that the tokens very frequently appear together.

    This is half of the harmonic mean: (a * b) / (a + b).
    """
    # Guard the degenerate case: when both inputs are 0 (e.g. both tokens
    # have no recorded frequency, so word_frequency's default of 0 is
    # used), the formula would divide by zero. The limit of the formula
    # as both inputs approach 0 -- and the only sensible combined
    # frequency -- is 0.
    if a + b == 0:
        return 0.0
    return (a * b) / (a + b)
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=CACHE_SIZE)
|
@lru_cache(maxsize=CACHE_SIZE)
|
||||||
def word_frequency(word, lang, wordlist='combined', default=0.):
|
def word_frequency(word, lang, wordlist='combined', default=0.):
|
||||||
"""
|
"""
|
||||||
@ -199,7 +210,7 @@ def word_frequency(word, lang, wordlist='combined', default=0.):
|
|||||||
else:
|
else:
|
||||||
# Combine word values using the half-harmonic-mean formula,
|
# Combine word values using the half-harmonic-mean formula,
|
||||||
# (a * b) / (a + b). This operation is associative.
|
# (a * b) / (a + b). This operation is associative.
|
||||||
combined_value = (combined_value * value) / (combined_value + value)
|
combined_value = half_harmonic_mean(combined_value, value)
|
||||||
return combined_value
|
return combined_value
|
||||||
|
|
||||||
|
|
||||||
|
Binary file not shown.
Loading…
Reference in New Issue
Block a user