update Japanese data; test Japanese and token combining

This commit is contained in:
Rob Speer 2015-05-28 14:01:11 -04:00
parent 05cf94d1fd
commit 611a6a35de
5 changed files with 68 additions and 1 deletion

26
tests/test_japanese.py Normal file
View File

@ -0,0 +1,26 @@
from nose.tools import eq_, assert_almost_equal
from wordfreq import tokenize, word_frequency, half_harmonic_mean
def test_tokens():
    """The Japanese tokenizer should split this greeting into three tokens."""
    expected = ['おはよう', 'ござい', 'ます']
    eq_(tokenize('おはようございます', 'ja'), expected)
def test_combination():
    """
    Frequencies of multi-token phrases should come from combining the
    frequencies of the individual tokens with the half-harmonic mean.
    """
    ohayou_freq = word_frequency('おはよう', 'ja')
    gozai_freq = word_frequency('ござい', 'ja')
    masu_freq = word_frequency('ます', 'ja')

    # Repeating a token halves its frequency under the half-harmonic mean:
    # (f * f) / (f + f) == f / 2.
    assert_almost_equal(
        word_frequency('おはようおはよう', 'ja'),
        ohayou_freq / 2
    )

    # Three tokens combine by folding the half-harmonic mean left to right.
    combined = half_harmonic_mean(ohayou_freq, gozai_freq)
    combined = half_harmonic_mean(combined, masu_freq)
    assert_almost_equal(
        word_frequency('おはようございます', 'ja'),
        combined
    )

30
tests/test_math.py Normal file
View File

@ -0,0 +1,30 @@
from nose.tools import assert_less_equal, assert_almost_equal
from wordfreq import half_harmonic_mean
from functools import reduce
import random
def check_hm_properties(inputs):
    """
    Check claimed properties of the half-harmonic-mean formula on a list of
    numbers: it should be associative and commutative (so the fold is
    order-independent) and monotonic in each argument.
    """
    # Folding in the given order and in a shuffled order must agree,
    # because the operation is associative and commutative.
    before_shuffle = reduce(half_harmonic_mean, inputs)
    random.shuffle(inputs)
    after_shuffle = reduce(half_harmonic_mean, inputs)
    assert_almost_equal(before_shuffle, after_shuffle)

    # Doubling one input must not decrease the result (monotonicity).
    inputs[0] *= 2
    after_increase = reduce(half_harmonic_mean, inputs)
    assert_less_equal(after_shuffle, after_increase)
def test_half_harmonic_mean():
    """
    Yield property checks (nose generator test) over several list sizes,
    each repeated with fresh strictly-positive random inputs.
    """
    for count in range(2, 6):
        for _rep in range(10):
            inputs = [random.expovariate(0.01) for _ in range(count)]
            yield check_hm_properties, inputs

View File

@ -164,6 +164,17 @@ def iter_wordlist(lang, wordlist='combined'):
yield word yield word
def half_harmonic_mean(a, b):
    """
    Return half the harmonic mean of a and b: (a * b) / (a + b).

    This is an associative, commutative function, monotonic in each
    argument, that returns a value less than or equal to both a and b.
    (Strictly less when both inputs are strictly positive.)

    Used for estimating the frequency of terms made of multiple tokens,
    given the assumption that the tokens very frequently appear together.
    """
    # If either frequency is 0, the combined frequency is 0. This also
    # guards the degenerate case a == b == 0, which would otherwise
    # raise ZeroDivisionError.
    if a == 0 or b == 0:
        return 0.
    return (a * b) / (a + b)
@lru_cache(maxsize=CACHE_SIZE) @lru_cache(maxsize=CACHE_SIZE)
def word_frequency(word, lang, wordlist='combined', default=0.): def word_frequency(word, lang, wordlist='combined', default=0.):
""" """
@ -199,7 +210,7 @@ def word_frequency(word, lang, wordlist='combined', default=0.):
else: else:
# Combine word values using the half-harmonic-mean formula, # Combine word values using the half-harmonic-mean formula,
# (a * b) / (a + b). This operation is associative. # (a * b) / (a + b). This operation is associative.
combined_value = (combined_value * value) / (combined_value + value) combined_value = half_harmonic_mean(combined_value, value)
return combined_value return combined_value

Binary file not shown.