update Japanese data; test Japanese and token combining

Former-commit-id: 611a6a35de
2024-12-23 17:31:41 +00:00 · 2015-05-28 14:01:11 -04:00 · 2015-05-28 14:01:11 -04:00 · 860e929bf8
commit 860e929bf8
parent 5db3c4ef9e
5 changed files with 68 additions and 1 deletions
--- a/README.txt
+++ b/README.txt
--- a/tests/test_japanese.py
+++ b/tests/test_japanese.py
@@ -0,0 +1,26 @@
+from nose.tools import eq_, assert_almost_equal
+from wordfreq import tokenize, word_frequency, half_harmonic_mean
+
+
+def test_tokens():
+    eq_(tokenize('おはようございます', 'ja'),
+        ['おはよう', 'ござい', 'ます'])
+
+
+def test_combination():
+    ohayou_freq = word_frequency('おはよう', 'ja')
+    gozai_freq = word_frequency('ござい', 'ja')
+    masu_freq = word_frequency('ます', 'ja')
+
+    assert_almost_equal(
+        word_frequency('おはようおはよう', 'ja'),
+        ohayou_freq / 2
+    )
+    assert_almost_equal(
+        word_frequency('おはようございます', 'ja'),
+        half_harmonic_mean(
+            half_harmonic_mean(ohayou_freq, gozai_freq),
+            masu_freq
+        )
+    )
+
--- a/tests/test_math.py
+++ b/tests/test_math.py
@@ -0,0 +1,30 @@
+from nose.tools import assert_less_equal, assert_almost_equal
+from wordfreq import half_harmonic_mean
+from functools import reduce
+import random
+
+
+def check_hm_properties(inputs):
+    # I asserted that the half-harmonic-mean formula is associative,
+    # commutative, monotonic, and less than or equal to its inputs.
+    # (Less if its inputs are strictly positive, in fact.)
+    #
+    # So let's test that what I said is true.
+    hm1 = reduce(half_harmonic_mean, inputs)
+    random.shuffle(inputs)
+    hm2 = reduce(half_harmonic_mean, inputs)
+    assert_almost_equal(hm1, hm2)
+
+    inputs[0] *= 2
+    hm3 = reduce(half_harmonic_mean, inputs)
+    assert_less_equal(hm2, hm3)
+
+
+def test_half_harmonic_mean():
+    for count in range(2, 6):
+        for rep in range(10):
+            # get some strictly positive arbitrary numbers
+            inputs = [random.expovariate(0.01)
+                      for i in range(count)]
+            yield check_hm_properties, inputs
+
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -164,6 +164,17 @@ def iter_wordlist(lang, wordlist='combined'):
            yield word


+def half_harmonic_mean(a, b):
+    """
+    An associative, commutative, monotonic function that returns a value
+    less than or equal to both a and b.
+
+    Used for estimating the frequency of terms made of multiple tokens, given
+    the assumption that the tokens very frequently appear together.
+    """
+    return (a * b) / (a + b)
+
+
@lru_cache(maxsize=CACHE_SIZE)
 def word_frequency(word, lang, wordlist='combined', default=0.):
    """
@@ -199,7 +210,7 @@ def word_frequency(word, lang, wordlist='combined', default=0.):
        else:
            # Combine word values using the half-harmonic-mean formula,
            # (a * b) / (a + b). This operation is associative.
-            combined_value = (combined_value * value) / (combined_value + value)
+            combined_value = half_harmonic_mean(combined_value, value)
    return combined_value


--- a/wordfreq/data/combined_ja.msgpack.gz
+++ b/wordfreq/data/combined_ja.msgpack.gz