Express the combining of word frequencies in an explicitly associative and commutative way.

Former-commit-id: 32b4033d63
2024-12-23 17:31:41 +00:00 · 2015-07-09 15:26:54 -04:00 · 2015-07-09 15:26:54 -04:00 · 5c72e68b7e
commit 5c72e68b7e
parent ce364297a2
4 changed files with 17 additions and 67 deletions
--- a/tests/test.py
+++ b/tests/test.py
@@ -1,7 +1,6 @@
 from wordfreq import (
    word_frequency, available_languages, cB_to_freq,
-    top_n_list, random_words, random_ascii_words, tokenize,
-    half_harmonic_mean
+    top_n_list, random_words, random_ascii_words, tokenize
 )
 from nose.tools import (
    eq_, assert_almost_equal, assert_greater, raises
@@ -114,11 +113,8 @@ def test_phrase_freq():
    plant = word_frequency("plan.t", 'en')
    assert_greater(plant, 0)
    assert_almost_equal(
-        plant,
-        half_harmonic_mean(
-            word_frequency('plan', 'en'),
-            word_frequency('t', 'en')
-            )
+        1.0 / plant,
+        1.0 / word_frequency('plan', 'en') + 1.0 / word_frequency('t', 'en')
    )


--- a/tests/test_japanese.py
+++ b/tests/test_japanese.py
@@ -1,5 +1,5 @@
 from nose.tools import eq_, assert_almost_equal
-from wordfreq import tokenize, word_frequency, half_harmonic_mean
+from wordfreq import tokenize, word_frequency


 def test_tokens():
@@ -17,10 +17,7 @@ def test_combination():
        ohayou_freq / 2
    )
    assert_almost_equal(
-        word_frequency('おはようございます', 'ja'),
-        half_harmonic_mean(
-            half_harmonic_mean(ohayou_freq, gozai_freq),
-            masu_freq
-        )
+        1.0 / word_frequency('おはようございます', 'ja'),
+        1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
    )

--- a/tests/test_math.py
+++ b/tests/test_math.py
@@ -1,30 +0,0 @@
-from nose.tools import assert_less_equal, assert_almost_equal
-from wordfreq import half_harmonic_mean
-from functools import reduce
-import random
-
-
-def check_hm_properties(inputs):
-    # I asserted that the half-harmonic-mean formula is associative,
-    # commutative, monotonic, and less than or equal to its inputs.
-    # (Less if its inputs are strictly positive, in fact.)
-    #
-    # So let's test that what I said is true.
-    hm1 = reduce(half_harmonic_mean, inputs)
-    random.shuffle(inputs)
-    hm2 = reduce(half_harmonic_mean, inputs)
-    assert_almost_equal(hm1, hm2)
-
-    inputs[0] *= 2
-    hm3 = reduce(half_harmonic_mean, inputs)
-    assert_less_equal(hm2, hm3)
-
-
-def test_half_harmonic_mean():
-    for count in range(2, 6):
-        for rep in range(10):
-            # get some strictly positive arbitrary numbers
-            inputs = [random.expovariate(0.01)
-                      for i in range(count)]
-            yield check_hm_properties, inputs
-
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -209,42 +209,29 @@ def iter_wordlist(lang, wordlist='combined'):
    return itertools.chain(*get_frequency_list(lang, wordlist))


-def half_harmonic_mean(a, b):
-    """
-    An associative, commutative, monotonic function that returns a value
-    less than or equal to both a and b.
-
-    Used for estimating the frequency of terms made of multiple tokens, given
-    the assumption that the tokens very frequently appear together.
-    """
-    return (a * b) / (a + b)
-
-
 # This dict and inner function are used to implement a "drop everything" cache
 # for word_frequency(); the overheads of lru_cache() are comparable to the time
 # it takes to look up frequencies from scratch, so something faster is needed.
 _wf_cache = {}

 def _word_frequency(word, lang, wordlist, minimum):
-    freqs = get_frequency_dict(lang, wordlist)
-    combined_value = None
    tokens = tokenize(word, lang)
-
-    if len(tokens) == 0:
+    if not tokens:
        return minimum

+    # Frequencies for multiple tokens are combined using the formula
+    #     1 / f = 1 / f1 + 1 / f2 + ...
+    # Thus the resulting frequency is less than any individual frequency, and
+    # the smallest frequency dominates the sum.
+    freqs = get_frequency_dict(lang, wordlist)
+    one_over_result = 0.0
    for token in tokens:
        if token not in freqs:
            # If any word is missing, just return the default value
            return minimum
-        value = freqs[token]
-        if combined_value is None:
-            combined_value = value
-        else:
-            # Combine word values using the half-harmonic-mean formula,
-            # (a * b) / (a + b). This operation is associative.
-            combined_value = half_harmonic_mean(combined_value, value)
-    return max(combined_value, minimum)
+        one_over_result += 1.0 / freqs[token]
+
+    return max(1.0 / one_over_result, minimum)

 def word_frequency(word, lang, wordlist='combined', minimum=0.):
    """