Express the combining of word frequencies in an explicitly associative and commutative way.

2024-12-23 17:31:41 +00:00 · 2015-07-09 15:26:54 -04:00 · 2015-07-09 15:26:54 -04:00 · 32b4033d63
commit 32b4033d63
parent 2821f23e79
4 changed files with 17 additions and 67 deletions
--- a/tests/test.py
+++ b/tests/test.py
@@ -1,7 +1,6 @@
 from wordfreq import (
    word_frequency, available_languages, cB_to_freq,
-    top_n_list, random_words, random_ascii_words, tokenize,
+    top_n_list, random_words, random_ascii_words, tokenize
-    half_harmonic_mean
 )
 from nose.tools import (
    eq_, assert_almost_equal, assert_greater, raises
@@ -114,12 +113,9 @@ def test_phrase_freq():
    plant = word_frequency("plan.t", 'en')
    assert_greater(plant, 0)
    assert_almost_equal(
-        plant,
+        1.0 / plant,
-        half_harmonic_mean(
+        1.0 / word_frequency('plan', 'en') + 1.0 / word_frequency('t', 'en')
-            word_frequency('plan', 'en'),
+    )
            word_frequency('t', 'en')
            )
        )
 def test_not_really_random():
--- a/tests/test_japanese.py
+++ b/tests/test_japanese.py
@@ -1,5 +1,5 @@
 from nose.tools import eq_, assert_almost_equal
-from wordfreq import tokenize, word_frequency, half_harmonic_mean
+from wordfreq import tokenize, word_frequency
 def test_tokens():
@@ -17,10 +17,7 @@ def test_combination():
        ohayou_freq / 2
    )
    assert_almost_equal(
-        word_frequency('おはようございます', 'ja'),
+        1.0 / word_frequency('おはようございます', 'ja'),
-        half_harmonic_mean(
+        1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
            half_harmonic_mean(ohayou_freq, gozai_freq),
            masu_freq
        )
    )
--- a/tests/test_math.py
+++ b/tests/test_math.py
@@ -1,30 +0,0 @@
 from nose.tools import assert_less_equal, assert_almost_equal
 from wordfreq import half_harmonic_mean
 from functools import reduce
 import random
 def check_hm_properties(inputs):
    # Checks, on one list of numbers, the properties claimed for the
    # half-harmonic-mean formula: folding it over the list gives the same
    # result in any order (associative and commutative), and raising one
    # input never lowers the result (monotonic).
    #
    # NOTE: `inputs` is shuffled and modified in place.
    folded = reduce(half_harmonic_mean, inputs)
    random.shuffle(inputs)
    folded_shuffled = reduce(half_harmonic_mean, inputs)
    # Order of combination must not matter.
    assert_almost_equal(folded, folded_shuffled)
    # Doubling one input must not decrease the combined value.
    inputs[0] *= 2
    folded_raised = reduce(half_harmonic_mean, inputs)
    assert_less_equal(folded_shuffled, folded_raised)
 def test_half_harmonic_mean():
    # Generator-style test: yields (check function, inputs) pairs, ten
    # random input lists for each list length from 2 to 5.
    for size in range(2, 6):
        for _ in range(10):
            # arbitrary numbers from an exponential distribution with
            # rate 0.01, intended to be strictly positive
            samples = [random.expovariate(0.01) for _ in range(size)]
            yield check_hm_properties, samples
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -209,42 +209,29 @@ def iter_wordlist(lang, wordlist='combined'):
    return itertools.chain(*get_frequency_list(lang, wordlist))
 def half_harmonic_mean(a, b):
    """
    Combine two values as ``(a * b) / (a + b)``, half of their harmonic mean.

    Equivalently ``1 / result = 1 / a + 1 / b``. For positive inputs this is
    an associative, commutative, monotonic function that returns a value
    less than or equal to both a and b.

    Used for estimating the frequency of terms made of multiple tokens, given
    the assumption that the tokens very frequently appear together.

    When both inputs are zero the result is 0.0 (the limit of the formula as
    both inputs approach zero) rather than a ZeroDivisionError.
    """
    if a == 0 and b == 0:
        # 0 / 0 is undefined; the formula tends to 0 as both inputs do.
        return 0.0
    return (a * b) / (a + b)
 # This dict and inner function are used to implement a "drop everything" cache
 # for word_frequency(); the overheads of lru_cache() are comparable to the time
 # it takes to look up frequencies from scratch, so something faster is needed.
 _wf_cache = {}
 def _word_frequency(word, lang, wordlist, minimum):
    freqs = get_frequency_dict(lang, wordlist)
    combined_value = None
    tokens = tokenize(word, lang)
-
+    if not tokens:
    if len(tokens) == 0:
        return minimum
    # Frequencies for multiple tokens are combined using the formula
    #     1 / f = 1 / f1 + 1 / f2 + ...
    # Thus the resulting frequency is less than any individual frequency, and
    # the smallest frequency dominates the sum.
    freqs = get_frequency_dict(lang, wordlist)
    one_over_result = 0.0
    for token in tokens:
        if token not in freqs:
            # If any word is missing, just return the default value
            return minimum
-        value = freqs[token]
+        one_over_result += 1.0 / freqs[token]
-        if combined_value is None:
+
-            combined_value = value
+    return max(1.0 / one_over_result, minimum)
        else:
            # Combine word values using the half-harmonic-mean formula,
            # (a * b) / (a + b). This operation is associative.
            combined_value = half_harmonic_mean(combined_value, value)
    return max(combined_value, minimum)
 def word_frequency(word, lang, wordlist='combined', minimum=0.):
    """