diff --git a/tests/test.py b/tests/test.py
index ba52fb8..59d40f8 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -1,7 +1,6 @@
 from wordfreq import (
     word_frequency, available_languages, cB_to_freq,
-    top_n_list, random_words, random_ascii_words, tokenize,
-    half_harmonic_mean
+    top_n_list, random_words, random_ascii_words, tokenize
 )
 from nose.tools import (
     eq_, assert_almost_equal, assert_greater, raises
@@ -114,12 +113,9 @@ def test_phrase_freq():
     plant = word_frequency("plan.t", 'en')
     assert_greater(plant, 0)
     assert_almost_equal(
-        plant,
-        half_harmonic_mean(
-            word_frequency('plan', 'en'),
-            word_frequency('t', 'en')
-        )
-    )
+        1.0 / plant,
+        1.0 / word_frequency('plan', 'en') + 1.0 / word_frequency('t', 'en')
+    )
 
 
 def test_not_really_random():
diff --git a/tests/test_japanese.py b/tests/test_japanese.py
index a21eedd..d5a73b3 100644
--- a/tests/test_japanese.py
+++ b/tests/test_japanese.py
@@ -1,5 +1,5 @@
 from nose.tools import eq_, assert_almost_equal
-from wordfreq import tokenize, word_frequency, half_harmonic_mean
+from wordfreq import tokenize, word_frequency
 
 
 def test_tokens():
@@ -17,10 +17,7 @@ def test_combination():
         ohayou_freq / 2
     )
     assert_almost_equal(
-        word_frequency('おはようございます', 'ja'),
-        half_harmonic_mean(
-            half_harmonic_mean(ohayou_freq, gozai_freq),
-            masu_freq
-        )
+        1.0 / word_frequency('おはようございます', 'ja'),
+        1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
     )
 
diff --git a/tests/test_math.py b/tests/test_math.py
deleted file mode 100644
index c2b3746..0000000
--- a/tests/test_math.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from nose.tools import assert_less_equal, assert_almost_equal
-from wordfreq import half_harmonic_mean
-from functools import reduce
-import random
-
-
-def check_hm_properties(inputs):
-    # I asserted that the half-harmonic-mean formula is associative,
-    # commutative, monotonic, and less than or equal to its inputs.
-    # (Less if its inputs are strictly positive, in fact.)
-    #
-    # So let's test that what I said is true.
-    hm1 = reduce(half_harmonic_mean, inputs)
-    random.shuffle(inputs)
-    hm2 = reduce(half_harmonic_mean, inputs)
-    assert_almost_equal(hm1, hm2)
-
-    inputs[0] *= 2
-    hm3 = reduce(half_harmonic_mean, inputs)
-    assert_less_equal(hm2, hm3)
-
-
-def test_half_harmonic_mean():
-    for count in range(2, 6):
-        for rep in range(10):
-            # get some strictly positive arbitrary numbers
-            inputs = [random.expovariate(0.01)
-                      for i in range(count)]
-            yield check_hm_properties, inputs
-
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index 47ac4a6..30c7342 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -209,42 +209,29 @@ def iter_wordlist(lang, wordlist='combined'):
     return itertools.chain(*get_frequency_list(lang, wordlist))
 
 
-def half_harmonic_mean(a, b):
-    """
-    An associative, commutative, monotonic function that returns a value
-    less than or equal to both a and b.
-
-    Used for estimating the frequency of terms made of multiple tokens, given
-    the assumption that the tokens very frequently appear together.
-    """
-    return (a * b) / (a + b)
-
-
 # This dict and inner function are used to implement a "drop everything" cache
 # for word_frequency(); the overheads of lru_cache() are comparable to the time
 # it takes to look up frequencies from scratch, so something faster is needed.
 _wf_cache = {}
 
 
 def _word_frequency(word, lang, wordlist, minimum):
-    freqs = get_frequency_dict(lang, wordlist)
-    combined_value = None
     tokens = tokenize(word, lang)
-
-    if len(tokens) == 0:
+    if not tokens:
         return minimum
 
+    # Frequencies for multiple tokens are combined using the formula
+    #     1 / f = 1 / f1 + 1 / f2 + ...
+    # Thus the resulting frequency is less than any individual frequency, and
+    # the smallest frequency dominates the sum.
+    freqs = get_frequency_dict(lang, wordlist)
+    one_over_result = 0.0
     for token in tokens:
         if token not in freqs:
             # If any word is missing, just return the default value
             return minimum
-        value = freqs[token]
-        if combined_value is None:
-            combined_value = value
-        else:
-            # Combine word values using the half-harmonic-mean formula,
-            # (a * b) / (a + b). This operation is associative.
-            combined_value = half_harmonic_mean(combined_value, value)
-    return max(combined_value, minimum)
+        one_over_result += 1.0 / freqs[token]
+
+    return max(1.0 / one_over_result, minimum)
 
 
 def word_frequency(word, lang, wordlist='combined', minimum=0.):
     """
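
For reference, a small standalone sketch (not part of the patch itself) of why the old and new formulations agree: since `1 / ((a * b) / (a + b)) = 1/a + 1/b`, folding the removed `half_harmonic_mean` over a list of frequencies telescopes into exactly the reciprocal sum that the new `_word_frequency` accumulates. The sample frequencies below are arbitrary illustration values.

```python
from functools import reduce


def half_harmonic_mean(a, b):
    # The pairwise combiner this patch removes: (a * b) / (a + b).
    return (a * b) / (a + b)


def combined_frequency(freqs):
    # The new formulation: 1 / f = 1 / f1 + 1 / f2 + ...
    return 1.0 / sum(1.0 / f for f in freqs)


freqs = [1e-3, 2e-4, 5e-5]  # arbitrary strictly positive token frequencies
old = reduce(half_harmonic_mean, freqs)
new = combined_frequency(freqs)
assert abs(old - new) < 1e-12  # equal up to floating-point error
print(new)  # ~3.85e-05: below the smallest input frequency
```

Either way, the combined frequency is less than any individual token frequency and the rarest token dominates the result; the reciprocal-sum form just computes it in one pass, without a pairwise helper that needs its own algebraic-property tests.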