diff --git a/tests/test.py b/tests/test.py
index afdefae..0a45450 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -44,10 +44,10 @@ def test_twitter():
                        word_frequency('rt', lang, 'combined'))
 
 
-def test_defaults():
+def test_minimums():
     eq_(word_frequency('esquivalience', 'en'), 0)
     eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6)
-
+    eq_(word_frequency('the', 'en', minimum=1), 1)
 
 def test_most_common_words():
     # If something causes the most common words in well-supported languages to
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index 1a5f39c..5f2896a 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -234,8 +234,8 @@ def word_frequency(word, lang, wordlist='combined', minimum=0.):
 
     Words that we believe occur at least once per million tokens, based on
     the average of these lists, will appear in the word frequency list.
-    If you look up a word that's not in the list, you'll get the `minimum`
-    value, which itself defaults to 0.
+
+    The value returned will always be at least as large as `minimum`.
 
     If a word decomposes into multiple tokens, we'll return a smoothed estimate
     of the word frequency that is no greater than the frequency of any of its
@@ -259,7 +259,7 @@ def word_frequency(word, lang, wordlist='combined', minimum=0.):
             # Combine word values using the half-harmonic-mean formula,
             # (a * b) / (a + b). This operation is associative.
             combined_value = half_harmonic_mean(combined_value, value)
-    return combined_value
+    return max(combined_value, minimum)
 
 
 @lru_cache(maxsize=100)