Change default values to offsets.

2024-12-23 09:21:37 +00:00 · 2013-10-29 18:06:47 -04:00 · 2013-10-29 18:06:47 -04:00 · 68f7b25cf7
commit 68f7b25cf7
parent 8a48e57749
2 changed files with 22 additions and 13 deletions
--- a/tests/test_queries.py
+++ b/tests/test_queries.py
@@ -1,7 +1,7 @@
 from __future__ import unicode_literals
 from nose.tools import eq_, assert_almost_equal, assert_greater
 from wordfreq.query import (word_frequency, average_frequency, wordlist_size,
-                            get_wordlists)
+                            get_wordlists, metanl_word_frequency)


 def test_freq_examples():
@@ -9,6 +9,10 @@ def test_freq_examples():
        word_frequency('normalization', 'en', 'google-books'),
        1.767e-6, places=9
    )
+    assert_almost_equal(
+        word_frequency('normalization', 'en', 'google-books', 1e-6),
+        2.767e-6, places=9
+    )
    assert_almost_equal(
        word_frequency('normalisation', 'fr', 'leeds-internet'),
        4.162e-6, places=9
@@ -18,11 +22,16 @@ def test_freq_examples():
        word_frequency('lol', 'en', 'google-books')
    )
    eq_(
-        word_frequency('totallyfakeword', 'en', 'multi', -1),
-        -1
+        word_frequency('totallyfakeword', 'en', 'multi', .5),
+        .5
    )


+def test_compatibility():
+    eq_(metanl_word_frequency(':|en'), 1e9)
+    eq_(metanl_word_frequency(':|en', offset=1e9), 2e9)
+
+
 def _check_normalized_frequencies(wordlist, lang):
    assert_almost_equal(
        average_frequency(wordlist, lang) * wordlist_size(wordlist, lang),
--- a/wordfreq/query.py
+++ b/wordfreq/query.py
@@ -20,21 +20,22 @@ except sqlite3.OperationalError:


@lru_cache(maxsize=CACHE_SIZE)
-def word_frequency(word, lang, wordlist='multi', default=0.):
+def word_frequency(word, lang, wordlist='multi', offset=0.):
    """
    Get the frequency of `word` in the language with code `lang`, from the
    specified `wordlist`.

-    If the word doesn't appear in the wordlist, return the default value.
+    The offset gets added to all values, to monotonically account for the
+    fact that we have not observed all possible words.
    """
    c = CONN.cursor()
    c.execute("SELECT freq from words where word=? and lang=? and wordlist=?",
              (word, lang, wordlist))
    row = c.fetchone()
    if row is None:
-        return default
+        return offset
    else:
-        return row[0]
+        return row[0] + offset


 def wordlist_size(wordlist, lang=None):
@@ -105,7 +106,7 @@ def get_wordlists():


 METANL_CONSTANT = 50291582140.06433
-def metanl_word_frequency(word, lang, default=0.):
+def metanl_word_frequency(wordlang, offset=0.):
    """
    Return a word's frequency in a form that matches the output of
    metanl 0.6.
@@ -120,8 +121,7 @@ def metanl_word_frequency(word, lang, default=0.):
    same output as metanl. It does this by multiplying the word frequency in
    the 'multi' list by a big ugly constant. Oh well.
    """
-    freq = word_frequency(word, lang, 'multi', default=None)
-    if freq is None:
-        return default
-    else:
-        return freq * METANL_CONSTANT
+    word, lang = wordlang.rsplit('|', 1)
+    freq = word_frequency(word, lang, 'multi',
+                          offset = offset / METANL_CONSTANT)
+    return freq * METANL_CONSTANT