Change default values to offsets.

This commit is contained in:
Robyn Speer 2013-10-29 18:06:47 -04:00
parent 8a48e57749
commit 68f7b25cf7
2 changed files with 22 additions and 13 deletions

View File

@ -1,7 +1,7 @@
from __future__ import unicode_literals
from nose.tools import eq_, assert_almost_equal, assert_greater
from wordfreq.query import (word_frequency, average_frequency, wordlist_size,
get_wordlists)
get_wordlists, metanl_word_frequency)
def test_freq_examples():
@ -9,6 +9,10 @@ def test_freq_examples():
word_frequency('normalization', 'en', 'google-books'),
1.767e-6, places=9
)
assert_almost_equal(
word_frequency('normalization', 'en', 'google-books', 1e-6),
2.767e-6, places=9
)
assert_almost_equal(
word_frequency('normalisation', 'fr', 'leeds-internet'),
4.162e-6, places=9
@ -18,11 +22,16 @@ def test_freq_examples():
word_frequency('lol', 'en', 'google-books')
)
eq_(
word_frequency('totallyfakeword', 'en', 'multi', -1),
-1
word_frequency('totallyfakeword', 'en', 'multi', .5),
.5
)
def test_compatibility():
    """
    Check the metanl-compatible frequency of the ':|en' entry, both with
    the default offset and with an explicit one.
    """
    colon_entry = ':|en'

    # With no offset, the value is expected to be exactly 1e9.
    eq_(metanl_word_frequency(colon_entry), 1e9)

    # An offset of 1e9 is expected to raise the result by that same amount.
    eq_(metanl_word_frequency(colon_entry, offset=1e9), 2e9)
def _check_normalized_frequencies(wordlist, lang):
assert_almost_equal(
average_frequency(wordlist, lang) * wordlist_size(wordlist, lang),

View File

@ -20,21 +20,22 @@ except sqlite3.OperationalError:
@lru_cache(maxsize=CACHE_SIZE)
def word_frequency(word, lang, wordlist='multi', offset=0.):
    """
    Get the frequency of `word` in the language with code `lang`, from the
    specified `wordlist`.

    The offset gets added to all values, to monotonically account for the
    fact that we have not observed all possible words. A word that does not
    appear in the wordlist has no stored frequency, so the result for such
    a word is `offset` itself.
    """
    c = CONN.cursor()
    c.execute("SELECT freq from words where word=? and lang=? and wordlist=?",
              (word, lang, wordlist))
    row = c.fetchone()
    if row is None:
        # No row for this (word, lang, wordlist): only the offset remains.
        return offset
    return row[0] + offset
def wordlist_size(wordlist, lang=None):
@ -105,7 +106,7 @@ def get_wordlists():
METANL_CONSTANT = 50291582140.06433
def metanl_word_frequency(word, lang, default=0.):
def metanl_word_frequency(wordlang, offset=0.):
"""
Return a word's frequency in a form that matches the output of
metanl 0.6.
@ -120,8 +121,7 @@ def metanl_word_frequency(word, lang, default=0.):
same output as metanl. It does this by multiplying the word frequency in
the 'multi' list by a big ugly constant. Oh well.
"""
freq = word_frequency(word, lang, 'multi', default=None)
if freq is None:
return default
else:
return freq * METANL_CONSTANT
word, lang = wordlang.rsplit('|', 1)
freq = word_frequency(word, lang, 'multi',
offset = offset / METANL_CONSTANT)
return freq * METANL_CONSTANT