mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
Change default values to offsets.
This commit is contained in:
parent
8a48e57749
commit
68f7b25cf7
@@ -1,7 +1,7 @@
|
||||
from __future__ import unicode_literals
|
||||
from nose.tools import eq_, assert_almost_equal, assert_greater
|
||||
from wordfreq.query import (word_frequency, average_frequency, wordlist_size,
|
||||
get_wordlists)
|
||||
get_wordlists, metanl_word_frequency)
|
||||
|
||||
|
||||
def test_freq_examples():
|
||||
@@ -9,6 +9,10 @@ def test_freq_examples():
|
||||
word_frequency('normalization', 'en', 'google-books'),
|
||||
1.767e-6, places=9
|
||||
)
|
||||
assert_almost_equal(
|
||||
word_frequency('normalization', 'en', 'google-books', 1e-6),
|
||||
2.767e-6, places=9
|
||||
)
|
||||
assert_almost_equal(
|
||||
word_frequency('normalisation', 'fr', 'leeds-internet'),
|
||||
4.162e-6, places=9
|
||||
@@ -18,11 +22,16 @@ def test_freq_examples():
|
||||
word_frequency('lol', 'en', 'google-books')
|
||||
)
|
||||
eq_(
|
||||
word_frequency('totallyfakeword', 'en', 'multi', -1),
|
||||
-1
|
||||
word_frequency('totallyfakeword', 'en', 'multi', .5),
|
||||
.5
|
||||
)
|
||||
|
||||
|
||||
def test_compatibility():
    """
    Check the metanl-compatible lookup, which takes a single 'word|lang' key.

    The ':' entry for English is expected to come out as exactly 1e9 with no
    offset, and passing an offset of 1e9 is expected to raise that result to
    exactly 2e9.
    """
    key = ':|en'
    plain = metanl_word_frequency(key)
    eq_(plain, 1e9)
    shifted = metanl_word_frequency(key, offset=1e9)
    eq_(shifted, 2e9)
|
||||
|
||||
|
||||
def _check_normalized_frequencies(wordlist, lang):
|
||||
assert_almost_equal(
|
||||
average_frequency(wordlist, lang) * wordlist_size(wordlist, lang),
|
||||
|
@@ -20,21 +20,22 @@ except sqlite3.OperationalError:
|
||||
|
||||
|
||||
@lru_cache(maxsize=CACHE_SIZE)
|
||||
def word_frequency(word, lang, wordlist='multi', default=0.):
|
||||
def word_frequency(word, lang, wordlist='multi', offset=0.):
|
||||
"""
|
||||
Get the frequency of `word` in the language with code `lang`, from the
|
||||
specified `wordlist`.
|
||||
|
||||
If the word doesn't appear in the wordlist, return the default value.
|
||||
The offset gets added to all values, to monotonically account for the
|
||||
fact that we have not observed all possible words.
|
||||
"""
|
||||
c = CONN.cursor()
|
||||
c.execute("SELECT freq from words where word=? and lang=? and wordlist=?",
|
||||
(word, lang, wordlist))
|
||||
row = c.fetchone()
|
||||
if row is None:
|
||||
return default
|
||||
return offset
|
||||
else:
|
||||
return row[0]
|
||||
return row[0] + offset
|
||||
|
||||
|
||||
def wordlist_size(wordlist, lang=None):
|
||||
@@ -105,7 +106,7 @@ def get_wordlists():
|
||||
|
||||
|
||||
METANL_CONSTANT = 50291582140.06433
|
||||
def metanl_word_frequency(word, lang, default=0.):
|
||||
def metanl_word_frequency(wordlang, offset=0.):
|
||||
"""
|
||||
Return a word's frequency in a form that matches the output of
|
||||
metanl 0.6.
|
||||
@@ -120,8 +121,7 @@ def metanl_word_frequency(word, lang, default=0.):
|
||||
same output as metanl. It does this by multiplying the word frequency in
|
||||
the 'multi' list by a big ugly constant. Oh well.
|
||||
"""
|
||||
freq = word_frequency(word, lang, 'multi', default=None)
|
||||
if freq is None:
|
||||
return default
|
||||
else:
|
||||
return freq * METANL_CONSTANT
|
||||
word, lang = wordlang.rsplit('|', 1)
|
||||
freq = word_frequency(word, lang, 'multi',
|
||||
offset = offset / METANL_CONSTANT)
|
||||
return freq * METANL_CONSTANT
|
||||
|
Loading…
Reference in New Issue
Block a user