now this package has tests

This commit is contained in:
Robyn Speer 2013-10-29 17:21:55 -04:00
parent a95d88d1b9
commit 8a48e57749
4 changed files with 73 additions and 3 deletions

23
tests/test_build.py Normal file
View File

@ -0,0 +1,23 @@
from wordfreq.build import load_all_data
from wordfreq.transfer import download_and_extract_raw_data
from wordfreq import config
import os
import tempfile
import shutil
def test_build():
    """
    Ensure that the build process builds the same DB that gets distributed.

    If `config.RAW_DATA_DIR` does not exist, the raw data is downloaded and
    extracted first. A database is then built into a fresh temporary
    directory and compared byte-for-byte with the file at
    `config.DB_FILENAME`. The temporary directory is removed afterwards,
    whether or not the comparison succeeds.
    """
    if not os.path.exists(config.RAW_DATA_DIR):
        download_and_extract_raw_data()

    tempdir = tempfile.mkdtemp('.wordfreq')
    try:
        db_file = os.path.join(tempdir, 'test.db')
        load_all_data(config.RAW_DATA_DIR, db_file)

        # These are database files, not text: read them in binary mode so the
        # contents are compared as raw bytes, with no decoding or newline
        # translation. The `with` blocks close both handles before the
        # temporary directory is removed in the `finally` clause.
        with open(db_file, 'rb') as built:
            built_bytes = built.read()
        with open(config.DB_FILENAME, 'rb') as distributed:
            distributed_bytes = distributed.read()

        assert built_bytes == distributed_bytes
    finally:
        shutil.rmtree(tempdir)

37
tests/test_queries.py Normal file
View File

@ -0,0 +1,37 @@
from __future__ import unicode_literals
from nose.tools import eq_, assert_almost_equal, assert_greater
from wordfreq.query import (word_frequency, average_frequency, wordlist_size,
get_wordlists)
def test_freq_examples():
    """
    Spot-check `word_frequency` against a handful of fixed expectations.
    """
    # Two lookups whose results are pinned to nine decimal places.
    en_books = word_frequency('normalization', 'en', 'google-books')
    assert_almost_equal(en_books, 1.767e-6, places=9)

    fr_leeds = word_frequency('normalisation', 'fr', 'leeds-internet')
    assert_almost_equal(fr_leeds, 4.162e-6, places=9)

    # 'lol' has to score strictly higher in the 'twitter' list than in
    # 'google-books'.
    lol_twitter = word_frequency('lol', 'xx', 'twitter')
    lol_books = word_frequency('lol', 'en', 'google-books')
    assert_greater(lol_twitter, lol_books)

    # The fourth argument must come back unchanged for this made-up word
    # (presumably it is the default for unknown words -- confirm against
    # `word_frequency`).
    missing = word_frequency('totallyfakeword', 'en', 'multi', -1)
    eq_(missing, -1)
def _check_normalized_frequencies(wordlist, lang):
    """
    Check that, for one wordlist/language pair, the average frequency
    multiplied by the wordlist size equals 1.0 to six decimal places.
    """
    mean_freq = average_frequency(wordlist, lang)
    n_entries = wordlist_size(wordlist, lang)
    total = mean_freq * n_entries
    assert_almost_equal(total, 1.0, places=6)
def test_normalized_frequencies():
    """
    Generator test: yield one `_check_normalized_frequencies` case for every
    entry produced by `get_wordlists()`.
    """
    for entry in get_wordlists():
        yield _check_normalized_frequencies, entry['wordlist'], entry['lang']

View File

@ -111,13 +111,14 @@ def save_wordlist_to_db(conn, listname, lang, freqs):
conn.commit() conn.commit()
def create_db(conn): def create_db(filename):
""" """
Create a wordlist database, at the filename specified by `wordfreq.config`. Create a wordlist database, at the filename specified by `wordfreq.config`.
This should be safe to run (and have no effect) if the database already This should be safe to run (and have no effect) if the database already
exists. exists.
""" """
conn = get_db_connection(filename)
base_dir = os.path.dirname(filename) base_dir = os.path.dirname(filename)
if not os.path.exists(base_dir): if not os.path.exists(base_dir):
os.makedirs(base_dir) os.makedirs(base_dir)
@ -143,10 +144,10 @@ def load_all_data(source_dir=None, filename=None):
if filename is None: if filename is None:
filename = config.DB_FILENAME filename = config.DB_FILENAME
conn = get_db_connection(filename)
logger.info("Creating database") logger.info("Creating database")
create_db(conn) create_db(filename)
conn = get_db_connection(filename)
logger.info("Loading Leeds internet corpus:") logger.info("Loading Leeds internet corpus:")
for lang in LEEDS_LANGUAGES: for lang in LEEDS_LANGUAGES:
logger.info("\tLanguage: %s" % lang) logger.info("\tLanguage: %s" % lang)

View File

@ -95,6 +95,15 @@ def iter_wordlist(wordlist='multi', lang=None):
return results return results
def get_wordlists():
    """
    Iterate over the distinct (wordlist, lang) combinations in the `words`
    table.

    Yields one dict per combination with the keys 'wordlist', 'lang', and
    'count', where 'count' is the number of rows in that group.
    """
    cursor = CONN.cursor()
    query = (
        "SELECT wordlist, lang, count(*) from words GROUP BY wordlist, lang"
    )
    for row in cursor.execute(query):
        wordlist, lang, count = row
        yield {'wordlist': wordlist, 'lang': lang, 'count': count}
METANL_CONSTANT = 50291582140.06433 METANL_CONSTANT = 50291582140.06433
def metanl_word_frequency(word, lang, default=0.): def metanl_word_frequency(word, lang, default=0.):
""" """