diff --git a/tests/test_build.py b/tests/test_build.py
new file mode 100644
index 0000000..c764dbd
--- /dev/null
+++ b/tests/test_build.py
@@ -0,0 +1,26 @@
+from wordfreq.build import load_all_data
+from wordfreq.transfer import download_and_extract_raw_data
+from wordfreq import config
+import os
+import tempfile
+import shutil
+
+
+def test_build():
+    """
+    Ensure that the build process builds the same DB that gets distributed.
+    """
+    if not os.path.exists(config.RAW_DATA_DIR):
+        download_and_extract_raw_data()
+
+    tempdir = tempfile.mkdtemp('.wordfreq')
+    try:
+        db_file = os.path.join(tempdir, 'test.db')
+        load_all_data(config.RAW_DATA_DIR, db_file)
+
+        # Compare raw bytes (a database file is not text) and close both handles.
+        with open(db_file, 'rb') as built:
+            with open(config.DB_FILENAME, 'rb') as shipped:
+                assert built.read() == shipped.read()
+    finally:
+        shutil.rmtree(tempdir)
diff --git a/tests/test_queries.py b/tests/test_queries.py
new file mode 100644
index 0000000..2c427c3
--- /dev/null
+++ b/tests/test_queries.py
@@ -0,0 +1,39 @@
+from __future__ import unicode_literals
+from nose.tools import eq_, assert_almost_equal, assert_greater
+from wordfreq.query import (word_frequency, average_frequency, wordlist_size,
+                            get_wordlists)
+
+
+def test_freq_examples():
+    assert_almost_equal(
+        word_frequency('normalization', 'en', 'google-books'),
+        1.767e-6, places=9
+    )
+    assert_almost_equal(
+        word_frequency('normalisation', 'fr', 'leeds-internet'),
+        4.162e-6, places=9
+    )
+    assert_greater(
+        word_frequency('lol', 'xx', 'twitter'),
+        word_frequency('lol', 'en', 'google-books')
+    )
+    eq_(
+        word_frequency('totallyfakeword', 'en', 'multi', -1),
+        -1
+    )
+
+
+def _check_normalized_frequencies(wordlist, lang):
+    """Assert that average frequency times list size is 1.0 to 6 places."""
+    assert_almost_equal(
+        average_frequency(wordlist, lang) * wordlist_size(wordlist, lang),
+        1.0, places=6
+    )
+
+
+def test_normalized_frequencies():
+    """Yield one normalization check per entry from get_wordlists()."""
+    for list_info in get_wordlists():
+        wordlist = list_info['wordlist']
+        lang = list_info['lang']
+        yield _check_normalized_frequencies, wordlist, lang
diff --git a/wordfreq/build.py b/wordfreq/build.py
index ea50cae..1409388 100644
--- a/wordfreq/build.py
+++ b/wordfreq/build.py
@@ -111,13 +111,14 @@ def save_wordlist_to_db(conn, listname, lang, freqs):
     conn.commit()
 
 
-def create_db(conn):
+def create_db(filename):
     """
-    Create a wordlist database, at the filename specified by `wordfreq.config`.
+    Create a wordlist database at `filename`.
 
     This should be safe to run (and have no effect) if the database already
     exists.
     """
+    conn = get_db_connection(filename)
     base_dir = os.path.dirname(filename)
     if not os.path.exists(base_dir):
         os.makedirs(base_dir)
@@ -143,10 +144,10 @@ def load_all_data(source_dir=None, filename=None):
     if filename is None:
         filename = config.DB_FILENAME
 
-    conn = get_db_connection(filename)
     logger.info("Creating database")
-    create_db(conn)
+    create_db(filename)
 
+    conn = get_db_connection(filename)
     logger.info("Loading Leeds internet corpus:")
     for lang in LEEDS_LANGUAGES:
         logger.info("\tLanguage: %s" % lang)
diff --git a/wordfreq/query.py b/wordfreq/query.py
index ff0d4e0..4b2852f 100644
--- a/wordfreq/query.py
+++ b/wordfreq/query.py
@@ -95,6 +95,20 @@ def iter_wordlist(wordlist='multi', lang=None):
     return results
 
 
+def get_wordlists():
+    """
+    Yield one dictionary per (wordlist, lang) pair in the `words` table,
+    with the keys 'wordlist', 'lang', and 'count' (the number of rows
+    for that pair).
+    """
+    c = CONN.cursor()
+    results = c.execute(
+        "SELECT wordlist, lang, count(*) from words GROUP BY wordlist, lang"
+    )
+    for wordlist, lang, count in results:
+        yield {'wordlist': wordlist, 'lang': lang, 'count': count}
+
+
 METANL_CONSTANT = 50291582140.06433
 def metanl_word_frequency(word, lang, default=0.):
     """