Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 09:21:37 +00:00)
Commit 8a48e57749 (parent a95d88d1b9): now this package has tests
tests/test_build.py (new file, 23 lines)
@@ -0,0 +1,23 @@
from wordfreq.build import load_all_data
from wordfreq.transfer import download_and_extract_raw_data
from wordfreq import config
import os
import tempfile
import shutil


def test_build():
    """
    Ensure that the build process builds the same DB that gets distributed.
    """
    if not os.path.exists(config.RAW_DATA_DIR):
        download_and_extract_raw_data()

    tempdir = tempfile.mkdtemp('.wordfreq')
    try:
        db_file = os.path.join(tempdir, 'test.db')
        load_all_data(config.RAW_DATA_DIR, db_file)

        assert open(db_file).read() == open(config.DB_FILENAME).read()
    finally:
        shutil.rmtree(tempdir)
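For context, the workflow this test verifies can be reproduced with the same calls the test makes; a minimal sketch (not part of this commit), assuming the default config paths:

# Sketch only: rebuild the distributed database using the same API the test exercises.
import os

from wordfreq import config
from wordfreq.build import load_all_data
from wordfreq.transfer import download_and_extract_raw_data

if not os.path.exists(config.RAW_DATA_DIR):
    download_and_extract_raw_data()                      # fetch the raw wordlist data once
load_all_data(config.RAW_DATA_DIR, config.DB_FILENAME)   # build straight to the shipped DB path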
tests/test_queries.py (new file, 37 lines)
@@ -0,0 +1,37 @@
from __future__ import unicode_literals
from nose.tools import eq_, assert_almost_equal, assert_greater
from wordfreq.query import (word_frequency, average_frequency, wordlist_size,
                            get_wordlists)


def test_freq_examples():
    assert_almost_equal(
        word_frequency('normalization', 'en', 'google-books'),
        1.767e-6, places=9
    )
    assert_almost_equal(
        word_frequency('normalisation', 'fr', 'leeds-internet'),
        4.162e-6, places=9
    )
    assert_greater(
        word_frequency('lol', 'xx', 'twitter'),
        word_frequency('lol', 'en', 'google-books')
    )
    eq_(
        word_frequency('totallyfakeword', 'en', 'multi', -1),
        -1
    )


def _check_normalized_frequencies(wordlist, lang):
    assert_almost_equal(
        average_frequency(wordlist, lang) * wordlist_size(wordlist, lang),
        1.0, places=6
    )


def test_normalized_frequencies():
    for list_info in get_wordlists():
        wordlist = list_info['wordlist']
        lang = list_info['lang']
        yield _check_normalized_frequencies, wordlist, lang
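The frequency checks above double as a reference for the query API; a minimal interactive sketch (assuming the bundled database is installed) would be:

# Sketch only: looking up word frequencies directly.
from wordfreq.query import word_frequency

print(word_frequency('normalization', 'en', 'google-books'))   # ~1.767e-6 per the test above
print(word_frequency('lol', 'xx', 'twitter'))                   # higher on Twitter than in Google Books
print(word_frequency('totallyfakeword', 'en', 'multi', 0))      # 4th argument is the value for missing words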
wordfreq/build.py
@@ -111,13 +111,14 @@ def save_wordlist_to_db(conn, listname, lang, freqs):
     conn.commit()


-def create_db(conn):
+def create_db(filename):
     """
     Create a wordlist database, at the filename specified by `wordfreq.config`.

     This should be safe to run (and have no effect) if the database already
     exists.
     """
+    conn = get_db_connection(filename)
     base_dir = os.path.dirname(filename)
     if not os.path.exists(base_dir):
         os.makedirs(base_dir)
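The upshot of this hunk is that create_db now takes a filename and opens its own connection via get_db_connection, instead of expecting the caller to pass one in. A hypothetical caller after this change (not from this commit) would look roughly like:

# Sketch only: create_db is now called with a filename, not a connection.
from wordfreq import config
from wordfreq.build import create_db

create_db(config.DB_FILENAME)   # opens its own DB connection internally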
@@ -143,10 +144,10 @@ def load_all_data(source_dir=None, filename=None):
     if filename is None:
         filename = config.DB_FILENAME

-    conn = get_db_connection(filename)
     logger.info("Creating database")
-    create_db(conn)
+    create_db(filename)

+    conn = get_db_connection(filename)
     logger.info("Loading Leeds internet corpus:")
     for lang in LEEDS_LANGUAGES:
         logger.info("\tLanguage: %s" % lang)
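Read together with the previous hunk, the reordering keeps load_all_data consistent with the new create_db: the schema is created on create_db's own connection first, and only then is the loading connection opened. A condensed sketch of the resulting flow, assuming the rest of the function is unchanged:

# Condensed sketch of load_all_data after this commit (not the full function).
def load_all_data(source_dir=None, filename=None):
    if filename is None:
        filename = config.DB_FILENAME
    create_db(filename)                  # create the DB file and schema first
    conn = get_db_connection(filename)   # then open the connection used for loading
    # ... load the Leeds internet corpus and the other wordlists into conn ...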
wordfreq/query.py
@@ -95,6 +95,15 @@ def iter_wordlist(wordlist='multi', lang=None):
     return results


+def get_wordlists():
+    c = CONN.cursor()
+    results = c.execute(
+        "SELECT wordlist, lang, count(*) from words GROUP BY wordlist, lang"
+    )
+    for wordlist, lang, count in results:
+        yield {'wordlist': wordlist, 'lang': lang, 'count': count}
+
+
 METANL_CONSTANT = 50291582140.06433
 def metanl_word_frequency(word, lang, default=0.):
     """
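The new get_wordlists generator is what test_normalized_frequencies iterates over; a minimal usage sketch (assuming the database is installed):

# Sketch only: enumerate every (wordlist, lang) pair with its word count.
from wordfreq.query import get_wordlists

for info in get_wordlists():
    print(info['wordlist'], info['lang'], info['count'])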