From 8a48e5774902bbfb666e6b3392292277cc72913b Mon Sep 17 00:00:00 2001
From: Robyn Speer <rspeer@luminoso.com>
Date: Tue, 29 Oct 2013 17:21:55 -0400
Subject: [PATCH] now this package has tests

---
 tests/test_build.py   | 23 +++++++++++++++++++++++
 tests/test_queries.py | 37 +++++++++++++++++++++++++++++++++++++
 wordfreq/build.py     |  7 ++++---
 wordfreq/query.py     |  9 +++++++++
 4 files changed, 73 insertions(+), 3 deletions(-)
 create mode 100644 tests/test_build.py
 create mode 100644 tests/test_queries.py

diff --git a/tests/test_build.py b/tests/test_build.py
new file mode 100644
index 0000000..c764dbd
--- /dev/null
+++ b/tests/test_build.py
@@ -0,0 +1,23 @@
+from wordfreq.build import load_all_data
+from wordfreq.transfer import download_and_extract_raw_data
+from wordfreq import config
+import os
+import tempfile
+import shutil
+
+
+def test_build():
+    """
+    Ensure that the build process builds the same DB that gets distributed.
+    """
+    if not os.path.exists(config.RAW_DATA_DIR):
+        download_and_extract_raw_data()
+    tempdir = tempfile.mkdtemp('.wordfreq')
+    try:
+        db_file = os.path.join(tempdir, 'test.db')
+        load_all_data(config.RAW_DATA_DIR, db_file)
+        # Compare raw bytes (the DB is binary) and close both files.
+        with open(db_file, 'rb') as new, open(config.DB_FILENAME, 'rb') as old:
+            assert new.read() == old.read()
+    finally:
+        shutil.rmtree(tempdir)
diff --git a/tests/test_queries.py b/tests/test_queries.py
new file mode 100644
index 0000000..2c427c3
--- /dev/null
+++ b/tests/test_queries.py
@@ -0,0 +1,37 @@
+from __future__ import unicode_literals
+from nose.tools import eq_, assert_almost_equal, assert_greater
+from wordfreq.query import (word_frequency, average_frequency, wordlist_size,
+                            get_wordlists)
+
+
+def test_freq_examples():
+    """Spot-check two known frequencies, one ordering, and the default."""
+    en_freq = word_frequency('normalization', 'en', 'google-books')
+    assert_almost_equal(en_freq, 1.767e-6, places=9)
+
+    fr_freq = word_frequency('normalisation', 'fr', 'leeds-internet')
+    assert_almost_equal(fr_freq, 4.162e-6, places=9)
+
+    # 'lol' must score higher in the Twitter list than in Google Books.
+    twitter_lol = word_frequency('lol', 'xx', 'twitter')
+    books_lol = word_frequency('lol', 'en', 'google-books')
+    assert_greater(twitter_lol, books_lol)
+
+    # A word absent from the list should give back the fourth argument
+    # (-1 here) instead of a frequency.
+    fallback = word_frequency('totallyfakeword', 'en', 'multi', -1)
+    eq_(fallback, -1)
+
+
+def _check_normalized_frequencies(wordlist, lang):
+    """Assert that average frequency times list size is 1.0 to 6 places."""
+    mean_freq = average_frequency(wordlist, lang)
+    n_entries = wordlist_size(wordlist, lang)
+    assert_almost_equal(mean_freq * n_entries, 1.0, places=6)
+
+
+def test_normalized_frequencies():
+    """Yield one normalization check per entry from get_wordlists()."""
+    for entry in get_wordlists():
+        yield (_check_normalized_frequencies,
+               entry['wordlist'], entry['lang'])
diff --git a/wordfreq/build.py b/wordfreq/build.py
index ea50cae..1409388 100644
--- a/wordfreq/build.py
+++ b/wordfreq/build.py
@@ -111,13 +111,14 @@ def save_wordlist_to_db(conn, listname, lang, freqs):
     conn.commit()
 
 
-def create_db(conn):
+def create_db(filename):
     """
-    Create a wordlist database, at the filename specified by `wordfreq.config`.
+    Create a wordlist database at the given `filename`.
 
     This should be safe to run (and have no effect) if the database already
     exists.
     """
+    conn = get_db_connection(filename)
     base_dir = os.path.dirname(filename)
     if not os.path.exists(base_dir):
         os.makedirs(base_dir)
@@ -143,10 +144,10 @@ def load_all_data(source_dir=None, filename=None):
     if filename is None:
         filename = config.DB_FILENAME
 
-    conn = get_db_connection(filename)
     logger.info("Creating database")
-    create_db(conn)
+    create_db(filename)
 
+    conn = get_db_connection(filename)
     logger.info("Loading Leeds internet corpus:")
     for lang in LEEDS_LANGUAGES:
         logger.info("\tLanguage: %s" % lang)
diff --git a/wordfreq/query.py b/wordfreq/query.py
index ff0d4e0..4b2852f 100644
--- a/wordfreq/query.py
+++ b/wordfreq/query.py
@@ -95,6 +95,15 @@ def iter_wordlist(wordlist='multi', lang=None):
     return results
 
 
+def get_wordlists():
+    """
+    Yield a dict ('wordlist', 'lang', 'count') per wordlist/language group.
+    """
+    sql = "SELECT wordlist, lang, count(*) from words GROUP BY wordlist, lang"
+    for name, lang, n_rows in CONN.cursor().execute(sql):
+        yield {'wordlist': name, 'lang': lang, 'count': n_rows}
+
+
 METANL_CONSTANT = 50291582140.06433
 def metanl_word_frequency(word, lang, default=0.):
     """