From df863a5169719a154a95c788f237088704b5e619 Mon Sep 17 00:00:00 2001
From: Rob Speer
Date: Thu, 21 May 2015 20:34:17 -0400
Subject: [PATCH] tests for new wordfreq with full coverage

---
 tests/test.py       | 113 ++++++++++++++++++++++++++++++++++++++++++++
 tests/test_build.py |  60 -----------------------
 2 files changed, 113 insertions(+), 60 deletions(-)
 create mode 100644 tests/test.py
 delete mode 100644 tests/test_build.py

diff --git a/tests/test.py b/tests/test.py
new file mode 100644
index 0000000..0c57773
--- /dev/null
+++ b/tests/test.py
@@ -0,0 +1,113 @@
+from wordfreq import (
+    word_frequency, available_languages, dB_to_freq, iter_wordlist,
+    top_n_list, random_words, random_ascii_words
+)
+from nose.tools import (
+    eq_, assert_almost_equal, assert_greater, assert_less, raises
+)
+
+
+def test_freq_examples():
+    # Stopwords are most common in the correct language
+    assert_greater(word_frequency('the', 'en'),
+                   word_frequency('de', 'en'))
+
+    assert_greater(word_frequency('de', 'es'),
+                   word_frequency('the', 'es'))
+
+
+def test_languages():
+    # Make sure the number of available languages doesn't decrease
+    avail = available_languages()
+    assert_greater(len(avail), 14)
+
+    # Laughter is the universal language
+    for lang in avail:
+        if lang != 'zh':  # we don't have enough Chinese data yet
+            assert_greater(word_frequency('lol', lang), 0)
+
+            # Make up a weirdly verbose language code and make sure
+            # we still get it
+            new_lang_code = '%s-001-x-fake-extension' % lang.upper()
+            assert_greater(word_frequency('lol', new_lang_code), 0)
+
+
+def test_defaults():
+    eq_(word_frequency('esquivalience', 'en'), 0)
+    eq_(word_frequency('esquivalience', 'en', default=1e-6), 1e-6)
+
+
+def test_most_common_words():
+    # If something causes the most common words in well-supported languages to
+    # change, we should know.
+
+    def get_most_common(lang):
+        """
+        Return the single most common word in the language.
+        """
+        return top_n_list(lang, 1)[0]
+
+    eq_(get_most_common('ar'), 'في')
+    eq_(get_most_common('de'), 'der')
+    eq_(get_most_common('en'), 'the')
+    eq_(get_most_common('es'), 'de')
+    eq_(get_most_common('fr'), 'de')
+    eq_(get_most_common('it'), 'di')
+    eq_(get_most_common('ja'), 'の')
+    eq_(get_most_common('nl'), 'de')
+    eq_(get_most_common('pt'), 'de')
+    eq_(get_most_common('ru'), 'в')
+    eq_(get_most_common('zh'), '的')
+
+
+def test_language_matching():
+    freq = word_frequency('的', 'zh')
+    eq_(word_frequency('的', 'zh-TW'), freq)
+    eq_(word_frequency('的', 'zh-CN'), freq)
+    eq_(word_frequency('的', 'zh-Hant'), freq)
+    eq_(word_frequency('的', 'zh-Hans'), freq)
+    eq_(word_frequency('的', 'yue-HK'), freq)
+    eq_(word_frequency('的', 'cmn'), freq)
+
+
+def test_dB_conversion():
+    eq_(dB_to_freq(0), 1.)
+    assert_almost_equal(dB_to_freq(-10), 0.1)
+    assert_almost_equal(dB_to_freq(-60), 1e-6)
+
+
+@raises(ValueError)
+def test_failed_dB_conversion():
+    dB_to_freq(1)
+
+
+def test_tokenization():
+    # We preserve apostrophes within words, so "can't" is a single word in the
+    # data, while the fake word "plan't" can't be found.
+    assert_greater(word_frequency("can't", 'en'), 0)
+    eq_(word_frequency("plan't", 'en'), 0)
+
+    # We do split at other punctuation, causing the word-combining rule to
+    # apply.
+    assert_greater(word_frequency("can.t", 'en'), 0)
+    plant = word_frequency("plan.t", 'en')
+    assert_greater(plant, 0)
+    assert_less(plant, word_frequency('plan', 'en'))
+    assert_less(plant, word_frequency('t', 'en'))
+
+
+def test_not_really_random():
+    # If your xkcd-style password comes out like this, maybe you shouldn't
+    # use it
+    eq_(random_words(nwords=4, lang='en', bits_per_word=0),
+        'the the the the')
+
+    # This not only tests random_ascii_words, it makes sure we didn't end
+    # up with 'eos' as a very common Japanese word
+    eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),
+        'http http http http')
+
+
+@raises(ValueError)
+def test_not_enough_ascii():
+    random_ascii_words(lang='zh')
diff --git a/tests/test_build.py b/tests/test_build.py
deleted file mode 100644
index 56b0977..0000000
--- a/tests/test_build.py
+++ /dev/null
@@ -1,60 +0,0 @@
-from nose.tools import eq_
-from wordfreq.build import load_all_data
-from wordfreq.query import wordlist_info
-from wordfreq.transfer import download_and_extract_raw_data
-from wordfreq import config
-import os
-import tempfile
-import shutil
-import sqlite3
-import sys
-
-PYTHON2 = (sys.version_info.major == 2)
-
-def flatten_list_of_dicts(list_of_dicts):
-    things = [sorted(d.items()) for d in list_of_dicts]
-    return sorted(things)
-
-
-def test_build():
-    """
-    Ensure that the build process builds the same DB that gets distributed.
-    """
-    if not os.path.exists(config.RAW_DATA_DIR):
-        download_and_extract_raw_data()
-
-    tempdir = tempfile.mkdtemp('.wordfreq')
-    try:
-        db_file = os.path.join(tempdir, 'test.db')
-        load_all_data(config.RAW_DATA_DIR, db_file, do_it_anyway=True)
-        conn = sqlite3.connect(db_file)
-
-        # Compare the information we got to the information in the default DB.
-        new_info = flatten_list_of_dicts(wordlist_info(conn))
-        old_info = flatten_list_of_dicts(wordlist_info(None))
-        eq_(len(new_info), len(old_info))
-        for i in range(len(new_info)):
-            # Don't test Greek and emoji on Python 2; we can't make them
-            # consistent with Python 3.
-            if PYTHON2 and ((u'lang', u'el') in new_info[i]):
-                continue
-            if PYTHON2 and ((u'wordlist', u'twitter') in new_info[i]):
-                continue
-            eq_(new_info[i], old_info[i])
    finally:
-        shutil.rmtree(tempdir)
-
-
-def test_python2():
-    """
-    Python 2 got to skip two tests up there, because we built a slightly
-    wrong wordlist. Now let's test that, in normal operation, it will refuse
-    to build this wordlist.
-    """
-    if PYTHON2:
-        try:
-            load_all_data(config.RAW_DATA_DIR, tempfile.mkstemp())
-            assert False, "The database should not have been built"
-        except UnicodeError:
-            # This is the correct case
-            pass
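
A note from review, not part of the patch itself: test_dB_conversion pins down the decibel scale wordfreq uses, where a frequency expressed in dB maps to a proportion as freq = 10 ** (dB / 10). Below is a minimal standalone sketch of that conversion, written against only the behavior the tests above assert; db_to_freq here is an illustrative reimplementation, not wordfreq's own dB_to_freq.

    def db_to_freq(db):
        # Word frequencies are proportions in (0, 1], so 0 dB is the maximum.
        # A positive dB value would claim a proportion greater than 1.
        if db > 0:
            raise ValueError("frequencies above 0 dB would exceed a proportion of 1")
        return 10 ** (db / 10.0)

    # These mirror test_dB_conversion and test_failed_dB_conversion above.
    assert db_to_freq(0) == 1.0
    assert abs(db_to_freq(-10) - 0.1) < 1e-12
    assert abs(db_to_freq(-60) - 1e-6) < 1e-12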
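
Likewise outside the patch: since test.py imports from nose.tools, the suite is meant to run under nose, e.g. with "nosetests tests/test.py". A quick interactive smoke check is sketched below; the expected outputs follow directly from test_most_common_words and test_language_matching above, and any other result would mean the wordlist data has changed.

    >>> from wordfreq import word_frequency, top_n_list
    >>> top_n_list('en', 1)
    ['the']
    >>> word_frequency('的', 'zh-TW') == word_frequency('的', 'zh')
    True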