diff --git a/tests/test.py b/tests/test.py
new file mode 100644
index 0000000..0c57773
--- /dev/null
+++ b/tests/test.py
@@ -0,0 +1,113 @@
+from wordfreq import (
+    word_frequency, available_languages, dB_to_freq, iter_wordlist,
+    top_n_list, random_words, random_ascii_words
+)
+from nose.tools import (
+    eq_, assert_almost_equal, assert_greater, assert_less, raises
+)
+
+
+def test_freq_examples():
+    # Stopwords are most common in the correct language
+    assert_greater(word_frequency('the', 'en'),
+                   word_frequency('de', 'en'))
+
+    assert_greater(word_frequency('de', 'es'),
+                   word_frequency('the', 'es'))
+
+
+def test_languages():
+    # Make sure the number of available languages doesn't decrease
+    avail = available_languages()
+    assert_greater(len(avail), 14)
+
+    # Laughter is the universal language
+    for lang in avail:
+        if lang != 'zh':  # we don't have enough Chinese data yet
+            assert_greater(word_frequency('lol', lang), 0)
+
+            # Make up a weirdly verbose language code and make sure
+            # we still get it
+            new_lang_code = '%s-001-x-fake-extension' % lang.upper()
+            assert_greater(word_frequency('lol', new_lang_code), 0)
+
+
+def test_defaults():
+    eq_(word_frequency('esquivalience', 'en'), 0)
+    eq_(word_frequency('esquivalience', 'en', default=1e-6), 1e-6)
+
+
+def test_most_common_words():
+    # If something causes the most common words in well-supported languages to
+    # change, we should know.
+
+    def get_most_common(lang):
+        """
+        Return the single most common word in the language.
+        """
+        return top_n_list(lang, 1)[0]
+
+    eq_(get_most_common('ar'), 'في')
+    eq_(get_most_common('de'), 'der')
+    eq_(get_most_common('en'), 'the')
+    eq_(get_most_common('es'), 'de')
+    eq_(get_most_common('fr'), 'de')
+    eq_(get_most_common('it'), 'di')
+    eq_(get_most_common('ja'), 'の')
+    eq_(get_most_common('nl'), 'de')
+    eq_(get_most_common('pt'), 'de')
+    eq_(get_most_common('ru'), 'в')
+    eq_(get_most_common('zh'), '的')
+
+
+def test_language_matching():
+    freq = word_frequency('的', 'zh')
+    eq_(word_frequency('的', 'zh-TW'), freq)
+    eq_(word_frequency('的', 'zh-CN'), freq)
+    eq_(word_frequency('的', 'zh-Hant'), freq)
+    eq_(word_frequency('的', 'zh-Hans'), freq)
+    eq_(word_frequency('的', 'yue-HK'), freq)
+    eq_(word_frequency('的', 'cmn'), freq)
+
+
+def test_dB_conversion():
+    eq_(dB_to_freq(0), 1.)
+    assert_almost_equal(dB_to_freq(-10), 0.1)
+    assert_almost_equal(dB_to_freq(-60), 1e-6)
+
+
+@raises(ValueError)
+def test_failed_dB_conversion():
+    dB_to_freq(1)
+
+
+def test_tokenization():
+    # We preserve apostrophes within words, so "can't" is a single word in the
+    # data, while the fake word "plan't" can't be found.
+    assert_greater(word_frequency("can't", 'en'), 0)
+    eq_(word_frequency("plan't", 'en'), 0)
+
+    # We do split at other punctuation, causing the word-combining rule to
+    # apply.
+ assert_greater(word_frequency("can.t", 'en'), 0) + plant = word_frequency("plan.t", 'en') + assert_greater(plant, 0) + assert_less(plant, word_frequency('plan', 'en')) + assert_less(plant, word_frequency('t', 'en')) + + +def test_not_really_random(): + # If your xkcd-style password comes out like this, maybe you shouldn't + # use it + eq_(random_words(nwords=4, lang='en', bits_per_word=0), + 'the the the the') + + # This not only tests random_ascii_words, it makes sure we didn't end + # up with 'eos' as a very common Japanese word + eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0), + 'http http http http') + + +@raises(ValueError) +def test_not_enough_ascii(): + random_ascii_words(lang='zh') diff --git a/tests/test_build.py b/tests/test_build.py deleted file mode 100644 index 56b0977..0000000 --- a/tests/test_build.py +++ /dev/null @@ -1,60 +0,0 @@ -from nose.tools import eq_ -from wordfreq.build import load_all_data -from wordfreq.query import wordlist_info -from wordfreq.transfer import download_and_extract_raw_data -from wordfreq import config -import os -import tempfile -import shutil -import sqlite3 -import sys - -PYTHON2 = (sys.version_info.major == 2) - -def flatten_list_of_dicts(list_of_dicts): - things = [sorted(d.items()) for d in list_of_dicts] - return sorted(things) - - -def test_build(): - """ - Ensure that the build process builds the same DB that gets distributed. - """ - if not os.path.exists(config.RAW_DATA_DIR): - download_and_extract_raw_data() - - tempdir = tempfile.mkdtemp('.wordfreq') - try: - db_file = os.path.join(tempdir, 'test.db') - load_all_data(config.RAW_DATA_DIR, db_file, do_it_anyway=True) - conn = sqlite3.connect(db_file) - - # Compare the information we got to the information in the default DB. - new_info = flatten_list_of_dicts(wordlist_info(conn)) - old_info = flatten_list_of_dicts(wordlist_info(None)) - eq_(len(new_info), len(old_info)) - for i in range(len(new_info)): - # Don't test Greek and emoji on Python 2; we can't make them - # consistent with Python 3. - if PYTHON2 and ((u'lang', u'el') in new_info[i]): - continue - if PYTHON2 and ((u'wordlist', u'twitter') in new_info[i]): - continue - eq_(new_info[i], old_info[i]) - finally: - shutil.rmtree(tempdir) - - -def test_python2(): - """ - Python 2 got to skip two tests up there, because we built a slightly - wrong wordlist. Now let's test that, in normal operation, it will refuse - to build this wordlist. - """ - if PYTHON2: - try: - load_all_data(config.RAW_DATA_DIR, tempfile.mkstemp()) - assert False, "The database should not have been built" - except UnicodeError: - # This is the correct case - pass