# wordfreq/tests/test.py

from wordfreq import (
    word_frequency, available_languages, cB_to_freq, iter_wordlist,
    top_n_list, random_words, random_ascii_words, tokenize,
    half_harmonic_mean
)
from nose.tools import (
    eq_, assert_almost_equal, assert_greater, assert_less, raises
)


def test_freq_examples():
    # A stopword should score higher in its own language than a stopword
    # taken from a different language.
    the_in_english = word_frequency('the', 'en')
    de_in_english = word_frequency('de', 'en')
    assert_greater(the_in_english, de_in_english)

    # The same comparison, mirrored for Spanish.
    de_in_spanish = word_frequency('de', 'es')
    the_in_spanish = word_frequency('the', 'es')
    assert_greater(de_in_spanish, the_in_spanish)


def test_languages():
    # The number of supported languages must not shrink below what we
    # already ship.
    supported = available_languages()
    assert_greater(len(supported), 14)

    # 'lol' is expected to have a nonzero frequency everywhere except in
    # the languages skipped here: Chinese (not enough data) and Japanese
    # (where 'lol' is not used).
    skipped = {'zh', 'ja'}
    for code in supported:
        if code in skipped:
            continue

        assert_greater(word_frequency('lol', code), 0)

        # An upper-cased code padded with region and private-use subtags
        # should still resolve to the same language's wordlist.
        verbose_code = '%s-001-x-fake-extension' % code.upper()
        assert_greater(word_frequency('lol', verbose_code), 0)


def test_twitter():
    # The number of languages with a Twitter wordlist must not shrink.
    twitter_langs = available_languages('twitter')
    assert_greater(len(twitter_langs), 12)

    # 'rt' should be more frequent in the Twitter-only list than in the
    # combined list, for every language that has Twitter data.
    for code in twitter_langs:
        in_twitter = word_frequency('rt', code, 'twitter')
        in_combined = word_frequency('rt', code, 'combined')
        assert_greater(in_twitter, in_combined)


def test_minimums():
    # A word that is not in the list gets frequency 0 by default...
    unknown_word = 'esquivalience'
    eq_(word_frequency(unknown_word, 'en'), 0)

    # ...or the requested minimum, when one is given.
    floor = 1e-6
    eq_(word_frequency(unknown_word, 'en', minimum=floor), floor)

    # The minimum also wins over a known word's frequency when it is larger.
    eq_(word_frequency('the', 'en', minimum=1), 1)

def test_most_common_words():
    # Regression guard: if the top word of a well-supported language ever
    # changes, this test should tell us.
    expected_top_words = [
        ('ar', 'في'),
        ('de', 'die'),
        ('en', 'the'),
        ('es', 'de'),
        ('fr', 'de'),
        ('it', 'di'),
        ('ja', 'の'),
        ('nl', 'de'),
        ('pt', 'de'),
        ('ru', 'в'),
        ('zh', '的'),
    ]

    for lang, top_word in expected_top_words:
        # top_n_list(lang, 1) is a one-item list; its only entry is the
        # single most common word in that language.
        eq_(top_n_list(lang, 1)[0], top_word)


def test_language_matching():
    # Every Chinese-related language code below should give the same
    # frequency as plain 'zh'.
    baseline = word_frequency('的', 'zh')
    related_codes = ('zh-TW', 'zh-CN', 'zh-Hant', 'zh-Hans', 'yue-HK', 'cmn')

    for code in related_codes:
        eq_(word_frequency('的', code), baseline)


def test_cB_conversion():
    # 0 cB must convert to a frequency of exactly 1.
    eq_(cB_to_freq(0), 1.)

    # The remaining values are compared approximately: -100 cB is one
    # factor of ten, -600 cB is six of them.
    approximate_cases = ((-100, 0.1), (-600, 1e-6))
    for centibels, expected_freq in approximate_cases:
        assert_almost_equal(cB_to_freq(centibels), expected_freq)


@raises(ValueError)
def test_failed_cB_conversion():
    # A positive centibel value is expected to be rejected with ValueError;
    # the decorator turns that exception into a passing test.
    positive_centibels = 1
    cB_to_freq(positive_centibels)


def test_tokenization():
    # An apostrophe inside a word does not split it: "can't" stays whole.
    contraction_tokens = tokenize("can't", 'en')
    eq_(contraction_tokens, ["can't"])

    # An emoji glued to a word comes out as its own token.
    emoji_tokens = tokenize('😂test', 'en')
    eq_(emoji_tokens, ['😂', 'test'])

    # Other punctuation, such as a period, does split the word in two.
    dotted_tokens = tokenize("can.t", 'en')
    eq_(dotted_tokens, ['can', 't'])


def test_casefolding():
    # Both the upper-case spelling and the sharp-s spelling are expected to
    # fold to the same token.
    for spelling in ('WEISS', 'weiß'):
        eq_(tokenize(spelling, 'de'), ['weiss'])


def test_phrase_freq():
    # A multi-token input should still get a nonzero frequency.
    phrase_freq = word_frequency("plan.t", 'en')
    assert_greater(phrase_freq, 0)

    # That frequency is expected to match the half-harmonic mean of the
    # frequencies of its two parts, 'plan' and 't'.
    plan_freq = word_frequency('plan', 'en')
    t_freq = word_frequency('t', 'en')
    combined = half_harmonic_mean(plan_freq, t_freq)
    assert_almost_equal(phrase_freq, combined)


def test_not_really_random():
    # With bits_per_word=0 the output is expected to be the same word
    # repeated -- not a password anyone should use.
    english_words = random_words(nwords=4, lang='en', bits_per_word=0)
    eq_(english_words, 'the the the the')

    # Besides exercising random_ascii_words, this guards against 'eos'
    # turning up as a very common Japanese word.
    japanese_ascii_words = random_ascii_words(
        nwords=4, lang='ja', bits_per_word=0
    )
    eq_(japanese_ascii_words, 'rt rt rt rt')


@raises(ValueError)
def test_not_enough_ascii():
    # Asking for ASCII-only random words in Chinese is expected to fail
    # with ValueError; the decorator turns that into a passing test.
    language = 'zh'
    random_ascii_words(lang=language)

def test_ar():
    # A word stretched with elongation (tatweel) characters should come
    # back as the plain word.
    stretched = 'متــــــــعب'
    eq_(tokenize(stretched, 'ar'), ['متعب'])

    # A word written with vowel marks should come back without them.
    vowelled = 'حَرَكَات'
    eq_(tokenize(vowelled, 'ar'), ['حركات'])
tests for new wordfreq with full coverage Former-commit-id: df863a5169719a154a95c788f237088704b5e619 2015-05-22 00:34:17 +00:00			`from wordfreq import (`
Switch to a more precise centibel scale. Former-commit-id: 7862a4d2b6b2e756f52b405e28e5049b7ef93bc2 2015-06-22 21:36:30 +00:00			`word_frequency, available_languages, cB_to_freq, iter_wordlist,`
updated tests Former-commit-id: ca66a5f883d4a19c2b9fa81e1f6c3c8309924f69 2015-07-07 18:13:28 +00:00			`top_n_list, random_words, random_ascii_words, tokenize,`
			`half_harmonic_mean`
tests for new wordfreq with full coverage Former-commit-id: df863a5169719a154a95c788f237088704b5e619 2015-05-22 00:34:17 +00:00			`)`
			`from nose.tools import (`
			`eq_, assert_almost_equal, assert_greater, assert_less, raises`
			`)`


			`def test_freq_examples():`
			`# Stopwords are most common in the correct language`
			`assert_greater(word_frequency('the', 'en'),`
			`word_frequency('de', 'en'))`

			`assert_greater(word_frequency('de', 'es'),`
			`word_frequency('the', 'es'))`


			`def test_languages():`
			`# Make sure the number of available languages doesn't decrease`
			`avail = available_languages()`
			`assert_greater(len(avail), 14)`

			`# Laughter is the universal language`
			`for lang in avail:`
Japanese people do not 'lol', they 'w' Former-commit-id: 17f11ebd260360af4e61140b895e89e1ad0db9a7 2015-06-29 15:01:13 +00:00			`if lang not in {'zh', 'ja'}:`
			`# we do not have enough Chinese data`
			`# Japanese people do not lol`
tests for new wordfreq with full coverage Former-commit-id: df863a5169719a154a95c788f237088704b5e619 2015-05-22 00:34:17 +00:00			`assert_greater(word_frequency('lol', lang), 0)`

			`# Make up a weirdly verbose language code and make sure`
			`# we still get it`
			`new_lang_code = '%s-001-x-fake-extension' % lang.upper()`
			`assert_greater(word_frequency('lol', new_lang_code), 0)`


test and document new twitter wordlists Former-commit-id: 14cb40810019eb8ca5d1350be46c41c645bf12b6 2015-07-01 21:53:38 +00:00			`def test_twitter():`
			`avail = available_languages('twitter')`
			`assert_greater(len(avail), 12)`

			`for lang in avail:`
			`assert_greater(word_frequency('rt', lang, 'twitter'),`
			`word_frequency('rt', lang, 'combined'))`


updated minimum Former-commit-id: 59c03e24118ffbd4159e1162a6a64ebf38bf4edb 2015-07-07 19:46:33 +00:00			`def test_minimums():`
tests for new wordfreq with full coverage Former-commit-id: df863a5169719a154a95c788f237088704b5e619 2015-05-22 00:34:17 +00:00			`eq_(word_frequency('esquivalience', 'en'), 0)`
changed default to minimum for word_frequency Former-commit-id: 9aa773aa2bba694c691d1ea7b18e16a64fe7695e 2015-07-07 19:03:26 +00:00			`eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6)`
updated minimum Former-commit-id: 59c03e24118ffbd4159e1162a6a64ebf38bf4edb 2015-07-07 19:46:33 +00:00			`eq_(word_frequency('the', 'en', minimum=1), 1)`
tests for new wordfreq with full coverage Former-commit-id: df863a5169719a154a95c788f237088704b5e619 2015-05-22 00:34:17 +00:00
			`def test_most_common_words():`
			`# If something causes the most common words in well-supported languages to`
			`# change, we should know.`

			`def get_most_common(lang):`
			`"""`
			`Return the single most common word in the language.`
			`"""`
			`return top_n_list(lang, 1)[0]`

			`eq_(get_most_common('ar'), 'في')`
Switch to a more precise centibel scale. Former-commit-id: 7862a4d2b6b2e756f52b405e28e5049b7ef93bc2 2015-06-22 21:36:30 +00:00			`eq_(get_most_common('de'), 'die')`
tests for new wordfreq with full coverage Former-commit-id: df863a5169719a154a95c788f237088704b5e619 2015-05-22 00:34:17 +00:00			`eq_(get_most_common('en'), 'the')`
			`eq_(get_most_common('es'), 'de')`
			`eq_(get_most_common('fr'), 'de')`
			`eq_(get_most_common('it'), 'di')`
			`eq_(get_most_common('ja'), 'の')`
			`eq_(get_most_common('nl'), 'de')`
			`eq_(get_most_common('pt'), 'de')`
			`eq_(get_most_common('ru'), 'в')`
			`eq_(get_most_common('zh'), '的')`


			`def test_language_matching():`
			`freq = word_frequency('的', 'zh')`
			`eq_(word_frequency('的', 'zh-TW'), freq)`
			`eq_(word_frequency('的', 'zh-CN'), freq)`
			`eq_(word_frequency('的', 'zh-Hant'), freq)`
			`eq_(word_frequency('的', 'zh-Hans'), freq)`
			`eq_(word_frequency('的', 'yue-HK'), freq)`
			`eq_(word_frequency('的', 'cmn'), freq)`


Switch to a more precise centibel scale. Former-commit-id: 7862a4d2b6b2e756f52b405e28e5049b7ef93bc2 2015-06-22 21:36:30 +00:00			`def test_cB_conversion():`
			`eq_(cB_to_freq(0), 1.)`
			`assert_almost_equal(cB_to_freq(-100), 0.1)`
			`assert_almost_equal(cB_to_freq(-600), 1e-6)`
tests for new wordfreq with full coverage Former-commit-id: df863a5169719a154a95c788f237088704b5e619 2015-05-22 00:34:17 +00:00

			`@raises(ValueError)`
Switch to a more precise centibel scale. Former-commit-id: 7862a4d2b6b2e756f52b405e28e5049b7ef93bc2 2015-06-22 21:36:30 +00:00			`def test_failed_cB_conversion():`
			`cB_to_freq(1)`
tests for new wordfreq with full coverage Former-commit-id: df863a5169719a154a95c788f237088704b5e619 2015-05-22 00:34:17 +00:00

			`def test_tokenization():`
			`# We preserve apostrophes within words, so "can't" is a single word in the`
			`# data, while the fake word "plan't" can't be found.`
updated tests for emoji splitting Former-commit-id: 3bcb3e84a111ecba5b663ce18697109641b6a185 2015-06-25 15:25:51 +00:00			`eq_(tokenize("can't", 'en'), ["can't"])`

			`eq_(tokenize('😂test', 'en'), ['😂', 'test'])`
tests for new wordfreq with full coverage Former-commit-id: df863a5169719a154a95c788f237088704b5e619 2015-05-22 00:34:17 +00:00
			`# We do split at other punctuation, causing the word-combining rule to`
			`# apply.`
updated tests for emoji splitting Former-commit-id: 3bcb3e84a111ecba5b663ce18697109641b6a185 2015-06-25 15:25:51 +00:00			`eq_(tokenize("can.t", 'en'), ['can', 't'])`

case-fold instead of just lowercasing tokens Former-commit-id: 638467f60022c6933a9a2fb8ff1280d39e9a3d70 2015-06-30 19:14:02 +00:00
			`def test_casefolding():`
			`eq_(tokenize('WEISS', 'de'), ['weiss'])`
			`eq_(tokenize('weiß', 'de'), ['weiss'])`


updated tests for emoji splitting Former-commit-id: 3bcb3e84a111ecba5b663ce18697109641b6a185 2015-06-25 15:25:51 +00:00			`def test_phrase_freq():`
tests for new wordfreq with full coverage Former-commit-id: df863a5169719a154a95c788f237088704b5e619 2015-05-22 00:34:17 +00:00			`plant = word_frequency("plan.t", 'en')`
			`assert_greater(plant, 0)`
updated tests Former-commit-id: ca66a5f883d4a19c2b9fa81e1f6c3c8309924f69 2015-07-07 18:13:28 +00:00			`assert_almost_equal(`
			`plant,`
			`half_harmonic_mean(`
			`word_frequency('plan', 'en'),`
			`word_frequency('t', 'en')`
			`)`
			`)`
tests for new wordfreq with full coverage Former-commit-id: df863a5169719a154a95c788f237088704b5e619 2015-05-22 00:34:17 +00:00

			`def test_not_really_random():`
			`# If your xkcd-style password comes out like this, maybe you shouldn't`
			`# use it`
revert changes to test_not_really_random Former-commit-id: bbf7b9de34f4b1f7ff3ac4a3b3789f5f45fa1a86 2015-06-30 15:29:14 +00:00			`eq_(random_words(nwords=4, lang='en', bits_per_word=0),`
tests for new wordfreq with full coverage Former-commit-id: df863a5169719a154a95c788f237088704b5e619 2015-05-22 00:34:17 +00:00			`'the the the the')`

			`# This not only tests random_ascii_words, it makes sure we didn't end`
			`# up with 'eos' as a very common Japanese word`
revert changes to test_not_really_random Former-commit-id: bbf7b9de34f4b1f7ff3ac4a3b3789f5f45fa1a86 2015-06-30 15:29:14 +00:00			`eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),`
update data using new build Former-commit-id: f9a9ee7a82fb122124aec58a4fbf14cccaf27c35 2015-07-01 15:18:39 +00:00			`'rt rt rt rt')`
tests for new wordfreq with full coverage Former-commit-id: df863a5169719a154a95c788f237088704b5e619 2015-05-22 00:34:17 +00:00

			`@raises(ValueError)`
			`def test_not_enough_ascii():`
			`random_ascii_words(lang='zh')`
added arabic tests Former-commit-id: f83d31a35774b08d40ab5c6a9fb8c09616e71819 2015-07-07 19:10:59 +00:00
			`def test_ar():`
			`eq_(`
			`tokenize('متــــــــعب', 'ar'),`
			`['متعب']`
			`)`

			`eq_(`
			`tokenize('حَرَكَات', 'ar'),`
			`['حركات']`
			`)`