2015-05-22 00:34:17 +00:00
|
|
|
|
from wordfreq import (
|
2015-07-07 20:21:22 +00:00
|
|
|
|
word_frequency, available_languages, cB_to_freq,
|
2015-07-09 19:26:54 +00:00
|
|
|
|
top_n_list, random_words, random_ascii_words, tokenize
|
2015-05-22 00:34:17 +00:00
|
|
|
|
)
|
|
|
|
|
from nose.tools import (
|
2015-07-07 20:21:22 +00:00
|
|
|
|
eq_, assert_almost_equal, assert_greater, raises
|
2015-05-22 00:34:17 +00:00
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_freq_examples():
    """Check that a stopword outranks a foreign stopword in its own language."""
    # In English, 'the' should be more frequent than 'de'.
    the_in_english = word_frequency('the', 'en')
    de_in_english = word_frequency('de', 'en')
    assert_greater(the_in_english, de_in_english)

    # In Spanish, the relationship is reversed.
    de_in_spanish = word_frequency('de', 'es')
    the_in_spanish = word_frequency('the', 'es')
    assert_greater(de_in_spanish, the_in_spanish)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_languages():
    """Check language coverage and tolerance of verbose language codes."""
    # Guard against the set of available languages shrinking.
    languages = available_languages()
    assert_greater(len(languages), 15)

    for lang in languages:
        # Every language should know a word for laughter: '笑' for Chinese
        # and Japanese, 'lol' everywhere else.
        text = '笑' if lang in {'zh', 'ja'} else 'lol'
        assert_greater(word_frequency(text, lang), 0)

        # An upper-cased code with extra made-up subtags should still find
        # the same word with a non-zero frequency.
        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
        verbose_freq = word_frequency(text, new_lang_code)
        assert_greater(verbose_freq, 0)
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
|
|
|
|
|
2015-07-01 21:53:38 +00:00
|
|
|
|
def test_twitter():
    """Check that 'rt' is more frequent in the Twitter list than in 'combined'."""
    twitter_languages = available_languages('twitter')
    assert_greater(len(twitter_languages), 14)

    for lang in twitter_languages:
        twitter_freq = word_frequency('rt', lang, 'twitter')
        combined_freq = word_frequency('rt', lang, 'combined')
        assert_greater(twitter_freq, combined_freq)
|
|
|
|
|
|
|
|
|
|
|
2015-07-07 19:46:33 +00:00
|
|
|
|
def test_minimums():
    """Check how the `minimum` argument of word_frequency affects the result."""
    made_up_word = 'esquivalience'

    # The made-up word is expected to have frequency 0 with no minimum...
    eq_(word_frequency(made_up_word, 'en'), 0)
    # ...and to come back as exactly the minimum when one is given.
    eq_(word_frequency(made_up_word, 'en', minimum=1e-6), 1e-6)

    # A minimum of 1 is expected to be returned even for a common word.
    eq_(word_frequency('the', 'en', minimum=1), 1)
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
|
|
|
|
def test_most_common_words():
    """Pin the single most common word of each well-supported language.

    If a change alters any of these, the test fails so that we notice.
    """
    expected_top_words = [
        ('ar', 'في'),
        ('de', 'die'),
        ('en', 'the'),
        ('es', 'de'),
        ('fr', 'de'),
        ('it', 'di'),
        ('ja', 'の'),
        ('nl', 'de'),
        ('pt', 'de'),
        ('ru', 'в'),
        ('tr', 'bir'),
        ('zh', '的'),
    ]
    for lang, top_word in expected_top_words:
        # The first entry of a one-element top-N list is the top word.
        eq_(top_n_list(lang, 1)[0], top_word)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_language_matching():
    """Check that several Chinese language tags give the same result as 'zh'."""
    baseline = word_frequency('的', 'zh')

    related_codes = ('zh-TW', 'zh-CN', 'zh-Hant', 'zh-Hans', 'yue-HK', 'cmn')
    for code in related_codes:
        eq_(word_frequency('的', code), baseline)
|
|
|
|
|
|
|
|
|
|
|
2015-06-22 21:36:30 +00:00
|
|
|
|
def test_cB_conversion():
    """Check cB_to_freq against known centibel values."""
    # Zero centibels must be exactly 1.
    eq_(cB_to_freq(0), 1.)

    # Negative values are compared approximately, as they are floats.
    approximate_cases = ((-100, 0.1), (-600, 1e-6))
    for centibels, expected_freq in approximate_cases:
        assert_almost_equal(cB_to_freq(centibels), expected_freq)
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@raises(ValueError)
def test_failed_cB_conversion():
    """Check that converting a positive centibel value raises ValueError."""
    positive_centibels = 1
    cB_to_freq(positive_centibels)
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_tokenization():
    """Check English tokenization around apostrophes, punctuation and symbols."""
    sentence = "I don't split at apostrophes, you see."

    # Apostrophes inside a word are kept, so "don't" stays a single token,
    # matching how such words appear in the data.
    eq_(
        tokenize(sentence, 'en'),
        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']
    )

    # With include_punctuation=True, punctuation marks become tokens too.
    eq_(
        tokenize(sentence, 'en', include_punctuation=True),
        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.']
    )

    # Some punctuation, such as the dot in a domain name, does not split a
    # word by itself.
    eq_(
        tokenize("Anything is possible at zombo.com", 'en'),
        ['anything', 'is', 'possible', 'at', 'zombo.com']
    )

    # A split happens after a symbol, and at splitting punctuation such as
    # a hyphen.
    eq_(tokenize('😂test', 'en'), ['😂', 'test'])
    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])

    # Runs of punctuation are expected to stay together as one token.
    eq_(
        tokenize('this text has... punctuation :)', 'en',
                 include_punctuation=True),
        ['this', 'text', 'has', '...', 'punctuation', ':)']
    )
|
|
|
|
|
|
2015-06-30 19:14:02 +00:00
|
|
|
|
|
|
|
|
|
def test_casefolding():
    """Check the expected case-folded tokens for German and Turkish input."""
    cases = [
        # (text, language, expected tokens)
        ('WEISS', 'de', ['weiss']),
        ('weiß', 'de', ['weiss']),
        ('İstanbul', 'tr', ['istanbul']),
        ('SIKISINCA', 'tr', ['sıkısınca']),
    ]
    for text, lang, expected_tokens in cases:
        eq_(tokenize(text, lang), expected_tokens)
|
2015-06-30 19:14:02 +00:00
|
|
|
|
|
|
|
|
|
|
2015-06-25 15:25:51 +00:00
|
|
|
|
def test_phrase_freq():
    """Check how the frequency of 'flip-flop' relates to its two parts.

    The reciprocal of the phrase frequency should approximately equal the
    sum of the reciprocals of the frequencies of 'flip' and 'flop'.
    """
    phrase_freq = word_frequency("flip-flop", 'en')
    assert_greater(phrase_freq, 0)

    inverse_phrase = 1.0 / phrase_freq
    inverse_flip = 1.0 / word_frequency('flip', 'en')
    inverse_flop = 1.0 / word_frequency('flop', 'en')
    assert_almost_equal(inverse_phrase, inverse_flip + inverse_flop)
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_not_really_random():
    """Check the fixed output of the random-word helpers at zero bits per word."""
    # A "password" that comes out like this is not one you should use.
    english_words = random_words(nwords=4, lang='en', bits_per_word=0)
    eq_(english_words, 'the the the the')

    # Besides exercising random_ascii_words, this confirms that 'eos' did
    # not end up as a very common Japanese word.
    japanese_ascii_words = random_ascii_words(
        nwords=4, lang='ja', bits_per_word=0
    )
    eq_(japanese_ascii_words, 'rt rt rt rt')
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@raises(ValueError)
def test_not_enough_ascii():
    """Check that asking for random ASCII words in Chinese raises ValueError."""
    target_language = 'zh'
    random_ascii_words(lang=target_language)
|
2015-07-07 19:10:59 +00:00
|
|
|
|
|
2015-07-20 20:48:36 +00:00
|
|
|
|
|
2015-08-24 20:24:49 +00:00
|
|
|
|
def test_ar():
    """Check the normalization applied when tokenizing Arabic text."""
    cases = [
        # Tatweels are removed.
        ('متــــــــعب', ['متعب']),
        # Combining marks are removed.
        ('حَرَكَات', ['حركات']),
        # An Arabic ligature that is affected by NFKC normalization.
        ('\ufefb', ['\u0644\u0627']),
    ]
    for text, expected_tokens in cases:
        eq_(tokenize(text, 'ar'), expected_tokens)
|
2015-08-24 20:24:49 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_ideographic_fallback():
    """Check how CJK text is tokenized when tagged as English."""
    # Chinese text tokenized as English should stay stuck together.
    zh_text = '中国文字'
    eq_(tokenize(zh_text, 'en'), [zh_text])

    # Japanese text tagged with the wrong language is split at the
    # boundaries between scripts.
    ja_text = 'ひらがなカタカナromaji'
    expected_tokens = ['ひらがな', 'カタカナ', 'romaji']
    eq_(tokenize(ja_text, 'en'), expected_tokens)
|