Revise multilingual tests

Former-commit-id: 21246f881f
This commit is contained in:
Robyn Speer 2016-07-29 12:19:12 -04:00
parent 2a41d4dc5e
commit 68c6d95131

View File

@@ -16,55 +16,49 @@ def test_freq_examples():
word_frequency('the', 'es')) word_frequency('the', 'es'))
# To test the reasonableness of the Twitter list, we want to look up a
# common word representing laughter in each language. The default for
# languages not listed here is 'haha'.
LAUGHTER_WORDS = {
'en': 'lol',
'hi': 'lol',
'ru': 'лол',
'zh': '笑',
'ja': '笑',
'ar': 'ههههه',
'ca': 'jaja',
'es': 'jaja',
'fr': 'ptdr',
'pt': 'kkkk',
'he': 'חחח',
'bg': 'xaxa',
}
def test_languages(): def test_languages():
# Make sure the number of available languages doesn't decrease # Make sure the number of available languages doesn't decrease
avail = available_languages() avail = available_languages()
assert_greater(len(avail), 26) assert_greater(len(avail), 26)
avail_twitter = available_languages('twitter') # Look up the digit '2' in the main word list for each language
assert_greater(len(avail_twitter), 15) for lang in avail:
# Look up a word representing laughter in each language, and make sure assert_greater(word_frequency('2', lang), 0, lang)
# it has a non-zero frequency in the informal 'twitter' list.
for lang in avail_twitter:
if lang == 'zh' or lang == 'ja':
text = '笑'
elif lang == 'ko':
text = 'ᄏᄏᄏ'
elif lang == 'ar':
text = 'ههههه'
elif lang == 'ca' or lang == 'es':
text = 'jaja'
elif lang in {'de', 'nb', 'sv', 'da'}:
text = 'haha'
elif lang == 'pt':
text = 'kkkk'
elif lang == 'he':
text = 'חחח'
elif lang == 'ru':
text = 'лол'
elif lang == 'bg':
text = 'хаха'
elif lang == 'ro':
text = 'haha'
elif lang == 'el':
text = 'χαχα'
else:
text = 'lol'
assert_greater(word_frequency(text, lang, wordlist='twitter'), 0, (text, lang))
# Make up a weirdly verbose language code and make sure # Make up a weirdly verbose language code and make sure
# we still get it # we still get it
new_lang_code = '%s-001-x-fake-extension' % lang.upper() new_lang_code = '%s-001-x-fake-extension' % lang.upper()
assert_greater(word_frequency(text, new_lang_code, wordlist='twitter'), 0, (text, new_lang_code)) assert_greater(word_frequency('2', new_lang_code), 0, new_lang_code)
def test_twitter(): def test_twitter():
avail = available_languages('twitter') avail = available_languages('twitter')
assert_greater(len(avail), 14) assert_greater(len(avail), 15)
for lang in avail: for lang in avail:
assert_greater(word_frequency('rt', lang, 'twitter'), assert_greater(word_frequency('rt', lang, 'twitter'),
word_frequency('rt', lang, 'combined')) word_frequency('rt', lang, 'combined'))
text = LAUGHTER_WORDS.get(lang, 'haha')
assert_greater(word_frequency(text, lang, wordlist='twitter'), 0, (text, lang))
def test_minimums(): def test_minimums():
@@ -72,6 +66,7 @@ def test_minimums():
eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6) eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6)
eq_(word_frequency('the', 'en', minimum=1), 1) eq_(word_frequency('the', 'en', minimum=1), 1)
def test_most_common_words(): def test_most_common_words():
# If something causes the most common words in well-supported languages to # If something causes the most common words in well-supported languages to
# change, we should know. # change, we should know.