Revise multilingual tests

2024-12-23 09:21:37 +00:00 · 2016-07-29 12:19:12 -04:00 · 2016-07-29 12:19:12 -04:00 · 21246f881f
commit 21246f881f
parent e6a8f028e3
1 changed file with 27 additions and 32 deletions
--- a/tests/test.py
+++ b/tests/test.py
@@ -16,55 +16,49 @@ def test_freq_examples():
                   word_frequency('the', 'es'))


+# To test the reasonableness of the Twitter list, we want to look up a
+# common word representing laughter in each language. The default for
+# languages not listed here is 'haha'.
+LAUGHTER_WORDS = {
+    'en': 'lol',
+    'hi': 'lol',
+    'ru': 'лол',
+    'zh': '笑',
+    'ja': '笑',
+    'ar': 'ﻪﻬﻬﻬﻫ',
+    'ca': 'jaja',
+    'es': 'jaja',
+    'fr': 'ptdr',
+    'pt': 'kkkk',
+    'he': 'חחח',
+    'bg': 'xaxa',
+}
+
+
 def test_languages():
    # Make sure the number of available languages doesn't decrease
    avail = available_languages()
    assert_greater(len(avail), 26)

-    avail_twitter = available_languages('twitter')
-    assert_greater(len(avail_twitter), 15)
-    # Look up a word representing laughter in each language, and make sure
-    # it has a non-zero frequency in the informal 'twitter' list.
-    for lang in avail_twitter:
-        if lang == 'zh' or lang == 'ja':
-            text = '笑'
-        elif lang == 'ko':
-            text = 'ᄏᄏᄏ'
-        elif lang == 'ar':
-            text = 'ههههه'
-        elif lang == 'ca' or lang == 'es':
-            text = 'jaja'
-        elif lang in {'de', 'nb', 'sv', 'da'}:
-            text = 'haha'
-        elif lang == 'pt':
-            text = 'kkkk'
-        elif lang == 'he':
-            text = 'חחח'
-        elif lang == 'ru':
-            text = 'лол'
-        elif lang == 'bg':
-            text = 'хаха'
-        elif lang == 'ro':
-            text = 'haha'
-        elif lang == 'el':
-            text = 'χαχα'
-        else:
-            text = 'lol'
-        assert_greater(word_frequency(text, lang, wordlist='twitter'), 0, (text, lang))
+    # Look up the digit '2' in the main word list for each language
+    for lang in avail:
+        assert_greater(word_frequency('2', lang), 0, lang)

        # Make up a weirdly verbose language code and make sure
        # we still get it
        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
-        assert_greater(word_frequency(text, new_lang_code, wordlist='twitter'), 0, (text, new_lang_code))
+        assert_greater(word_frequency('2', new_lang_code), 0, new_lang_code)


 def test_twitter():
    avail = available_languages('twitter')
-    assert_greater(len(avail), 14)
+    assert_greater(len(avail), 15)

    for lang in avail:
        assert_greater(word_frequency('rt', lang, 'twitter'),
                       word_frequency('rt', lang, 'combined'))
+        text = LAUGHTER_WORDS.get(lang, 'haha')
+        assert_greater(word_frequency(text, lang, wordlist='twitter'), 0, (text, lang))


 def test_minimums():
@@ -72,6 +66,7 @@ def test_minimums():
    eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6)
    eq_(word_frequency('the', 'en', minimum=1), 1)

+
 def test_most_common_words():
    # If something causes the most common words in well-supported languages to
    # change, we should know.