from wordfreq import ( word_frequency, available_languages, cB_to_freq, top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize ) from nose.tools import ( eq_, assert_almost_equal, assert_greater, raises ) def test_freq_examples(): # Stopwords are most common in the correct language assert_greater(word_frequency('the', 'en'), word_frequency('de', 'en')) assert_greater(word_frequency('de', 'es'), word_frequency('the', 'es')) # To test the reasonableness of the Twitter list, we want to look up a # common word representing laughter in each language. The default for # languages not listed here is 'haha'. LAUGHTER_WORDS = { 'en': 'lol', 'hi': 'lol', 'cs': 'lol', 'ru': 'лол', 'zh': '笑', 'ja': '笑', 'ar': 'ﻪﻬﻬﻬﻫ', 'fa': 'خخخخ', 'ca': 'jaja', 'es': 'jaja', 'fr': 'ptdr', 'pt': 'kkkk', 'he': 'חחח', 'bg': 'ахаха', 'uk': 'хаха', 'bn': 'হা হা', 'mk': 'хаха' } def test_languages(): # Make sure the number of available languages doesn't decrease avail = available_languages() assert_greater(len(avail), 26) # Look up the digit '2' in the main word list for each language for lang in avail: assert_greater(word_frequency('2', lang), 0, lang) # Make up a weirdly verbose language code and make sure # we still get it new_lang_code = '%s-001-x-fake-extension' % lang.upper() assert_greater(word_frequency('2', new_lang_code), 0, new_lang_code) def test_twitter(): avail = available_languages('twitter') assert_greater(len(avail), 15) for lang in avail: assert_greater(word_frequency('rt', lang, 'twitter'), word_frequency('rt', lang, 'combined')) text = LAUGHTER_WORDS.get(lang, 'haha') assert_greater(word_frequency(text, lang, wordlist='twitter'), 0, (text, lang)) def test_minimums(): eq_(word_frequency('esquivalience', 'en'), 0) eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6) eq_(word_frequency('the', 'en', minimum=1), 1) def test_most_common_words(): # If something causes the most common words in well-supported languages to # change, we should know. def get_most_common(lang): """ Return the single most common word in the language. """ return top_n_list(lang, 1)[0] eq_(get_most_common('ar'), 'في') eq_(get_most_common('de'), 'die') eq_(get_most_common('en'), 'the') eq_(get_most_common('es'), 'de') eq_(get_most_common('fr'), 'de') eq_(get_most_common('it'), 'di') eq_(get_most_common('ja'), 'の') eq_(get_most_common('nl'), 'de') eq_(get_most_common('pl'), 'w') eq_(get_most_common('pt'), 'de') eq_(get_most_common('ru'), 'в') eq_(get_most_common('tr'), 'bir') eq_(get_most_common('zh'), '的') def test_language_matching(): freq = word_frequency('的', 'zh') eq_(word_frequency('的', 'zh-TW'), freq) eq_(word_frequency('的', 'zh-CN'), freq) eq_(word_frequency('的', 'zh-Hant'), freq) eq_(word_frequency('的', 'zh-Hans'), freq) eq_(word_frequency('的', 'yue-HK'), freq) eq_(word_frequency('的', 'cmn'), freq) def test_cB_conversion(): eq_(cB_to_freq(0), 1.) assert_almost_equal(cB_to_freq(-100), 0.1) assert_almost_equal(cB_to_freq(-600), 1e-6) @raises(ValueError) def test_failed_cB_conversion(): cB_to_freq(1) def test_tokenization(): # We preserve apostrophes within words, so "can't" is a single word in the # data eq_(tokenize("I don't split at apostrophes, you see.", 'en'), ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']) eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True), ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.']) # Certain punctuation does not inherently split a word. eq_(tokenize("Anything is possible at zombo.com", 'en'), ['anything', 'is', 'possible', 'at', 'zombo.com']) # Splits occur after symbols, and at splitting punctuation such as hyphens. eq_(tokenize('😂test', 'en'), ['😂', 'test']) eq_(tokenize("flip-flop", 'en'), ['flip', 'flop']) eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True), ['this', 'text', 'has', '...', 'punctuation', ':)']) # Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf' # and 'David Bowie' stay together, because our Unicode segmentation algorithm # is up to date eq_(tokenize('emoji test 🧕🏽', 'en'), ['emoji', 'test', '🧕🏽']) eq_(tokenize("👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en'), ['👨‍🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's", 'nothing', 'i', 'can', 'do', '🌎', '🚀']) # Water wave, surfer, flag of California (indicates ridiculously complete support # for Unicode 10 and Emoji 5.0) eq_(tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'",'en'), ["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"]) def test_casefolding(): eq_(tokenize('WEISS', 'de'), ['weiss']) eq_(tokenize('weiß', 'de'), ['weiss']) eq_(tokenize('İstanbul', 'tr'), ['istanbul']) eq_(tokenize('SIKISINCA', 'tr'), ['sıkısınca']) def test_number_smashing(): eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'), ['715', 'crσσks', 'by', 'bon', 'iver']) eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'), ['000', 'crσσks', 'by', 'bon', 'iver']) eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True), ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver']) eq_(lossy_tokenize('1', 'en'), ['1']) eq_(lossy_tokenize('3.14', 'en'), ['0.00']) eq_(lossy_tokenize('24601', 'en'), ['00000']) eq_(word_frequency('24601', 'en'), word_frequency('90210', 'en')) def test_phrase_freq(): ff = word_frequency("flip-flop", 'en') assert_greater(ff, 0) assert_almost_equal( 1.0 / ff, 1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en') ) def test_not_really_random(): # If your xkcd-style password comes out like this, maybe you shouldn't # use it eq_(random_words(nwords=4, lang='en', bits_per_word=0), 'the the the the') # This not only tests random_ascii_words, it makes sure we didn't end # up with 'eos' as a very common Japanese word eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0), '1 1 1 1') @raises(ValueError) def test_not_enough_ascii(): random_ascii_words(lang='zh', bits_per_word=14) def test_arabic(): # Remove tatweels eq_( tokenize('متــــــــعب', 'ar'), ['متعب'] ) # Remove combining marks eq_( tokenize('حَرَكَات', 'ar'), ['حركات'] ) eq_( tokenize('\ufefb', 'ar'), # An Arabic ligature... ['\u0644\u0627'] # ...that is affected by NFKC normalization ) def test_ideographic_fallback(): # Try tokenizing Chinese text as English -- it should remain stuck together. eq_(tokenize('中国文字', 'en'), ['中国文字']) # When Japanese is tagged with the wrong language, it will be split # at script boundaries. ja_text = 'ひらがなカタカナromaji' eq_( tokenize(ja_text, 'en'), ['ひらがな', 'カタカナ', 'romaji'] ) def test_other_languages(): # Test that we leave Thai letters stuck together. If we had better Thai support, # we would actually split this into a three-word phrase. eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี']) eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'), ['การเล่นดนตรี', 'means', 'playing', 'music']) # Test Khmer, a script similar to Thai eq_(tokenize('សូមស្វាគមន៍', 'km'), ['សូមស្វាគមន៍']) # Test Hindi -- tokens split where there are spaces, and not where there aren't eq_(tokenize('हिन्दी विक्षनरी', 'hi'), ['हिन्दी', 'विक्षनरी']) # Remove vowel points in Hebrew eq_(tokenize('דֻּגְמָה', 'he'), ['דגמה']) # Deal with commas, cedillas, and I's in Turkish eq_(tokenize('kișinin', 'tr'), ['kişinin']) eq_(tokenize('KİȘİNİN', 'tr'), ['kişinin']) # Deal with cedillas that should be commas-below in Romanian eq_(tokenize('acelaşi', 'ro'), ['același']) eq_(tokenize('ACELAŞI', 'ro'), ['același'])