from wordfreq import ( word_frequency, available_languages, cB_to_freq, top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize, ) import pytest def test_freq_examples(): # Stopwords are most common in the correct language assert word_frequency("the", "en") > word_frequency("de", "en") assert word_frequency("de", "es") > word_frequency("the", "es") # We get word frequencies from the 'large' list when available assert word_frequency("infrequency", "en") > 0.0 def test_languages(): # Make sure we get all the languages when looking for the default # 'best' wordlist avail = available_languages() assert len(avail) >= 34 # 'small' covers the same languages, but with some different lists avail_small = available_languages("small") assert len(avail_small) == len(avail) assert avail_small != avail # 'combined' is the same as 'small' avail_old_name = available_languages("combined") assert avail_old_name == avail_small # 'large' covers fewer languages avail_large = available_languages("large") assert len(avail_large) >= 14 assert len(avail) > len(avail_large) # Look up the digit '2' in the main word list for each language for lang in avail: assert word_frequency("2", lang) > 0 # Make up a weirdly verbose language code and make sure # we still get it new_lang_code = "%s-001-x-fake-ext" % lang.upper() assert word_frequency("2", new_lang_code) > 0 def test_minimums(): assert word_frequency("esquivalience", "en") == 0 assert word_frequency("esquivalience", "en", minimum=1e-6) == 1e-6 assert word_frequency("the", "en", minimum=1) == 1 def test_most_common_words(): # If something causes the most common words in well-supported languages to # change, we should know. def get_most_common(lang): """ Return the single most common word in the language. """ return top_n_list(lang, 1)[0] assert get_most_common("ar") == "في" assert get_most_common("bg") == "на" assert get_most_common("bn") == "না" assert get_most_common("ca") == "de" assert get_most_common("cs") == "a" assert get_most_common("da") == "i" assert get_most_common("el") == "και" assert get_most_common("de") == "die" assert get_most_common("en") == "the" assert get_most_common("es") == "de" assert get_most_common("fi") == "ja" assert get_most_common("fil") == "sa" assert get_most_common("fr") == "de" assert get_most_common("he") == "את" assert get_most_common("hi") == "के" assert get_most_common("hu") == "a" assert get_most_common("id") == "yang" assert get_most_common("is") == "og" assert get_most_common("it") == "di" assert get_most_common("ja") == "の" assert get_most_common("ko") == "이" assert get_most_common("lt") == "ir" assert get_most_common("lv") == "un" assert get_most_common("mk") == "на" assert get_most_common("ms") == "yang" assert get_most_common("nb") == "i" assert get_most_common("nl") == "de" assert get_most_common("pl") == "w" assert get_most_common("pt") == "de" assert get_most_common("ro") == "de" assert get_most_common("ru") == "в" assert get_most_common("sh") == "je" assert get_most_common("sk") == "a" assert get_most_common("sl") == "je" assert get_most_common("sv") == "är" assert get_most_common("ta") == "ஒரு" assert get_most_common("tr") == "ve" assert get_most_common("uk") == "в" assert get_most_common("ur") == "کے" assert get_most_common("vi") == "là" assert get_most_common("zh") == "的" def test_language_matching(): freq = word_frequency("的", "zh") assert word_frequency("的", "zh-TW") == freq assert word_frequency("的", "zh-CN") == freq assert word_frequency("的", "zh-Hant") == freq assert word_frequency("的", "zh-Hans") == freq assert word_frequency("的", "yue-CN") == freq assert word_frequency("的", "cmn") == freq def test_cB_conversion(): assert cB_to_freq(0) == 1.0 assert cB_to_freq(-100) == pytest.approx(0.1) assert cB_to_freq(-600) == pytest.approx(1e-6) def test_failed_cB_conversion(): with pytest.raises(ValueError): cB_to_freq(1) def test_tokenization(): # We preserve apostrophes within words, so "can't" is a single word in the # data assert tokenize("I don't split at apostrophes, you see.", "en") == [ "i", "don't", "split", "at", "apostrophes", "you", "see", ] assert tokenize( "I don't split at apostrophes, you see.", "en", include_punctuation=True ) == ["i", "don't", "split", "at", "apostrophes", ",", "you", "see", "."] # Certain punctuation does not inherently split a word. assert tokenize("Anything is possible at zombo.com", "en") == [ "anything", "is", "possible", "at", "zombo.com", ] # Splits occur after symbols, and at splitting punctuation such as hyphens. assert tokenize("😂test", "en") == ["😂", "test"] assert tokenize("flip-flop", "en") == ["flip", "flop"] assert tokenize( "this text has... punctuation :)", "en", include_punctuation=True ) == ["this", "text", "has", "...", "punctuation", ":)"] # Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf' # and 'David Bowie' stay together, because our Unicode segmentation algorithm # is up to date assert tokenize("emoji test 🧕🏽", "en") == ["emoji", "test", "🧕🏽"] assert tokenize( "👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", "en" ) == [ "👨‍🎤", "planet", "earth", "is", "blue", "and", "there's", "nothing", "i", "can", "do", "🌎", "🚀", ] # Water wave, surfer, flag of California (indicates ridiculously complete support # for Unicode 10 and Emoji 5.0) assert tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'", "en") == ["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"] def test_casefolding(): assert tokenize("WEISS", "de") == ["weiss"] assert tokenize("weiß", "de") == ["weiss"] assert tokenize("İstanbul", "tr") == ["istanbul"] assert tokenize("SIKISINCA", "tr") == ["sıkısınca"] def test_normalization(): assert tokenize('"715 - CRΣΣKS" by Bon Iver', "en") == [ "715", "crσσks", "by", "bon", "iver", ] assert lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', "en") == [ "715", "crσσks", "by", "bon", "iver", ] def test_uncurl_quotes(): assert lossy_tokenize("let’s", "en") == ["let's"] assert word_frequency("let’s", "en") == word_frequency("let's", "en") def test_phrase_freq(): ff = word_frequency("flip-flop", "en") assert ff > 0 phrase_freq = 1.0 / word_frequency("flip", "en") + 1.0 / word_frequency( "flop", "en" ) assert 1.0 / ff == pytest.approx(phrase_freq, rel=0.01) def test_not_really_random(): # If your xkcd-style password comes out like this, maybe you shouldn't # use it assert random_words(nwords=4, lang="en", bits_per_word=0) == "the the the the" # This not only tests random_ascii_words, it makes sure we didn't end # up with 'eos' as a very common Japanese word assert random_ascii_words(nwords=4, lang="ja", bits_per_word=0) == "1 1 1 1" def test_not_enough_ascii(): with pytest.raises(ValueError): random_ascii_words(lang="zh", bits_per_word=16) def test_arabic(): # Remove tatweels assert tokenize("متــــــــعب", "ar") == ["متعب"] # Remove combining marks assert tokenize("حَرَكَات", "ar") == ["حركات"] # An Arabic ligature that is affected by NFKC normalization assert tokenize("\ufefb", "ar") == ["\u0644\u0627"] def test_ideographic_fallback(): # Try tokenizing Chinese text as English -- it should remain stuck together. # # More complex examples like this, involving the multiple scripts of Japanese, # are in test_japanese.py. assert tokenize("中国文字", "en") == ["中国文字"] def test_other_languages(): # Test that we leave Thai letters stuck together. If we had better Thai support, # we would actually split this into a three-word phrase. assert tokenize("การเล่นดนตรี", "th") == ["การเล่นดนตรี"] assert tokenize('"การเล่นดนตรี" means "playing music"', "en") == [ "การเล่นดนตรี", "means", "playing", "music", ] # Test Khmer, a script similar to Thai assert tokenize("សូមស្វាគមន៍", "km") == ["សូមស្វាគមន៍"] # Test Hindi -- tokens split where there are spaces, and not where there aren't assert tokenize("हिन्दी विक्षनरी", "hi") == ["हिन्दी", "विक्षनरी"] # Remove vowel points in Hebrew assert tokenize("דֻּגְמָה", "he") == ["דגמה"] # Deal with commas, cedillas, and I's in Turkish assert tokenize("kișinin", "tr") == ["kişinin"] assert tokenize("KİȘİNİN", "tr") == ["kişinin"] # Deal with cedillas that should be commas-below in Romanian assert tokenize("acelaşi", "ro") == ["același"] assert tokenize("ACELAŞI", "ro") == ["același"]