2023-11-21 23:07:04 +00:00
|
|
|
|
import pytest
|
2015-05-22 00:34:17 +00:00
|
|
|
|
from wordfreq import (
|
2022-03-10 23:33:42 +00:00
|
|
|
|
available_languages,
|
|
|
|
|
cB_to_freq,
|
2023-11-21 23:07:04 +00:00
|
|
|
|
lossy_tokenize,
|
2022-03-10 23:33:42 +00:00
|
|
|
|
random_ascii_words,
|
2023-11-21 23:07:04 +00:00
|
|
|
|
random_words,
|
2022-03-10 23:33:42 +00:00
|
|
|
|
tokenize,
|
2023-11-21 23:07:04 +00:00
|
|
|
|
top_n_list,
|
|
|
|
|
word_frequency,
|
2015-05-22 00:34:17 +00:00
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_freq_examples():
    """Check a few expected frequency relationships between words."""
    # Stopwords are most common in the correct language: "the" should beat
    # "de" in English, and "de" should beat "the" in Spanish.
    the_in_english = word_frequency("the", "en")
    de_in_english = word_frequency("de", "en")
    assert the_in_english > de_in_english

    de_in_spanish = word_frequency("de", "es")
    the_in_spanish = word_frequency("the", "es")
    assert de_in_spanish > the_in_spanish

    # We get word frequencies from the 'large' list when available, so this
    # uncommon word should still have a nonzero frequency.
    rare_word_freq = word_frequency("infrequency", "en")
    assert rare_word_freq > 0.0
|
2018-03-08 23:09:57 +00:00
|
|
|
|
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
|
|
|
|
def test_languages():
    """Check the language coverage of each wordlist and loose language-code matching."""
    # Asking for the default ('best') wordlist should give us all the languages.
    best_langs = available_languages()
    assert len(best_langs) >= 34

    # 'small' covers the same number of languages, but with some different lists.
    small_langs = available_languages("small")
    assert len(small_langs) == len(best_langs)
    assert small_langs != best_langs

    # 'combined' is an older name that must give the same result as 'small'.
    combined_langs = available_languages("combined")
    assert combined_langs == small_langs

    # 'large' covers fewer languages than the default list.
    large_langs = available_languages("large")
    assert len(large_langs) >= 14
    assert len(best_langs) > len(large_langs)

    for code in best_langs:
        # The digit '2' should appear in the main word list of every language.
        assert word_frequency("2", code) > 0

        # A weirdly verbose variant of the same language code (upper-cased,
        # with extra subtags) should still find the word.
        verbose_code = "{}-001-x-fake-ext".format(code.upper())
        assert word_frequency("2", verbose_code) > 0
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
|
|
|
|
|
2015-07-07 19:46:33 +00:00
|
|
|
|
def test_minimums():
    """Check that the ``minimum`` argument acts as a floor on the result."""
    unlisted_word = "esquivalience"

    # With no minimum, a word with no frequency data comes back as 0.
    assert word_frequency(unlisted_word, "en") == 0

    # A minimum replaces a result that would be lower than it...
    assert word_frequency(unlisted_word, "en", minimum=1e-6) == 1e-6

    # ...even for a very common word, when the minimum is as large as 1.
    assert word_frequency("the", "en", minimum=1) == 1
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
2016-07-29 16:19:12 +00:00
|
|
|
|
|
2015-05-22 00:34:17 +00:00
|
|
|
|
def test_most_common_words():
    """Pin the single most common word of each well-supported language.

    If something causes the most common words in these languages to change,
    we should know.
    """
    # (language code, expected most common word), checked in this order.
    expected_top_words = (
        ("ar", "في"),
        ("bg", "на"),
        ("bn", "না"),
        ("ca", "de"),
        ("cs", "a"),
        ("da", "i"),
        ("el", "και"),
        ("de", "die"),
        ("en", "the"),
        ("es", "de"),
        ("fi", "ja"),
        ("fil", "sa"),
        ("fr", "de"),
        ("he", "את"),
        ("hi", "के"),
        ("hu", "a"),
        ("id", "yang"),
        ("is", "og"),
        ("it", "di"),
        ("ja", "の"),
        ("ko", "이"),
        ("lt", "ir"),
        ("lv", "un"),
        ("mk", "на"),
        ("ms", "yang"),
        ("nb", "i"),
        ("nl", "de"),
        ("pl", "w"),
        ("pt", "de"),
        ("ro", "de"),
        ("ru", "в"),
        ("sh", "je"),
        ("sk", "a"),
        ("sl", "je"),
        ("sv", "är"),
        ("ta", "ஒரு"),
        ("tr", "ve"),
        ("uk", "в"),
        ("ur", "کے"),
        ("vi", "là"),
        ("zh", "的"),
    )

    for lang, expected_word in expected_top_words:
        # The first entry of a one-item top-N list is the most common word.
        most_common = top_n_list(lang, 1)[0]
        assert most_common == expected_word
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_language_matching():
    """Check that several Chinese-related language codes give the same frequency as 'zh'."""
    word = "的"
    baseline = word_frequency(word, "zh")

    # Region, script, and related-language variants, checked in this order.
    variant_codes = ("zh-TW", "zh-CN", "zh-Hant", "zh-Hans", "yue-CN", "cmn")
    for code in variant_codes:
        assert word_frequency(word, code) == baseline
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
|
|
|
|
|
2015-06-22 21:36:30 +00:00
|
|
|
|
def test_cB_conversion():
    """Check ``cB_to_freq`` on a few non-positive cB values."""
    # Zero maps to a frequency of exactly 1.
    assert cB_to_freq(0) == 1.0

    # Other values are compared approximately, since they are floats.
    approximate_cases = ((-100, 0.1), (-600, 1e-6))
    for cb_value, expected_freq in approximate_cases:
        assert cB_to_freq(cb_value) == pytest.approx(expected_freq)
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
|
|
|
|
|
2015-06-22 21:36:30 +00:00
|
|
|
|
def test_failed_cB_conversion():
    """Check that ``cB_to_freq`` raises ValueError for a positive cB value."""
    positive_cb = 1
    with pytest.raises(ValueError):
        cB_to_freq(positive_cb)
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_tokenization():
    """Exercise English tokenization: apostrophes, punctuation, symbols, and emoji."""
    # Emoji sequences that rely on invisible code points (zero-width joiner,
    # tag characters) are spelled with escapes. Written as literals, those
    # code points are easily stripped by editors or copy/paste, which would
    # silently change what these assertions check.
    woman_with_headscarf = "\U0001F9D5\U0001F3FD"  # woman with headscarf + medium skin tone
    man_singer = "\U0001F468\u200D\U0001F3A4"  # man + ZWJ + microphone ('David Bowie')
    california_flag = (
        # black flag + tag characters 'u', 's', 'c', 'a' + cancel tag
        "\U0001F3F4\U000E0075\U000E0073\U000E0063\U000E0061\U000E007F"
    )

    # We preserve apostrophes within words, so "can't" is a single word in the
    # data
    assert tokenize("I don't split at apostrophes, you see.", "en") == [
        "i",
        "don't",
        "split",
        "at",
        "apostrophes",
        "you",
        "see",
    ]

    # With include_punctuation=True, punctuation marks become tokens too.
    assert tokenize("I don't split at apostrophes, you see.", "en", include_punctuation=True) == [
        "i",
        "don't",
        "split",
        "at",
        "apostrophes",
        ",",
        "you",
        "see",
        ".",
    ]

    # Certain punctuation does not inherently split a word.
    assert tokenize("Anything is possible at zombo.com", "en") == [
        "anything",
        "is",
        "possible",
        "at",
        "zombo.com",
    ]

    # Splits occur after symbols, and at splitting punctuation such as hyphens.
    assert tokenize("😂test", "en") == ["😂", "test"]
    assert tokenize("flip-flop", "en") == ["flip", "flop"]
    assert tokenize("this text has... punctuation :)", "en", include_punctuation=True) == [
        "this",
        "text",
        "has",
        "...",
        "punctuation",
        ":)",
    ]

    # Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
    # and 'David Bowie' stay together, because our Unicode segmentation algorithm
    # is up to date
    assert tokenize("emoji test " + woman_with_headscarf, "en") == [
        "emoji",
        "test",
        woman_with_headscarf,
    ]
    assert tokenize(
        man_singer + " Planet Earth is blue, and there's nothing I can do 🌎🚀", "en"
    ) == [
        man_singer,
        "planet",
        "earth",
        "is",
        "blue",
        "and",
        "there's",
        "nothing",
        "i",
        "can",
        "do",
        "🌎",
        "🚀",
    ]

    # Water wave, surfer, flag of California (indicates ridiculously complete support
    # for Unicode 10 and Emoji 5.0). The trailing apostrophe in the input is
    # not expected to show up in the output.
    assert tokenize("Surf's up 🌊🏄" + california_flag + "'", "en") == [
        "surf's",
        "up",
        "🌊",
        "🏄",
        california_flag,
    ]
|
2017-08-08 15:35:28 +00:00
|
|
|
|
|
2015-06-30 19:14:02 +00:00
|
|
|
|
|
|
|
|
|
def test_casefolding():
    """Check language-specific case folding in German and Turkish."""
    # (text, language, expected single token), checked in this order.
    folding_cases = (
        # German: both "SS" and the sharp s come out as "ss".
        ("WEISS", "de", "weiss"),
        ("weiß", "de", "weiss"),
        # Turkish: dotted capital I lowers to "i", plain capital I to dotless "ı".
        ("İstanbul", "tr", "istanbul"),
        ("SIKISINCA", "tr", "sıkısınca"),
    )
    for text, lang, expected_token in folding_cases:
        assert tokenize(text, lang) == [expected_token]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_normalization():
    """Check that ``tokenize`` and ``lossy_tokenize`` agree on a title with stylized letters."""
    title = '"715 - CRΣΣKS" by Bon Iver'

    # Quotes and the lone hyphen are dropped; the capital sigmas are lowercased.
    expected_tokens = ["715", "crσσks", "by", "bon", "iver"]

    assert tokenize(title, "en") == expected_tokens
    assert lossy_tokenize(title, "en") == expected_tokens
|
2017-01-05 22:59:26 +00:00
|
|
|
|
|
|
|
|
|
|
2021-09-02 17:47:47 +00:00
|
|
|
|
def test_uncurl_quotes():
    """Check that a curly apostrophe is treated like a straight one."""
    curly = "let’s"
    straight = "let's"

    # Lossy tokenization turns the curly apostrophe into a straight one...
    assert lossy_tokenize(curly, "en") == [straight]

    # ...so both spellings look up the same frequency.
    curly_freq = word_frequency(curly, "en")
    straight_freq = word_frequency(straight, "en")
    assert curly_freq == straight_freq
|
2021-09-02 17:47:47 +00:00
|
|
|
|
|
|
|
|
|
|
2015-06-25 15:25:51 +00:00
|
|
|
|
def test_phrase_freq():
    """Check how the frequency of a multi-token phrase relates to its parts."""
    phrase_frequency = word_frequency("flip-flop", "en")
    assert phrase_frequency > 0

    # The reciprocal of the phrase frequency should be close (within 1%) to
    # the sum of the reciprocals of the individual token frequencies.
    flip_frequency = word_frequency("flip", "en")
    flop_frequency = word_frequency("flop", "en")
    reciprocal_sum = 1.0 / flip_frequency + 1.0 / flop_frequency
    assert 1.0 / phrase_frequency == pytest.approx(reciprocal_sum, rel=0.01)
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_not_really_random():
    """Check the random word generators when given zero bits of entropy per word."""
    # If your xkcd-style password comes out like this, maybe you shouldn't
    # use it
    english_words = random_words(nwords=4, lang="en", bits_per_word=0)
    assert english_words == "the the the the"

    # This not only tests random_ascii_words, it makes sure we didn't end
    # up with 'eos' as a very common Japanese word
    japanese_ascii_words = random_ascii_words(nwords=4, lang="ja", bits_per_word=0)
    assert japanese_ascii_words == "1 1 1 1"
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_not_enough_ascii():
    """Check that asking for 16 bits per word of ASCII words in Chinese raises ValueError."""
    request = {"lang": "zh", "bits_per_word": 16}
    with pytest.raises(ValueError):
        random_ascii_words(**request)
|
2015-07-07 19:10:59 +00:00
|
|
|
|
|
2015-07-20 20:48:36 +00:00
|
|
|
|
|
2016-07-15 19:10:25 +00:00
|
|
|
|
def test_arabic():
    """Check the normalizations applied when tokenizing Arabic text."""
    # Tatweels (elongation characters) are removed.
    stretched_word = "متــــــــعب"
    assert tokenize(stretched_word, "ar") == ["متعب"]

    # Combining marks are removed.
    vowelled_word = "حَرَكَات"
    assert tokenize(vowelled_word, "ar") == ["حركات"]

    # An Arabic ligature that is affected by NFKC normalization comes out
    # as its two separate letters.
    ligature = "\ufefb"
    separate_letters = "\u0644\u0627"
    assert tokenize(ligature, "ar") == [separate_letters]
|
2015-08-24 20:24:49 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_ideographic_fallback():
    """Check that Chinese text tokenized as English stays in one piece."""
    # Try tokenizing Chinese text as English -- it should remain stuck together.
    #
    # More complex examples like this, involving the multiple scripts of Japanese,
    # are in test_japanese.py.
    chinese_text = "中国文字"
    assert tokenize(chinese_text, "en") == [chinese_text]
|
2015-08-24 20:24:49 +00:00
|
|
|
|
|
2018-03-08 21:25:45 +00:00
|
|
|
|
|
2016-07-15 19:10:25 +00:00
|
|
|
|
def test_other_languages():
    """Check tokenization in Thai, Khmer, Hindi, Hebrew, Turkish, and Romanian."""
    # Test that we leave Thai letters stuck together. If we had better Thai support,
    # we would actually split this into a three-word phrase.
    thai_phrase = "การเล่นดนตรี"
    assert tokenize(thai_phrase, "th") == [thai_phrase]

    # The same holds when the Thai phrase is quoted inside an English sentence.
    mixed_sentence = '"' + thai_phrase + '" means "playing music"'
    assert tokenize(mixed_sentence, "en") == [thai_phrase, "means", "playing", "music"]

    # Test Khmer, a script similar to Thai
    khmer_phrase = "សូមស្វាគមន៍"
    assert tokenize(khmer_phrase, "km") == [khmer_phrase]

    # Test Hindi -- tokens split where there are spaces, and not where there aren't
    hindi_tokens = tokenize("हिन्दी विक्षनरी", "hi")
    assert hindi_tokens == ["हिन्दी", "विक्षनरी"]

    # Remove vowel points in Hebrew
    hebrew_tokens = tokenize("דֻּגְמָה", "he")
    assert hebrew_tokens == ["דגמה"]

    # Deal with commas, cedillas, and I's in Turkish: the comma-below form of
    # the letter comes out with a cedilla, in either case.
    for turkish_text in ("kișinin", "KİȘİNİN"):
        assert tokenize(turkish_text, "tr") == ["kişinin"]

    # Deal with cedillas that should be commas-below in Romanian
    for romanian_text in ("acelaşi", "ACELAŞI"):
        assert tokenize(romanian_text, "ro") == ["același"]
|