# wordfreq/tests/test_general.py

from wordfreq import (
    word_frequency,
    available_languages,
    cB_to_freq,
    top_n_list,
    random_words,
    random_ascii_words,
    tokenize,
    lossy_tokenize,
)
import pytest


def test_freq_examples():
    """Spot-check a few word frequencies in English and Spanish."""
    # A language's own stopword must outrank the other language's stopword.
    the_in_en = word_frequency("the", "en")
    de_in_en = word_frequency("de", "en")
    assert the_in_en > de_in_en

    de_in_es = word_frequency("de", "es")
    the_in_es = word_frequency("the", "es")
    assert de_in_es > the_in_es

    # With no wordlist specified, the 'large' list is used when available,
    # so this word has a nonzero frequency.
    infrequency_freq = word_frequency("infrequency", "en")
    assert infrequency_freq > 0.0


def test_languages():
    """Compare language coverage across wordlists and look up '2' everywhere."""
    # The default ('best') wordlist should cover every language.
    best_langs = available_languages()
    assert len(best_langs) >= 34

    # 'small' has the same number of languages, though the mapping itself
    # is not identical to the default one.
    small_langs = available_languages("small")
    assert len(small_langs) == len(best_langs)
    assert small_langs != best_langs

    # 'combined' is an older name that must give the same result as 'small'.
    combined_langs = available_languages("combined")
    assert combined_langs == small_langs

    # 'large' exists for fewer languages than the default list.
    large_langs = available_languages("large")
    assert len(large_langs) >= 14
    assert len(best_langs) > len(large_langs)

    # The digit '2' must have a nonzero frequency in every language, both
    # under the plain code and under a needlessly verbose variant of it.
    for code in best_langs:
        assert word_frequency("2", code) > 0

        verbose_code = "%s-001-x-fake-ext" % code.upper()
        assert word_frequency("2", verbose_code) > 0


def test_minimums():
    """Check how the `minimum` argument affects the returned frequency."""
    # This word is expected to have a frequency of exactly 0 in English.
    unlisted_word = "esquivalience"
    assert word_frequency(unlisted_word, "en") == 0

    # With a minimum supplied, that minimum comes back instead of 0.
    floor = 1e-6
    assert word_frequency(unlisted_word, "en", minimum=floor) == floor

    # A minimum of 1 overrides even the frequency of "the".
    assert word_frequency("the", "en", minimum=1) == 1


def test_most_common_words():
    """Pin the single most common word of each listed language."""
    # If something causes the most common words in well-supported languages to
    # change, we should know.
    #
    # Each entry is (language code, expected top word); entries are checked
    # in the order they are listed.
    expected_top_words = [
        ("ar", "في"),
        ("bg", "на"),
        ("bn", "না"),
        ("ca", "de"),
        ("cs", "a"),
        ("da", "i"),
        ("el", "και"),
        ("de", "die"),
        ("en", "the"),
        ("es", "de"),
        ("fi", "ja"),
        ("fil", "sa"),
        ("fr", "de"),
        ("he", "את"),
        ("hi", "के"),
        ("hu", "a"),
        ("id", "yang"),
        ("is", "og"),
        ("it", "di"),
        ("ja", "の"),
        ("ko", "이"),
        ("lt", "ir"),
        ("lv", "un"),
        ("mk", "на"),
        ("ms", "yang"),
        ("nb", "i"),
        ("nl", "de"),
        ("pl", "w"),
        ("pt", "de"),
        ("ro", "de"),
        ("ru", "в"),
        ("sh", "je"),
        ("sk", "a"),
        ("sl", "je"),
        ("sv", "är"),
        ("ta", "ஒரு"),
        ("tr", "ve"),
        ("uk", "в"),
        ("ur", "کے"),
        ("vi", "là"),
        ("zh", "的"),
    ]

    for lang, top_word in expected_top_words:
        # top_n_list(lang, 1) is a one-element list holding the top word.
        assert top_n_list(lang, 1)[0] == top_word


def test_language_matching():
    """Several Chinese-related language codes must give the same frequency as 'zh'."""
    baseline = word_frequency("的", "zh")

    related_codes = ("zh-TW", "zh-CN", "zh-Hant", "zh-Hans", "yue-CN", "cmn")
    for code in related_codes:
        assert word_frequency("的", code) == baseline


def test_cB_conversion():
    """Check cB_to_freq on zero and on two negative values."""
    # Zero maps to a frequency of exactly 1.
    assert cB_to_freq(0) == 1.0

    # Negative inputs are compared approximately, since the results are floats.
    approximate_cases = ((-100, 0.1), (-600, 1e-6))
    for cb_value, expected_freq in approximate_cases:
        assert cB_to_freq(cb_value) == pytest.approx(expected_freq)


def test_failed_cB_conversion():
    """cB_to_freq(1) must raise ValueError."""
    invalid_cb = 1
    with pytest.raises(ValueError):
        cB_to_freq(invalid_cb)


def test_tokenization():
    """Exercise English tokenization of apostrophes, punctuation, and emoji."""
    # Apostrophes inside a word are preserved, so "don't" stays one token.
    sentence = "I don't split at apostrophes, you see."
    plain_tokens = tokenize(sentence, "en")
    assert plain_tokens == ["i", "don't", "split", "at", "apostrophes", "you", "see"]

    # With include_punctuation=True, the comma and the period become tokens.
    punctuated_tokens = tokenize(sentence, "en", include_punctuation=True)
    assert punctuated_tokens == [
        "i",
        "don't",
        "split",
        "at",
        "apostrophes",
        ",",
        "you",
        "see",
        ".",
    ]

    # Certain punctuation does not inherently split a word: "zombo.com"
    # stays whole.
    domain_tokens = tokenize("Anything is possible at zombo.com", "en")
    assert domain_tokens == ["anything", "is", "possible", "at", "zombo.com"]

    # Splits occur after symbols, and at splitting punctuation such as hyphens.
    assert tokenize("😂test", "en") == ["😂", "test"]
    assert tokenize("flip-flop", "en") == ["flip", "flop"]
    ellipsis_tokens = tokenize(
        "this text has... punctuation :)", "en", include_punctuation=True
    )
    assert ellipsis_tokens == ["this", "text", "has", "...", "punctuation", ":)"]

    # Multi-codepoint emoji sequences such as 'medium-skinned woman with
    # headscarf' and 'David Bowie' must each stay together as a single token.
    assert tokenize("emoji test 🧕🏽", "en") == ["emoji", "test", "🧕🏽"]
    lyric_tokens = tokenize(
        "👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", "en"
    )
    assert lyric_tokens == [
        "👨‍🎤",
        "planet",
        "earth",
        "is",
        "blue",
        "and",
        "there's",
        "nothing",
        "i",
        "can",
        "do",
        "🌎",
        "🚀",
    ]

    # Water wave, surfer, flag of California (indicates ridiculously complete
    # support for Unicode 10 and Emoji 5.0). The trailing apostrophe in the
    # input does not appear in the output.
    surf_tokens = tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'", "en")
    assert surf_tokens == ["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"]


def test_casefolding():
    """Tokens are case-folded, with Turkish-specific handling of dotted/dotless I."""
    # Each entry is (input text, language code, expected single token).
    cases = [
        ("WEISS", "de", "weiss"),
        ("weiß", "de", "weiss"),
        ("İstanbul", "tr", "istanbul"),
        ("SIKISINCA", "tr", "sıkısınca"),
    ]
    for text, lang, folded in cases:
        assert tokenize(text, lang) == [folded]


def test_normalization():
    """tokenize and lossy_tokenize must agree on a title with capital sigmas."""
    title = '"715 - CRΣΣKS" by Bon Iver'
    expected_tokens = ["715", "crσσks", "by", "bon", "iver"]

    assert tokenize(title, "en") == expected_tokens
    assert lossy_tokenize(title, "en") == expected_tokens


def test_uncurl_quotes():
    """A curly apostrophe must be treated the same as a straight one."""
    # lossy_tokenize turns the curly apostrophe into a straight one.
    assert lossy_tokenize("let’s", "en") == ["let's"]

    # Both spellings must therefore have the same frequency.
    curly_freq = word_frequency("let’s", "en")
    straight_freq = word_frequency("let's", "en")
    assert curly_freq == straight_freq


def test_phrase_freq():
    """Check the frequency of 'flip-flop' against the frequencies of its parts."""
    compound_freq = word_frequency("flip-flop", "en")
    assert compound_freq > 0

    # The reciprocal of the compound's frequency should equal the sum of the
    # reciprocals of the parts' frequencies, to within 1% relative error.
    inverse_flip = 1.0 / word_frequency("flip", "en")
    inverse_flop = 1.0 / word_frequency("flop", "en")
    inverse_sum = inverse_flip + inverse_flop
    assert 1.0 / compound_freq == pytest.approx(inverse_sum, rel=0.01)


def test_not_really_random():
    """With bits_per_word=0, the 'random' word generators are deterministic."""
    # If your xkcd-style password comes out like this, maybe you shouldn't
    # use it
    english_words = random_words(nwords=4, lang="en", bits_per_word=0)
    assert english_words == "the the the the"

    # This not only tests random_ascii_words, it makes sure we didn't end
    # up with 'eos' as a very common Japanese word
    japanese_ascii_words = random_ascii_words(nwords=4, lang="ja", bits_per_word=0)
    assert japanese_ascii_words == "1 1 1 1"


def test_not_enough_ascii():
    """Asking for 16 bits per word of ASCII words in Chinese must raise ValueError."""
    request = {"lang": "zh", "bits_per_word": 16}
    with pytest.raises(ValueError):
        random_ascii_words(**request)


def test_arabic():
    """Arabic tokenization drops tatweels and combining marks and applies NFKC."""
    # Each entry is (input text, expected single token).
    cases = [
        # Remove tatweels
        ("متــــــــعب", "متعب"),
        # Remove combining marks
        ("حَرَكَات", "حركات"),
        # An Arabic ligature that is affected by NFKC normalization
        ("\ufefb", "\u0644\u0627"),
    ]
    for text, expected_token in cases:
        assert tokenize(text, "ar") == [expected_token]


def test_ideographic_fallback():
    """Chinese text tokenized as English must come back as one unsplit token."""
    # More complex examples like this, involving the multiple scripts of
    # Japanese, are in test_japanese.py.
    chinese_text = "中国文字"
    assert tokenize(chinese_text, "en") == [chinese_text]


def test_other_languages():
    """Tokenization checks for Thai, Khmer, Hindi, Hebrew, Turkish, and Romanian."""
    # Thai letters are left stuck together. If we had better Thai support,
    # we would actually split this into a three-word phrase.
    thai_phrase = "การเล่นดนตรี"
    assert tokenize(thai_phrase, "th") == [thai_phrase]
    mixed_tokens = tokenize('"การเล่นดนตรี" means "playing music"', "en")
    assert mixed_tokens == [thai_phrase, "means", "playing", "music"]

    # Khmer, a script similar to Thai, is likewise left as one token.
    khmer_phrase = "សូមស្វាគមន៍"
    assert tokenize(khmer_phrase, "km") == [khmer_phrase]

    # Hindi -- tokens split where there are spaces, and not where there aren't
    hindi_tokens = tokenize("हिन्दी विक्षनरी", "hi")
    assert hindi_tokens == ["हिन्दी", "विक्षनरी"]

    # Vowel points are removed in Hebrew.
    assert tokenize("דֻּגְמָה", "he") == ["דגמה"]

    # Turkish: commas-below become cedillas, and dotted capital I is folded.
    for turkish_text in ("kișinin", "KİȘİNİN"):
        assert tokenize(turkish_text, "tr") == ["kişinin"]

    # Romanian: cedillas that should be commas-below are corrected.
    for romanian_text in ("acelaşi", "ACELAŞI"):
        assert tokenize(romanian_text, "ro") == ["același"]
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								from wordfreq import (
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    word_frequency,
 								    available_languages,
 								    cB_to_freq,
 								    top_n_list,
 								    random_words,
 								    random_ascii_words,
 								    tokenize,
 								    lossy_tokenize,
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								)
-												port test.py and test_chinese.py to pytest

											
										
										
											2018-06-01 20:33:06 +00:00
+								import pytest
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
 								def test_freq_examples():
 								    # Stopwords are most common in the correct language
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert word_frequency("the", "en") > word_frequency("de", "en")
 								    assert word_frequency("de", "es") > word_frequency("the", "es")
-												Test that we can leave the wordlist unspecified and get 'large' freqs

											
										
										
											2018-03-08 23:09:57 +00:00
+								    # We get word frequencies from the 'large' list when available
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert word_frequency("infrequency", "en") > 0.0
-												Test that we can leave the wordlist unspecified and get 'large' freqs

											
										
										
											2018-03-08 23:09:57 +00:00
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
 								def test_languages():
-												reorganize wordlists into 'small', 'large', and 'best'

											
										
										
											2018-03-08 22:52:44 +00:00
+								    # Make sure we get all the languages when looking for the default
 								    # 'best' wordlist
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								    avail = available_languages()
-												fixes to tests, including that 'test.py' wasn't found by pytest

											
										
										
											2018-06-15 19:48:41 +00:00
+								    assert len(avail) >= 34
-												reorganize wordlists into 'small', 'large', and 'best'

											
										
										
											2018-03-08 22:52:44 +00:00
 								    # 'small' covers the same languages, but with some different lists
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    avail_small = available_languages("small")
-												port test.py and test_chinese.py to pytest

											
										
										
											2018-06-01 20:33:06 +00:00
+								    assert len(avail_small) == len(avail)
 								    assert avail_small != avail
-												reorganize wordlists into 'small', 'large', and 'best'

											
										
										
											2018-03-08 22:52:44 +00:00
 								    # 'combined' is the same as 'small'
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    avail_old_name = available_languages("combined")
-												port test.py and test_chinese.py to pytest

											
										
										
											2018-06-01 20:33:06 +00:00
+								    assert avail_old_name == avail_small
-												reorganize wordlists into 'small', 'large', and 'best'

											
										
										
											2018-03-08 22:52:44 +00:00
 								    # 'large' covers fewer languages
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    avail_large = available_languages("large")
-												update tests to include new languages

Also, it's easy to say `>=` in pytest

											
										
										
											2018-06-12 21:55:44 +00:00
+								    assert len(avail_large) >= 14
-												port test.py and test_chinese.py to pytest

											
										
										
											2018-06-01 20:33:06 +00:00
+								    assert len(avail) > len(avail_large)
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
-												Revise multilingual tests


Former-commit-id: 21246f881f0afe90408203f64e41059f02809da7
											
										
										
											2016-07-29 16:19:12 +00:00
+								    # Look up the digit '2' in the main word list for each language
 								    for lang in avail:
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								        assert word_frequency("2", lang) > 0
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
-												refactor the tokenizer, add `include_punctuation` option


Former-commit-id: e8e6e0a23196abf0ecc0cf3bc72ba9943226d119
											
										
										
											2015-09-15 17:26:09 +00:00
+								        # Make up a weirdly verbose language code and make sure
 								        # we still get it
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								        new_lang_code = "%s-001-x-fake-ext" % lang.upper()
 								        assert word_frequency("2", new_lang_code) > 0
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
-												updated minimum


Former-commit-id: 59c03e24118ffbd4159e1162a6a64ebf38bf4edb
											
										
										
											2015-07-07 19:46:33 +00:00
+								def test_minimums():
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert word_frequency("esquivalience", "en") == 0
 								    assert word_frequency("esquivalience", "en", minimum=1e-6) == 1e-6
 								    assert word_frequency("the", "en", minimum=1) == 1
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
-												Revise multilingual tests


Former-commit-id: 21246f881f0afe90408203f64e41059f02809da7
											
										
										
											2016-07-29 16:19:12 +00:00
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								def test_most_common_words():
 								    # If something causes the most common words in well-supported languages to
 								    # change, we should know.
 								    def get_most_common(lang):
 								        """
 								        Return the single most common word in the language.
 								        """
 								        return top_n_list(lang, 1)[0]
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert get_most_common("ar") == "في"
 								    assert get_most_common("bg") == "на"
 								    assert get_most_common("bn") == "না"
 								    assert get_most_common("ca") == "de"
 								    assert get_most_common("cs") == "a"
 								    assert get_most_common("da") == "i"
 								    assert get_most_common("el") == "και"
 								    assert get_most_common("de") == "die"
 								    assert get_most_common("en") == "the"
 								    assert get_most_common("es") == "de"
 								    assert get_most_common("fi") == "ja"
 								    assert get_most_common("fil") == "sa"
 								    assert get_most_common("fr") == "de"
 								    assert get_most_common("he") == "את"
 								    assert get_most_common("hi") == "के"
 								    assert get_most_common("hu") == "a"
 								    assert get_most_common("id") == "yang"
 								    assert get_most_common("is") == "og"
 								    assert get_most_common("it") == "di"
 								    assert get_most_common("ja") == "の"
 								    assert get_most_common("ko") == "이"
 								    assert get_most_common("lt") == "ir"
 								    assert get_most_common("lv") == "un"
 								    assert get_most_common("mk") == "на"
 								    assert get_most_common("ms") == "yang"
 								    assert get_most_common("nb") == "i"
 								    assert get_most_common("nl") == "de"
 								    assert get_most_common("pl") == "w"
 								    assert get_most_common("pt") == "de"
 								    assert get_most_common("ro") == "de"
 								    assert get_most_common("ru") == "в"
 								    assert get_most_common("sh") == "je"
 								    assert get_most_common("sk") == "a"
 								    assert get_most_common("sl") == "je"
 								    assert get_most_common("sv") == "är"
 								    assert get_most_common("ta") == "ஒரு"
 								    assert get_most_common("tr") == "ve"
 								    assert get_most_common("uk") == "в"
 								    assert get_most_common("ur") == "کے"
 								    assert get_most_common("vi") == "là"
 								    assert get_most_common("zh") == "的"
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
 								def test_language_matching():
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    freq = word_frequency("的", "zh")
 								    assert word_frequency("的", "zh-TW") == freq
 								    assert word_frequency("的", "zh-CN") == freq
 								    assert word_frequency("的", "zh-Hant") == freq
 								    assert word_frequency("的", "zh-Hans") == freq
 								    assert word_frequency("的", "yue-CN") == freq
 								    assert word_frequency("的", "cmn") == freq
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
-												Switch to a more precise centibel scale.


Former-commit-id: 7862a4d2b6b2e756f52b405e28e5049b7ef93bc2
											
										
										
											2015-06-22 21:36:30 +00:00
+								def test_cB_conversion():
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert cB_to_freq(0) == 1.0
-												port test.py and test_chinese.py to pytest

											
										
										
											2018-06-01 20:33:06 +00:00
+								    assert cB_to_freq(-100) == pytest.approx(0.1)
 								    assert cB_to_freq(-600) == pytest.approx(1e-6)
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
-												Switch to a more precise centibel scale.


Former-commit-id: 7862a4d2b6b2e756f52b405e28e5049b7ef93bc2
											
										
										
											2015-06-22 21:36:30 +00:00
+								def test_failed_cB_conversion():
-												port test.py and test_chinese.py to pytest

											
										
										
											2018-06-01 20:33:06 +00:00
+								    with pytest.raises(ValueError):
 								        cB_to_freq(1)
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
 								def test_tokenization():
 								    # We preserve apostrophes within words, so "can't" is a single word in the
-												updated comments


Former-commit-id: 131b916c579b8c45db0444a6eaffe51ef419039b
											
										
										
											2015-07-17 18:50:12 +00:00
+								    # data
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert tokenize("I don't split at apostrophes, you see.", "en") == [
 								        "i",
 								        "don't",
 								        "split",
 								        "at",
 								        "apostrophes",
 								        "you",
 								        "see",
 								    ]
 								    assert tokenize(
 								        "I don't split at apostrophes, you see.", "en", include_punctuation=True
 								    ) == ["i", "don't", "split", "at", "apostrophes", ",", "you", "see", "."]
-												updated tests for emoji splitting


Former-commit-id: 3bcb3e84a111ecba5b663ce18697109641b6a185
											
										
										
											2015-06-25 15:25:51 +00:00
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
+								    # Certain punctuation does not inherently split a word.
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert tokenize("Anything is possible at zombo.com", "en") == [
 								        "anything",
 								        "is",
 								        "possible",
 								        "at",
 								        "zombo.com",
 								    ]
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
 								    # Splits occur after symbols, and at splitting punctuation such as hyphens.
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert tokenize("😂test", "en") == ["😂", "test"]
 								    assert tokenize("flip-flop", "en") == ["flip", "flop"]
 								    assert tokenize(
 								        "this text has... punctuation :)", "en", include_punctuation=True
 								    ) == ["this", "text", "has", "...", "punctuation", ":)"]
-												refactor the tokenizer, add `include_punctuation` option


Former-commit-id: e8e6e0a23196abf0ecc0cf3bc72ba9943226d119
											
										
										
											2015-09-15 17:26:09 +00:00
-												Tokenize by graphemes, not codepoints (#50)

* Tokenize by graphemes, not codepoints

* Add more documentation to TOKEN_RE

* Remove extra line break

* Update docstring - Brahmic scripts are no longer an exception

* approve using version 2017.07.28 of regex

											
										
										
											2017-08-08 15:35:28 +00:00
+								    # Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
 								    # and 'David Bowie' stay together, because our Unicode segmentation algorithm
 								    # is up to date
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert tokenize("emoji test 🧕🏽", "en") == ["emoji", "test", "🧕🏽"]
 								    assert tokenize(
 								        "👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", "en"
 								    ) == [
 								        "👨‍🎤",
 								        "planet",
 								        "earth",
 								        "is",
 								        "blue",
 								        "and",
 								        "there's",
 								        "nothing",
 								        "i",
 								        "can",
 								        "do",
 								        "🌎",
 								        "🚀",
 								    ]
-												Tokenize by graphemes, not codepoints (#50)

* Tokenize by graphemes, not codepoints

* Add more documentation to TOKEN_RE

* Remove extra line break

* Update docstring - Brahmic scripts are no longer an exception

* approve using version 2017.07.28 of regex

											
										
										
											2017-08-08 15:35:28 +00:00
 								    # Water wave, surfer, flag of California (indicates ridiculously complete support
 								    # for Unicode 10 and Emoji 5.0)
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'", "en") == ["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"]
-												Tokenize by graphemes, not codepoints (#50)

* Tokenize by graphemes, not codepoints

* Add more documentation to TOKEN_RE

* Remove extra line break

* Update docstring - Brahmic scripts are no longer an exception

* approve using version 2017.07.28 of regex

											
										
										
											2017-08-08 15:35:28 +00:00
-												case-fold instead of just lowercasing tokens


Former-commit-id: 638467f60022c6933a9a2fb8ff1280d39e9a3d70
											
										
										
											2015-06-30 19:14:02 +00:00
 								def test_casefolding():
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert tokenize("WEISS", "de") == ["weiss"]
 								    assert tokenize("weiß", "de") == ["weiss"]
 								    assert tokenize("İstanbul", "tr") == ["istanbul"]
 								    assert tokenize("SIKISINCA", "tr") == ["sıkısınca"]
 								def test_normalization():
 								    assert tokenize('"715 - CRΣΣKS" by Bon Iver', "en") == [
 								        "715",
 								        "crσσks",
 								        "by",
 								        "bon",
 								        "iver",
 								    ]
 								    assert lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', "en") == [
 								        "715",
 								        "crσσks",
 								        "by",
 								        "bon",
 								        "iver",
 								    ]
-												import new wordlists from Exquisite Corpus

											
										
										
											2017-01-05 22:59:26 +00:00
-												use ftfy's uncurl_quotes in lossy_tokenize

											
										
										
											2021-09-02 17:47:47 +00:00
+								def test_uncurl_quotes():
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert lossy_tokenize("let’s", "en") == ["let's"]
 								    assert word_frequency("let’s", "en") == word_frequency("let's", "en")
-												use ftfy's uncurl_quotes in lossy_tokenize

											
										
										
											2021-09-02 17:47:47 +00:00
-												updated tests for emoji splitting


Former-commit-id: 3bcb3e84a111ecba5b663ce18697109641b6a185
											
										
										
											2015-06-25 15:25:51 +00:00
+								def test_phrase_freq():
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    ff = word_frequency("flip-flop", "en")
-												port test.py and test_chinese.py to pytest

											
										
										
											2018-06-01 20:33:06 +00:00
+								    assert ff > 0
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    phrase_freq = 1.0 / word_frequency("flip", "en") + 1.0 / word_frequency(
 								        "flop", "en"
 								    )
-												Round frequencies to 3 significant digits

											
										
										
											2018-06-15 19:42:54 +00:00
+								    assert 1.0 / ff == pytest.approx(phrase_freq, rel=0.01)
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
 								def test_not_really_random():
 								    # If your xkcd-style password comes out like this, maybe you shouldn't
 								    # use it
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert random_words(nwords=4, lang="en", bits_per_word=0) == "the the the the"
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
 								    # This not only tests random_ascii_words, it makes sure we didn't end
 								    # up with 'eos' as a very common Japanese word
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert random_ascii_words(nwords=4, lang="ja", bits_per_word=0) == "1 1 1 1"
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
 								def test_not_enough_ascii():
-												port test.py and test_chinese.py to pytest

											
										
										
											2018-06-01 20:33:06 +00:00
+								    with pytest.raises(ValueError):
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								        random_ascii_words(lang="zh", bits_per_word=16)
-												added arabic tests


Former-commit-id: f83d31a35774b08d40ab5c6a9fb8c09616e71819
											
										
										
											2015-07-07 19:10:59 +00:00
-												ensure removal of tatweels (hopefully)


Former-commit-id: 173278fdd3e5554063228abd1f2dac8f771995f6
											
										
										
											2015-07-20 20:48:36 +00:00
-												Tokenization in Korean, plus abjad languages (#38)

* Remove marks from more languages

* Add Korean tokenization, and include MeCab files in data

* add a Hebrew tokenization test

* fix terminology in docstrings about abjad scripts

* combine Japanese and Korean tokenization into the same function


Former-commit-id: fec6eddcc3475f49a7541d8d3202ec87e581ed53
											
										
										
											2016-07-15 19:10:25 +00:00
+								def test_arabic():
-												ensure removal of tatweels (hopefully)


Former-commit-id: 173278fdd3e5554063228abd1f2dac8f771995f6
											
										
										
											2015-07-20 20:48:36 +00:00
+								    # Remove tatweels
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert tokenize("متــــــــعب", "ar") == ["متعب"]
-												added arabic tests


Former-commit-id: f83d31a35774b08d40ab5c6a9fb8c09616e71819
											
										
										
											2015-07-07 19:10:59 +00:00
-												ensure removal of tatweels (hopefully)


Former-commit-id: 173278fdd3e5554063228abd1f2dac8f771995f6
											
										
										
											2015-07-20 20:48:36 +00:00
+								    # Remove combining marks
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert tokenize("حَرَكَات", "ar") == ["حركات"]
-												ensure removal of tatweels (hopefully)


Former-commit-id: 173278fdd3e5554063228abd1f2dac8f771995f6
											
										
										
											2015-07-20 20:48:36 +00:00
-												port test.py and test_chinese.py to pytest

											
										
										
											2018-06-01 20:33:06 +00:00
+								    # An Arabic ligature that is affected by NFKC normalization
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert tokenize("\ufefb", "ar") == ["\u0644\u0627"]
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
 								def test_ideographic_fallback():
-												tokenize Chinese using jieba and our own frequencies


Former-commit-id: 2327f2e4d61c25b29a00f8cbb4387cf59f520628
											
										
										
											2015-09-05 07:16:56 +00:00
+								    # Try tokenizing Chinese text as English -- it should remain stuck together.
-												Handle Japanese edge cases in simple_tokenize

											
										
										
											2018-04-26 19:53:07 +00:00
+								    #
 								    # More complex examples like this, involving the multiple scripts of Japanese,
 								    # are in test_japanese.py.
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert tokenize("中国文字", "en") == ["中国文字"]
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
-												Separate preprocessing from tokenization

											
										
										
											2018-03-08 21:25:45 +00:00
-												Tokenization in Korean, plus abjad languages (#38)

* Remove marks from more languages

* Add Korean tokenization, and include MeCab files in data

* add a Hebrew tokenization test

* fix terminology in docstrings about abjad scripts

* combine Japanese and Korean tokenization into the same function


Former-commit-id: fec6eddcc3475f49a7541d8d3202ec87e581ed53
											
										
										
											2016-07-15 19:10:25 +00:00
+								def test_other_languages():
-												move Thai test to where it makes more sense


Former-commit-id: 4ec6b56faab4bc5a698e48cca1493ed45c9de6ea
											
										
										
											2016-03-10 16:56:04 +00:00
+								    # Test that we leave Thai letters stuck together. If we had better Thai support,
 								    # we would actually split this into a three-word phrase.
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert tokenize("การเล่นดนตรี", "th") == ["การเล่นดนตรี"]
 								    assert tokenize('"การเล่นดนตรี" means "playing music"', "en") == [
 								        "การเล่นดนตรี",
 								        "means",
 								        "playing",
 								        "music",
 								    ]
-												move Thai test to where it makes more sense


Former-commit-id: 4ec6b56faab4bc5a698e48cca1493ed45c9de6ea
											
										
										
											2016-03-10 16:56:04 +00:00
-												Fix tokenization of SE Asian and South Asian scripts (#37)



Former-commit-id: 270f6c7ca616165e89ccbfa270d78eabc49782c4
											
										
										
											2016-07-01 22:00:57 +00:00
+								    # Test Khmer, a script similar to Thai
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert tokenize("សូមស្វាគមន៍", "km") == ["សូមស្វាគមន៍"]
-												Fix tokenization of SE Asian and South Asian scripts (#37)



Former-commit-id: 270f6c7ca616165e89ccbfa270d78eabc49782c4
											
										
										
											2016-07-01 22:00:57 +00:00
 								    # Test Hindi -- tokens split where there are spaces, and not where there aren't
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert tokenize("हिन्दी विक्षनरी", "hi") == ["हिन्दी", "विक्षनरी"]
-												Tokenization in Korean, plus abjad languages (#38)

* Remove marks from more languages

* Add Korean tokenization, and include MeCab files in data

* add a Hebrew tokenization test

* fix terminology in docstrings about abjad scripts

* combine Japanese and Korean tokenization into the same function


Former-commit-id: fec6eddcc3475f49a7541d8d3202ec87e581ed53
											
										
										
											2016-07-15 19:10:25 +00:00
 								    # Remove vowel points in Hebrew
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert tokenize("דֻּגְמָה", "he") == ["דגמה"]
-												Tokenization in Korean, plus abjad languages (#38)

* Remove marks from more languages

* Add Korean tokenization, and include MeCab files in data

* add a Hebrew tokenization test

* fix terminology in docstrings about abjad scripts

* combine Japanese and Korean tokenization into the same function


Former-commit-id: fec6eddcc3475f49a7541d8d3202ec87e581ed53
											
										
										
											2016-07-15 19:10:25 +00:00
-												Add Common Crawl data and more languages (#39)

This changes the version from 1.4.2 to 1.5.  Things done in this update include:

* include Common Crawl; support 11 more languages

* new frequency-merging strategy

* New sources: Chinese from Wikipedia (mostly Trad.), Dutch big list

* Remove kinda bad sources, i.e. Greek Twitter (too often kaomoji are detected as Greek) and Ukrainian Common Crawl. This results in dropping Ukrainian as an available language, and causing Greek to not be a 'large' language after all.

* Add Korean tokenization, and include MeCab files in data

* Remove marks from more languages

* Deal with commas and cedillas in Turkish and Romanian



Former-commit-id: e6a8f028e3ff73b5d27b8f6a6ad8b26f439c00ec
											
										
										
											2016-07-28 23:23:17 +00:00
+								    # Deal with commas, cedillas, and I's in Turkish
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert tokenize("kișinin", "tr") == ["kişinin"]
 								    assert tokenize("KİȘİNİN", "tr") == ["kişinin"]
-												Add Common Crawl data and more languages (#39)

This changes the version from 1.4.2 to 1.5.  Things done in this update include:

* include Common Crawl; support 11 more languages

* new frequency-merging strategy

* New sources: Chinese from Wikipedia (mostly Trad.), Dutch big list

* Remove kinda bad sources, i.e. Greek Twitter (too often kaomoji are detected as Greek) and Ukrainian Common Crawl. This results in dropping Ukrainian as an available language, and causing Greek to not be a 'large' language after all.

* Add Korean tokenization, and include MeCab files in data

* Remove marks from more languages

* Deal with commas and cedillas in Turkish and Romanian



Former-commit-id: e6a8f028e3ff73b5d27b8f6a6ad8b26f439c00ec
											
										
										
											2016-07-28 23:23:17 +00:00
 								    # Deal with cedillas that should be commas-below in Romanian
-												estimate the freq distribution of numbers

											
										
										
											2022-03-10 23:33:42 +00:00
+								    assert tokenize("acelaşi", "ro") == ["același"]
 								    assert tokenize("ACELAŞI", "ro") == ["același"]