wordfreq/tests/test_apostrophes.py

from wordfreq import tokenize


def test_apostrophes():
    # Apostrophe handling, mostly in French, with a few Catalan, Italian and
    # English cases for comparison. Each case is
    # (text, language code, extra tokenize() keyword arguments, expected tokens),
    # and the cases are checked in the order they are listed.
    words_only = {}
    keep_punct = {"include_punctuation": True}

    cases = [
        ("qu'un", "fr", words_only, ["qu", "un"]),
        ("qu'un", "fr", keep_punct, ["qu'", "un"]),
        ("langues d'oïl", "fr", words_only, ["langues", "d", "oïl"]),
        ("langues d'oïl", "fr", keep_punct, ["langues", "d'", "oïl"]),
        ("l'heure", "fr", words_only, ["l", "heure"]),
        ("l'ànima", "ca", words_only, ["l", "ànima"]),
        ("l'anima", "it", words_only, ["l", "anima"]),
        ("l'heure", "fr", keep_punct, ["l'", "heure"]),
        ("L'Hôpital", "fr", keep_punct, ["l'", "hôpital"]),
        ("aujourd'hui", "fr", words_only, ["aujourd'hui"]),
        ("This isn't French", "en", words_only, ["this", "isn't", "french"]),
        # Not ideal: we would prefer "dell'" to be treated the same way as
        # "l'", but this is the most consistent result available without
        # Italian-specific rules.
        #
        # Versions of regex from 2019 and earlier would give
        # ['dell', 'anima'], which is better but inconsistent.
        ("dell'anima", "it", words_only, ["dell'anima"]),
        # Versions of regex from 2019 and earlier would give ['hawai', 'i'],
        # which shows why the apostrophe-vowel fix must not apply everywhere.
        ("hawai'i", "en", words_only, ["hawai'i"]),
    ]

    for text, lang, options, expected in cases:
        assert tokenize(text, lang, **options) == expected


def test_catastrophes():
    # Catalan this time: apostrophes again, combined with another kind of
    # mid-word punctuation (the middle dot in "instal·lar").
    sentence = "M'acabo d'instal·lar."

    bare_tokens = tokenize(sentence, "ca")
    assert bare_tokens == ["m", "acabo", "d", "instal·lar"]

    punctuated_tokens = tokenize(sentence, "ca", include_punctuation=True)
    assert punctuated_tokens == ["m'", "acabo", "d'", "instal·lar", "."]


def test_alternate_codes():
    # Over-long language codes should tokenize the same way as the short ones:
    # two alternate codes for French, then one for Catalan.
    for french_code in ("fra", "fre"):
        assert tokenize("qu'un", french_code) == ["qu", "un"]

    catalan_tokens = tokenize("M'acabo d'instal·lar.", "cat")
    assert catalan_tokens == ["m", "acabo", "d", "instal·lar"]
v3.1: support py3.12, update formatting, replace pkg_resources with locate 2023-11-21 23:07:04 +00:00			`from wordfreq import tokenize`
add tests for French apostrophe tokenization 2016-12-05 23:42:16 +00:00

			`def test_apostrophes():`
Bake the 'h special case into the regex This lets me remove the French-specific code I just put in. 2016-12-06 22:37:35 +00:00			`# Test that we handle apostrophes in French reasonably.`
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`assert tokenize("qu'un", "fr") == ["qu", "un"]`
			`assert tokenize("qu'un", "fr", include_punctuation=True) == ["qu'", "un"]`
			`assert tokenize("langues d'oïl", "fr") == ["langues", "d", "oïl"]`
			`assert tokenize("langues d'oïl", "fr", include_punctuation=True) == [`
			`"langues",`
			`"d'",`
			`"oïl",`
			`]`
			`assert tokenize("l'heure", "fr") == ["l", "heure"]`
			`assert tokenize("l'ànima", "ca") == ["l", "ànima"]`
			`assert tokenize("l'anima", "it") == ["l", "anima"]`
			`assert tokenize("l'heure", "fr", include_punctuation=True) == ["l'", "heure"]`
			`assert tokenize("L'Hôpital", "fr", include_punctuation=True) == ["l'", "hôpital"]`
			`assert tokenize("aujourd'hui", "fr") == ["aujourd'hui"]`
			`assert tokenize("This isn't French", "en") == ["this", "isn't", "french"]`
add tests for French apostrophe tokenization 2016-12-05 23:42:16 +00:00
update dependencies and test for consistent results 2020-09-08 20:03:33 +00:00			`# This next behavior is not ideal -- we would prefer "dell'" to be handled`
			`# the same as "l'" -- but this is the most consistent result we can get without`
			`# Italian-specific rules.`
			`#`
			`# Versions of regex from 2019 and earlier would give ['dell', 'anima'], which`
			`# is better but inconsistent.`
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`assert tokenize("dell'anima", "it") == ["dell'anima"]`
update dependencies and test for consistent results 2020-09-08 20:03:33 +00:00
			`# Versions of regex from 2019 and earlier would give ['hawai', 'i'], and that's`
			`# an example of why we don't want the apostrophe-vowel fix to apply everywhere.`
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`assert tokenize("hawai'i", "en") == ["hawai'i"]`
update dependencies and test for consistent results 2020-09-08 20:03:33 +00:00
add a specific test in Catalan 2016-12-05 23:48:02 +00:00
Bake the 'h special case into the regex This lets me remove the French-specific code I just put in. 2016-12-06 22:37:35 +00:00			`def test_catastrophes():`
			`# More apostrophes, but this time they're in Catalan, and there's other`
			`# mid-word punctuation going on too.`
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`assert tokenize("M'acabo d'instal·lar.", "ca") == ["m", "acabo", "d", "instal·lar"]`
			`assert tokenize("M'acabo d'instal·lar.", "ca", include_punctuation=True) == [`
			`"m'",`
			`"acabo",`
			`"d'",`
			`"instal·lar",`
			`".",`
			`]`
Use langcodes when tokenizing again (it no longer connects to a DB) 2017-04-27 19:09:59 +00:00

			`def test_alternate_codes():`
			`# Try over-long language codes for French and Catalan`
estimate the freq distribution of numbers 2022-03-10 23:33:42 +00:00			`assert tokenize("qu'un", "fra") == ["qu", "un"]`
			`assert tokenize("qu'un", "fre") == ["qu", "un"]`
			`assert tokenize("M'acabo d'instal·lar.", "cat") == ["m", "acabo", "d", "instal·lar"]`