from wordfreq import tokenize, word_frequency


def test_apostrophes():
    # Test that we handle apostrophes in French reasonably.
    assert tokenize("qu'un", 'fr') == ['qu', 'un']
    assert tokenize("qu'un", 'fr', include_punctuation=True) == ["qu'", "un"]
    assert tokenize("langues d'oïl", 'fr') == ['langues', "d", 'oïl']
    assert tokenize("langues d'oïl", 'fr', include_punctuation=True) == ['langues', "d'", 'oïl']
    assert tokenize("l'heure", 'fr') == ['l', 'heure']
    assert tokenize("l'ànima", 'ca') == ['l', 'ànima']
    assert tokenize("l'heure", 'fr', include_punctuation=True) == ["l'", 'heure']
    assert tokenize("L'Hôpital", 'fr', include_punctuation=True) == ["l'", 'hôpital']
    assert tokenize("aujourd'hui", 'fr') == ["aujourd'hui"]
    assert tokenize("This isn't French", 'en') == ['this', "isn't", 'french']


def test_catastrophes():
    # More apostrophes, but this time they're in Catalan, and there's other
    # mid-word punctuation going on too.
    assert tokenize("M'acabo d'instal·lar.", 'ca') == ['m', 'acabo', 'd', 'instal·lar']
    assert (
        tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True) ==
        ["m'", 'acabo', "d'", 'instal·lar', '.']
    )
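

# Illustrative sketch, not part of the original tests: the Catalan interpunct
# (as in the geminated l of 'instal·lar') is assumed to stay word-internal for
# other words as well.  This reflects the tokenizer's Unicode word
# segmentation rather than a documented guarantee.
def test_interpunct_sketch():
    assert tokenize("intel·ligent", 'ca') == ['intel·ligent']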


def test_alternate_codes():
    # Try over-long language codes for French and Catalan
    assert tokenize("qu'un", 'fra') == ['qu', 'un']
    assert tokenize("qu'un", 'fre') == ['qu', 'un']
    assert tokenize("M'acabo d'instal·lar.", 'cat') == ['m', 'acabo', 'd', 'instal·lar']
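

# Illustrative sketch, not part of the original tests: language-code
# normalization is assumed to apply to word_frequency() as well, so the
# bibliographic code 'fra' should resolve to the same French wordlist as 'fr'.
def test_alternate_codes_frequency_sketch():
    assert word_frequency('heure', 'fra') == word_frequency('heure', 'fr')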