# wordfreq/tests/test_french_and_related.py

from wordfreq import tokenize, word_frequency
def test_apostrophes():
    """Test that we handle apostrophes in French reasonably.

    Checks the tokens returned by ``tokenize`` for elided forms such as
    "qu'un" and "l'heure", both with and without ``include_punctuation``.
    One Catalan and one English phrase are included for comparison.
    """
    # Without punctuation, the elided prefix loses its apostrophe; with
    # include_punctuation=True the apostrophe stays attached to the prefix.
    assert tokenize("qu'un", 'fr') == ['qu', 'un']
    assert tokenize("qu'un", 'fr', include_punctuation=True) == ["qu'", "un"]
    assert tokenize("langues d'oïl", 'fr') == ['langues', "d", 'oïl']
    assert tokenize("langues d'oïl", 'fr', include_punctuation=True) == ['langues', "d'", 'oïl']
    assert tokenize("l'heure", 'fr') == ['l', 'heure']
    # Same elision pattern, but with the Catalan language code.
    assert tokenize("l'ànima", 'ca') == ['l', 'ànima']
    assert tokenize("l'heure", 'fr', include_punctuation=True) == ["l'", 'heure']
    # Output tokens are expected in lowercase even when the input is capitalized.
    assert tokenize("L'Hôpital", 'fr', include_punctuation=True) == ["l'", 'hôpital']
    # "aujourd'hui" is expected to stay a single token.
    assert tokenize("aujourd'hui", 'fr') == ["aujourd'hui"]
    # In English, the contraction is expected to be kept intact.
    assert tokenize("This isn't French", 'en') == ['this', "isn't", 'french']


def test_catastrophes():
    """Test apostrophes in Catalan alongside other mid-word punctuation.

    The sample sentence combines elided prefixes ("M'", "d'") with the
    middle dot in "instal·lar", which is expected to stay inside its token.
    """
    # More apostrophes, but this time they're in Catalan, and there's other
    # mid-word punctuation going on too.
    assert tokenize("M'acabo d'instal·lar.", 'ca') == ['m', 'acabo', 'd', 'instal·lar']
    # With punctuation included, the apostrophes stay on the prefixes and
    # the final period becomes its own token.
    assert (
        tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True) ==
        ["m'", 'acabo', "d'", 'instal·lar', '.']
    )


def test_alternate_codes():
    """Test that three-letter language codes tokenize like the short ones.

    'fra' and 'fre' are expected to give the same tokens as 'fr', and
    'cat' the same tokens as 'ca'.
    """
    # Try over-long language codes for French and Catalan
    assert tokenize("qu'un", 'fra') == ['qu', 'un']
    assert tokenize("qu'un", 'fre') == ['qu', 'un']
    assert tokenize("M'acabo d'instal·lar.", 'cat') == ['m', 'acabo', 'd', 'instal·lar']