2016-12-05 23:42:16 +00:00
|
|
|
from nose.tools import eq_, assert_almost_equal
|
|
|
|
from wordfreq import tokenize, word_frequency
|
|
|
|
|
|
|
|
|
|
|
|
def test_apostrophes():
    # Apostrophe handling in French: an elided article or pronoun is split
    # from the word that follows it, and with include_punctuation=True the
    # apostrophe stays attached to the first piece.  "aujourd'hui" is
    # expected to remain one token, and so is English "isn't".
    keep_punct = {'include_punctuation': True}

    # Each case is (text, language, extra tokenize() keywords, expected).
    cases = [
        ("qu'un", 'fr', {}, ['qu', 'un']),
        ("qu'un", 'fr', keep_punct, ["qu'", "un"]),
        ("langues d'oïl", 'fr', {}, ['langues', "d", 'oïl']),
        ("langues d'oïl", 'fr', keep_punct, ['langues', "d'", 'oïl']),
        ("l'heure", 'fr', {}, ['l', 'heure']),
        ("l'heure", 'fr', keep_punct, ["l'", 'heure']),
        ("L'Hôpital", 'fr', keep_punct, ["l'", 'hôpital']),
        ("aujourd'hui", 'fr', {}, ["aujourd'hui"]),
        ("This isn't French", 'en', {}, ['this', "isn't", 'french']),
    ]

    for text, lang, options, expected in cases:
        eq_(tokenize(text, lang, **options), expected)
|
2016-12-05 23:42:16 +00:00
|
|
|
|
2016-12-05 23:48:02 +00:00
|
|
|
|
2016-12-06 22:37:35 +00:00
|
|
|
def test_catastrophes():
    # Catalan also elides with apostrophes, and this sentence additionally
    # contains a mid-word interpunct ("instal·lar") that is expected to stay
    # inside its token.
    sentence = "M'acabo d'instal·lar."

    # Default mode: apostrophes and the final period are dropped.
    bare_tokens = tokenize(sentence, 'ca')
    eq_(bare_tokens, ['m', 'acabo', 'd', 'instal·lar'])

    # Punctuation mode: apostrophes stay on the elided piece and the final
    # period becomes its own token.
    punctuated_tokens = tokenize(sentence, 'ca', include_punctuation=True)
    eq_(punctuated_tokens, ["m'", 'acabo', "d'", 'instal·lar', '.'])
|
2017-04-27 19:09:59 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_alternate_codes():
    # Three-letter language codes are expected to give the same apostrophe
    # splitting as the two-letter codes for French and Catalan.
    for french_code in ('fra', 'fre'):
        eq_(tokenize("qu'un", french_code), ['qu', 'un'])

    catalan_tokens = tokenize("M'acabo d'instal·lar.", 'cat')
    eq_(catalan_tokens, ['m', 'acabo', 'd', 'instal·lar'])
|
|
|
|
|