From ff5a8f2a653a47c5a2c16ef0d25d470df853782d Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Mon, 5 Dec 2016 18:42:16 -0500
Subject: [PATCH] add tests for French apostrophe tokenization

---
 tests/test_french.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 tests/test_french.py

diff --git a/tests/test_french.py b/tests/test_french.py
new file mode 100644
index 0000000..621a118
--- /dev/null
+++ b/tests/test_french.py
@@ -0,0 +1,32 @@
+from nose.tools import eq_, assert_almost_equal
+from wordfreq import tokenize, word_frequency
+
+
+def test_apostrophes():
+    """
+    Check how `tokenize` splits words around apostrophes.
+
+    The same expectations are asserted for each of the language codes
+    'fr', 'ca', and 'oc'.
+    """
+    for lang in ('fr', 'ca', 'oc'):
+        # By default, the elided word is split off and its apostrophe is
+        # not part of the token.
+        eq_(tokenize("langues d'oïl", lang),
+            ['langues', "d", 'oïl'])
+        # With include_punctuation=True, the apostrophe stays attached to
+        # the elided word.
+        eq_(tokenize("langues d'oïl", lang, include_punctuation=True),
+            ['langues', "d'", 'oïl'])
+        eq_(tokenize("l'heure", lang),
+            ['l', 'heure'])
+        eq_(tokenize("l'heure", lang, include_punctuation=True),
+            ["l'", 'heure'])
+        # Tokens come back lowercased.
+        eq_(tokenize("L'Hôpital", lang, include_punctuation=True),
+            ["l'", 'hôpital'])
+        # An apostrophe in the middle of an English contraction does not
+        # split the word.
+        eq_(tokenize("This isn't French", lang),
+            ['this', "isn't", 'french'])
+