mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
add tests for French apostrophe tokenization
This commit is contained in:
parent
596368ac6e
commit
ff5a8f2a65
19
tests/test_french.py
Normal file
19
tests/test_french.py
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
from nose.tools import eq_, assert_almost_equal
|
||||||
|
from wordfreq import tokenize, word_frequency
|
||||||
|
|
||||||
|
|
||||||
|
def test_apostrophes():
    """Check tokenization of apostrophes for the 'fr', 'ca' and 'oc' codes.

    For each of the three language codes, the same expectations are asserted:

    * By default, an elided article or preposition ("d'", "l'") comes out as
      its own token with the apostrophe dropped.
    * With ``include_punctuation=True``, the apostrophe stays attached to
      the elided token ("d'", "l'"), and the output is lowercased.
    * An English contraction such as "isn't" is expected to remain a single
      token.
    """
    # Each case is (input text, extra keyword arguments, expected tokens).
    # The cases are checked in this order for every language, so the first
    # failing assertion is the one that gets reported.
    keep_punct = {'include_punctuation': True}
    cases = (
        ("langues d'oïl", {}, ['langues', "d", 'oïl']),
        ("langues d'oïl", keep_punct, ['langues', "d'", 'oïl']),
        ("l'heure", {}, ['l', 'heure']),
        ("l'heure", keep_punct, ["l'", 'heure']),
        ("L'Hôpital", keep_punct, ["l'", 'hôpital']),
        ("This isn't French", {}, ['this', "isn't", 'french']),
    )

    for lang in ('fr', 'ca', 'oc'):
        for text, options, expected in cases:
            eq_(tokenize(text, lang, **options), expected)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user