# wordfreq/tests/test_french_and_related.py

from nose.tools import eq_, assert_almost_equal
from wordfreq import tokenize, word_frequency


def test_apostrophes():
    # French elision: the text is expected to split right after the
    # apostrophe. The apostrophe stays attached to the first token only when
    # include_punctuation=True; otherwise it is dropped.
    # Each case is (text, tokens without punctuation, tokens with punctuation).
    elision_cases = [
        ("qu'un", ['qu', 'un'], ["qu'", "un"]),
        ("langues d'oïl", ['langues', "d", 'oïl'], ['langues', "d'", 'oïl']),
        ("l'heure", ['l', 'heure'], ["l'", 'heure']),
    ]
    for text, bare_tokens, punctuated_tokens in elision_cases:
        eq_(tokenize(text, 'fr'), bare_tokens)
        eq_(tokenize(text, 'fr', include_punctuation=True), punctuated_tokens)

    # Capitalized input comes back lowercased, still split at the apostrophe.
    hospital = tokenize("L'Hôpital", 'fr', include_punctuation=True)
    eq_(hospital, ["l'", 'hôpital'])

    # "aujourd'hui" is expected to survive as a single token even though its
    # apostrophe is followed by "h", unlike "l'heure" above.
    today = tokenize("aujourd'hui", 'fr')
    eq_(today, ["aujourd'hui"])

    # English contractions are not split at the apostrophe.
    english = tokenize("This isn't French", 'en')
    eq_(english, ['this', "isn't", 'french'])


def test_catastrophes():
    # Catalan has apostrophe elisions too, plus a mid-word middle dot
    # ("instal·lar") that is expected to stay inside its token.
    sentence = "M'acabo d'instal·lar."

    # Without punctuation: apostrophes and the final period are dropped.
    words_only = tokenize(sentence, 'ca')
    eq_(words_only, ['m', 'acabo', 'd', 'instal·lar'])

    # With punctuation: apostrophes stay on the elided word and the period
    # becomes a token of its own.
    with_punctuation = tokenize(sentence, 'ca', include_punctuation=True)
    eq_(with_punctuation, ["m'", 'acabo', "d'", 'instal·lar', '.'])


def test_alternate_codes():
    # Three-letter language codes should yield the same tokens as the
    # two-letter codes 'fr' and 'ca' do in the tests above.
    for french_code in ('fra', 'fre'):
        eq_(tokenize("qu'un", french_code), ['qu', 'un'])

    catalan_tokens = tokenize("M'acabo d'instal·lar.", 'cat')
    eq_(catalan_tokens, ['m', 'acabo', 'd', 'instal·lar'])
add tests for French apostrophe tokenization 2016-12-05 23:42:16 +00:00			`from nose.tools import eq_, assert_almost_equal`
			`from wordfreq import tokenize, word_frequency`


			`def test_apostrophes():`
Bake the 'h special case into the regex This lets me remove the French-specific code I just put in. 2016-12-06 22:37:35 +00:00			`# Test that we handle apostrophes in French reasonably.`
			`eq_(tokenize("qu'un", 'fr'), ['qu', 'un'])`
			`eq_(tokenize("qu'un", 'fr', include_punctuation=True),`
			`["qu'", "un"])`
			`eq_(tokenize("langues d'oïl", 'fr'),`
			`['langues', "d", 'oïl'])`
			`eq_(tokenize("langues d'oïl", 'fr', include_punctuation=True),`
			`['langues', "d'", 'oïl'])`
			`eq_(tokenize("l'heure", 'fr'),`
			`['l', 'heure'])`
			`eq_(tokenize("l'heure", 'fr', include_punctuation=True),`
			`["l'", 'heure'])`
			`eq_(tokenize("L'Hôpital", 'fr', include_punctuation=True),`
			`["l'", 'hôpital'])`
add a test for "aujourd'hui" 2016-12-06 22:39:40 +00:00			`eq_(tokenize("aujourd'hui", 'fr'), ["aujourd'hui"])`
Bake the 'h special case into the regex This lets me remove the French-specific code I just put in. 2016-12-06 22:37:35 +00:00			`eq_(tokenize("This isn't French", 'en'),`
			`['this', "isn't", 'french'])`
add tests for French apostrophe tokenization 2016-12-05 23:42:16 +00:00
add a specific test in Catalan 2016-12-05 23:48:02 +00:00
Bake the 'h special case into the regex This lets me remove the French-specific code I just put in. 2016-12-06 22:37:35 +00:00			`def test_catastrophes():`
			`# More apostrophes, but this time they're in Catalan, and there's other`
			`# mid-word punctuation going on too.`
add a specific test in Catalan 2016-12-05 23:48:02 +00:00			`eq_(tokenize("M'acabo d'instal·lar.", 'ca'),`
			`['m', 'acabo', 'd', 'instal·lar'])`
			`eq_(tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True),`
			`["m'", 'acabo', "d'", 'instal·lar', '.'])`
Use langcodes when tokenizing again (it no longer connects to a DB) 2017-04-27 19:09:59 +00:00

			`def test_alternate_codes():`
			`# Try over-long language codes for French and Catalan`
			`eq_(tokenize("qu'un", 'fra'), ['qu', 'un'])`
			`eq_(tokenize("qu'un", 'fre'), ['qu', 'un'])`
			`eq_(tokenize("M'acabo d'instal·lar.", 'cat'),`
			`['m', 'acabo', 'd', 'instal·lar'])`