diff --git a/tests/test.py b/tests/test.py
index 2d11e35..397ce97 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -95,6 +95,12 @@ def test_tokenization():
     # apply.
     eq_(tokenize("can.t", 'en'), ['can', 't'])
 
+
+def test_casefolding():
+    eq_(tokenize('WEISS', 'de'), ['weiss'])
+    eq_(tokenize('weiß', 'de'), ['weiss'])
+
+
 def test_phrase_freq():
     plant = word_frequency("plan.t", 'en')
     assert_greater(plant, 0)
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index f861c89..7f441ca 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -149,7 +149,7 @@ def simple_tokenize(text):
     sequence, but they are if they appear internally. "cats'" is not a
     token, but "cat's" is.
     """
-    return [token.lower() for token in TOKEN_RE.findall(text)]
+    return [token.casefold() for token in TOKEN_RE.findall(text)]
 
 mecab_tokenize = None
 def tokenize(text, lang):
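
Why casefold() rather than lower(): str.lower() performs simple lowercasing, while str.casefold() applies full Unicode case folding, which additionally normalizes characters like the German sharp s ('ß') to 'ss'. A quick illustration of the standard-library behavior that the new test_casefolding test relies on (plain Python, independent of wordfreq):

    >>> 'weiß'.lower()      # simple lowercasing leaves 'ß' unchanged
    'weiß'
    >>> 'weiß'.casefold()   # full case folding maps 'ß' to 'ss'
    'weiss'
    >>> 'WEISS'.casefold() == 'weiß'.casefold()
    True

This is why both tokenize('WEISS', 'de') and tokenize('weiß', 'de') are expected to yield ['weiss']: casefolding gives both spellings the same caseless form, which lower() alone would not.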