From 4997d776b97f169c1cbf6547590673f25facfe34 Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Tue, 30 Jun 2015 15:14:02 -0400
Subject: [PATCH] case-fold instead of just lowercasing tokens

Former-commit-id: 638467f60022c6933a9a2fb8ff1280d39e9a3d70
---
 tests/test.py        | 6 ++++++
 wordfreq/__init__.py | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/test.py b/tests/test.py
index 2d11e35..397ce97 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -95,6 +95,12 @@ def test_tokenization():
     # apply.
     eq_(tokenize("can.t", 'en'), ['can', 't'])
 
+
+def test_casefolding():
+    eq_(tokenize('WEISS', 'de'), ['weiss'])
+    eq_(tokenize('weiß', 'de'), ['weiss'])
+
+
 def test_phrase_freq():
     plant = word_frequency("plan.t", 'en')
     assert_greater(plant, 0)
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index f861c89..7f441ca 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -149,7 +149,7 @@ def simple_tokenize(text):
     sequence, but they are if they appear internally. "cats'" is not a token,
     but "cat's" is.
     """
-    return [token.lower() for token in TOKEN_RE.findall(text)]
+    return [token.casefold() for token in TOKEN_RE.findall(text)]
 
 mecab_tokenize = None
 def tokenize(text, lang):