case-fold instead of just lowercasing tokens

2024-12-23 17:31:41 +00:00 · 2015-06-30 15:14:02 -04:00 · 2015-06-30 15:14:02 -04:00 · 638467f600
commit 638467f600
parent 053f372ebc
2 changed files with 7 additions and 1 deletion
--- a/tests/test.py
+++ b/tests/test.py
@@ -95,6 +95,12 @@ def test_tokenization():
    # apply.
    eq_(tokenize("can.t", 'en'), ['can', 't'])

+
+def test_casefolding():
+    eq_(tokenize('WEISS', 'de'), ['weiss'])
+    eq_(tokenize('weiß', 'de'), ['weiss'])
+
+
 def test_phrase_freq():
    plant = word_frequency("plan.t", 'en')
    assert_greater(plant, 0)
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -149,7 +149,7 @@ def simple_tokenize(text):
    sequence, but they are if they appear internally. "cats'" is not a token,
    but "cat's" is.
    """
-    return [token.lower() for token in TOKEN_RE.findall(text)]
+    return [token.casefold() for token in TOKEN_RE.findall(text)]

 mecab_tokenize = None
 def tokenize(text, lang):