From 4997d776b97f169c1cbf6547590673f25facfe34 Mon Sep 17 00:00:00 2001
From: Robyn Speer <rspeer@luminoso.com>
Date: Tue, 30 Jun 2015 15:14:02 -0400
Subject: [PATCH] case-fold instead of just lowercasing tokens

Former-commit-id: 638467f60022c6933a9a2fb8ff1280d39e9a3d70
---
 tests/test.py        | 6 ++++++
 wordfreq/__init__.py | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/test.py b/tests/test.py
index 2d11e35..397ce97 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -95,6 +95,12 @@ def test_tokenization():
     # apply.
     eq_(tokenize("can.t", 'en'), ['can', 't'])
 
+
+def test_casefolding():  # tokenize() output is case-folded, not just lowercased
+    eq_(tokenize('WEISS', 'de'), ['weiss'])  # uppercase folds to lowercase
+    eq_(tokenize('weiß', 'de'), ['weiss'])  # lower() would leave 'ß' as is
+
+
 def test_phrase_freq():
     plant = word_frequency("plan.t", 'en')
     assert_greater(plant, 0)
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index f861c89..7f441ca 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -149,7 +149,7 @@ def simple_tokenize(text):
     sequence, but they are if they appear internally. "cats'" is not a token,
     but "cat's" is.
     """
-    return [token.lower() for token in TOKEN_RE.findall(text)]
+    return [token.casefold() for token in TOKEN_RE.findall(text)]
 
 mecab_tokenize = None
 def tokenize(text, lang):