Merge pull request #13 from LuminosoInsight/casefold-tokens

Case-fold instead of just lowercasing tokens
commit 95fc0c8e9d
Author: Joshua Chin
Date:   2015-07-01 11:34:02 -04:00
18 changed files with 9 additions and 3 deletions
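For context: str.casefold() applies Unicode full case folding, which handles characters that str.lower() leaves unchanged, most notably the German 'ß'. A quick Python 3 sketch (illustration only, not part of the diff):

    print('WEISS'.lower())     # 'weiss'
    print('weiß'.lower())      # 'weiß'  (lower() leaves 'ß' alone)
    print('WEISS'.casefold())  # 'weiss'
    print('weiß'.casefold())   # 'weiss' (casefold() expands 'ß' to 'ss')

Folding both spellings to 'weiss' lets them count as the same token, which is exactly what the new test below checks.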

setup.py

@@ -33,7 +33,7 @@ if sys.version_info < (3, 4):
 
 setup(
     name="wordfreq",
-    version='1.0b2',
+    version='1.0b3',
     maintainer='Luminoso Technologies, Inc.',
     maintainer_email='info@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',

tests/test.py

@@ -95,6 +95,12 @@ def test_tokenization():
     # apply.
     eq_(tokenize("can.t", 'en'), ['can', 't'])
 
+
+def test_casefolding():
+    eq_(tokenize('WEISS', 'de'), ['weiss'])
+    eq_(tokenize('weiß', 'de'), ['weiss'])
+
+
 def test_phrase_freq():
     plant = word_frequency("plan.t", 'en')
     assert_greater(plant, 0)

@@ -111,7 +117,7 @@ def test_not_really_random():
     # This not only tests random_ascii_words, it makes sure we didn't end
     # up with 'eos' as a very common Japanese word
     eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),
-        'e e e e')
+        'rt rt rt rt')
 
 
 @raises(ValueError)
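A note on the changed expectation above: with bits_per_word=0 there is no entropy to draw on, so random_ascii_words can only ever return the single most common ASCII-safe Japanese token, repeated nwords times. The expected string presumably changes from 'e e e e' to 'rt rt rt rt' because the wordlists rebuilt with case-folding (the binary files in this diff) have a different token at the top. A sketch of the deterministic call:

    from wordfreq import random_ascii_words

    # Zero bits of randomness per word: always the most common eligible token.
    print(random_ascii_words(nwords=4, lang='ja', bits_per_word=0))  # 'rt rt rt rt'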

wordfreq/tokens.py

@@ -149,7 +149,7 @@ def simple_tokenize(text):
     sequence, but they are if they appear internally. "cats'" is not a token,
     but "cat's" is.
     """
-    return [token.lower() for token in TOKEN_RE.findall(text)]
+    return [token.casefold() for token in TOKEN_RE.findall(text)]
 
 mecab_tokenize = None
 def tokenize(text, lang):
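To see the change end to end, here is a minimal runnable sketch of simple_tokenize. The regex below is a simplified stand-in for wordfreq's actual TOKEN_RE (an assumption for illustration; the real pattern is more careful about apostrophes and Unicode categories); only the .casefold() call reflects this diff:

    import re

    # Simplified stand-in for wordfreq's TOKEN_RE (illustrative assumption).
    TOKEN_RE = re.compile(r"\w+(?:'\w+)?")

    def simple_tokenize(text):
        # Case-fold rather than lowercase, so 'WEISS' and 'weiß'
        # both normalize to 'weiss'.
        return [token.casefold() for token in TOKEN_RE.findall(text)]

    print(simple_tokenize('WEISS weiß'))  # ['weiss', 'weiss']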

15 binary files not shown.