case-fold instead of just lowercasing tokens

This commit is contained in:
Rob Speer 2015-06-30 15:14:02 -04:00
parent 053f372ebc
commit 638467f600
2 changed files with 7 additions and 1 deletions

View File

@@ -95,6 +95,12 @@ def test_tokenization():
# apply.
eq_(tokenize("can.t", 'en'), ['can', 't'])
def test_casefolding():
    # Case-folding goes further than lowercasing: the German eszett is
    # expanded, so the all-caps spelling and the 'ß' spelling must both
    # tokenize to the same single token.
    for spelling in ('WEISS', 'weiß'):
        eq_(tokenize(spelling, 'de'), ['weiss'])
def test_phrase_freq():
plant = word_frequency("plan.t", 'en')
assert_greater(plant, 0)

View File

@@ -149,7 +149,7 @@ def simple_tokenize(text):
sequence, but they are if they appear internally. "cats'" is not a token,
but "cat's" is.
"""
return [token.lower() for token in TOKEN_RE.findall(text)]
return [token.casefold() for token in TOKEN_RE.findall(text)]
# Module-level slot that starts out empty. Presumably it is rebound to the
# real MeCab-based tokenizer the first time it is needed -- TODO confirm
# against the body of tokenize().
mecab_tokenize = None
def tokenize(text, lang):