mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
case-fold instead of just lowercasing tokens
Former-commit-id: 638467f600
This commit is contained in:
parent
4c2b766f46
commit
4997d776b9
@ -95,6 +95,12 @@ def test_tokenization():
|
||||
# apply.
|
||||
eq_(tokenize("can.t", 'en'), ['can', 't'])
|
||||
|
||||
|
||||
def test_casefolding():
    """Check that German 'WEISS' and 'weiß' both tokenize to ['weiss']."""
    expected = ['weiss']
    # Same inputs, in the same order, as the two original assertions.
    for text in ('WEISS', 'weiß'):
        eq_(tokenize(text, 'de'), expected)
|
||||
|
||||
|
||||
def test_phrase_freq():
|
||||
plant = word_frequency("plan.t", 'en')
|
||||
assert_greater(plant, 0)
|
||||
|
@ -149,7 +149,7 @@ def simple_tokenize(text):
|
||||
sequence, but they are if they appear internally. "cats'" is not a token,
|
||||
but "cat's" is.
|
||||
"""
|
||||
return [token.lower() for token in TOKEN_RE.findall(text)]
|
||||
return [token.casefold() for token in TOKEN_RE.findall(text)]
|
||||
|
||||
mecab_tokenize = None
|
||||
def tokenize(text, lang):
|
||||
|
Loading…
Reference in New Issue
Block a user