mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
case-fold instead of just lowercasing tokens
Former-commit-id: 638467f600
This commit is contained in:
parent
4c2b766f46
commit
4997d776b9
@ -95,6 +95,12 @@ def test_tokenization():
|
|||||||
# apply.
|
# apply.
|
||||||
eq_(tokenize("can.t", 'en'), ['can', 't'])
|
eq_(tokenize("can.t", 'en'), ['can', 't'])
|
||||||
|
|
||||||
|
|
||||||
|
def test_casefolding():
|
||||||
|
eq_(tokenize('WEISS', 'de'), ['weiss'])
|
||||||
|
eq_(tokenize('weiß', 'de'), ['weiss'])
|
||||||
|
|
||||||
|
|
||||||
def test_phrase_freq():
|
def test_phrase_freq():
|
||||||
plant = word_frequency("plan.t", 'en')
|
plant = word_frequency("plan.t", 'en')
|
||||||
assert_greater(plant, 0)
|
assert_greater(plant, 0)
|
||||||
|
@ -149,7 +149,7 @@ def simple_tokenize(text):
|
|||||||
sequence, but they are if they appear internally. "cats'" is not a token,
|
sequence, but they are if they appear internally. "cats'" is not a token,
|
||||||
but "cat's" is.
|
but "cat's" is.
|
||||||
"""
|
"""
|
||||||
return [token.lower() for token in TOKEN_RE.findall(text)]
|
return [token.casefold() for token in TOKEN_RE.findall(text)]
|
||||||
|
|
||||||
mecab_tokenize = None
|
mecab_tokenize = None
|
||||||
def tokenize(text, lang):
|
def tokenize(text, lang):
|
||||||
|
Loading…
Reference in New Issue
Block a user