mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-24 18:01:38 +00:00
Merge pull request #13 from LuminosoInsight/casefold-tokens
Case-fold instead of just lowercasing tokens
Former-commit-id: 95fc0c8e9d
This commit is contained in:
commit
7ac8c6be59
2
setup.py
2
setup.py
@@ -33,7 +33,7 @@ if sys.version_info < (3, 4):
|
||||
|
||||
setup(
|
||||
name="wordfreq",
|
||||
version='1.0b2',
|
||||
version='1.0b3',
|
||||
maintainer='Luminoso Technologies, Inc.',
|
||||
maintainer_email='info@luminoso.com',
|
||||
url='http://github.com/LuminosoInsight/wordfreq/',
|
||||
|
@@ -95,6 +95,12 @@ def test_tokenization():
|
||||
# apply.
|
||||
eq_(tokenize("can.t", 'en'), ['can', 't'])
|
||||
|
||||
|
||||
def test_casefolding():
|
||||
eq_(tokenize('WEISS', 'de'), ['weiss'])
|
||||
eq_(tokenize('weiß', 'de'), ['weiss'])
|
||||
|
||||
|
||||
def test_phrase_freq():
|
||||
plant = word_frequency("plan.t", 'en')
|
||||
assert_greater(plant, 0)
|
||||
@@ -111,7 +117,7 @@ def test_not_really_random():
|
||||
# This not only tests random_ascii_words, it makes sure we didn't end
|
||||
# up with 'eos' as a very common Japanese word
|
||||
eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),
|
||||
'e e e e')
|
||||
'rt rt rt rt')
|
||||
|
||||
|
||||
@raises(ValueError)
|
||||
|
@@ -149,7 +149,7 @@ def simple_tokenize(text):
|
||||
sequence, but they are if they appear internally. "cats'" is not a token,
|
||||
but "cat's" is.
|
||||
"""
|
||||
return [token.lower() for token in TOKEN_RE.findall(text)]
|
||||
return [token.casefold() for token in TOKEN_RE.findall(text)]
|
||||
|
||||
mecab_tokenize = None
|
||||
def tokenize(text, lang):
|
||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue
Block a user