mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
Merge pull request #13 from LuminosoInsight/casefold-tokens
Case-fold instead of just lowercasing tokens
This commit is contained in:
commit
95fc0c8e9d
2
setup.py
2
setup.py
@@ -33,7 +33,7 @@ if sys.version_info < (3, 4):
|
||||
|
||||
setup(
|
||||
name="wordfreq",
|
||||
version='1.0b2',
|
||||
version='1.0b3',
|
||||
maintainer='Luminoso Technologies, Inc.',
|
||||
maintainer_email='info@luminoso.com',
|
||||
url='http://github.com/LuminosoInsight/wordfreq/',
|
||||
|
@@ -95,6 +95,12 @@ def test_tokenization():
|
||||
# apply.
|
||||
eq_(tokenize("can.t", 'en'), ['can', 't'])
|
||||
|
||||
|
||||
def test_casefolding():
|
||||
eq_(tokenize('WEISS', 'de'), ['weiss'])
|
||||
eq_(tokenize('weiß', 'de'), ['weiss'])
|
||||
|
||||
|
||||
def test_phrase_freq():
|
||||
plant = word_frequency("plan.t", 'en')
|
||||
assert_greater(plant, 0)
|
||||
@@ -111,7 +117,7 @@ def test_not_really_random():
|
||||
# This not only tests random_ascii_words, it makes sure we didn't end
|
||||
# up with 'eos' as a very common Japanese word
|
||||
eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),
|
||||
'e e e e')
|
||||
'rt rt rt rt')
|
||||
|
||||
|
||||
@raises(ValueError)
|
||||
|
@@ -149,7 +149,7 @@ def simple_tokenize(text):
|
||||
sequence, but they are if they appear internally. "cats'" is not a token,
|
||||
but "cat's" is.
|
||||
"""
|
||||
return [token.lower() for token in TOKEN_RE.findall(text)]
|
||||
return [token.casefold() for token in TOKEN_RE.findall(text)]
|
||||
|
||||
mecab_tokenize = None
|
||||
def tokenize(text, lang):
|
||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue
Block a user