Merge pull request #13 from LuminosoInsight/casefold-tokens

Case-fold instead of just lowercasing tokens
commit 95fc0c8e9d
Author: Joshua Chin
Date:   2015-07-01 11:34:02 -04:00
18 changed files with 9 additions and 3 deletions
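For context: str.casefold() applies Unicode full case folding, which handles characters that str.lower() leaves unchanged, most notably the German 'ß'. A quick Python 3 sketch (illustration only, not part of the diff):

    print('WEISS'.lower())     # 'weiss'
    print('weiß'.lower())      # 'weiß'  (lower() leaves 'ß' alone)
    print('WEISS'.casefold())  # 'weiss'
    print('weiß'.casefold())   # 'weiss' (casefold() expands 'ß' to 'ss')

Folding both spellings to 'weiss' lets them count as the same token, which is exactly what the new test below checks.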

setup.py

@@ -33,7 +33,7 @@ if sys.version_info < (3, 4):
 
 setup(
     name="wordfreq",
-    version='1.0b2',
+    version='1.0b3',
     maintainer='Luminoso Technologies, Inc.',
     maintainer_email='info@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',

tests/test.py

@@ -95,6 +95,12 @@ def test_tokenization():
     # apply.
     eq_(tokenize("can.t", 'en'), ['can', 't'])
 
+
+def test_casefolding():
+    eq_(tokenize('WEISS', 'de'), ['weiss'])
+    eq_(tokenize('weiß', 'de'), ['weiss'])
+
+
 def test_phrase_freq():
     plant = word_frequency("plan.t", 'en')
     assert_greater(plant, 0)

@@ -111,7 +117,7 @@ def test_not_really_random():
     # This not only tests random_ascii_words, it makes sure we didn't end
     # up with 'eos' as a very common Japanese word
     eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),
-        'e e e e')
+        'rt rt rt rt')
 
 
 @raises(ValueError)
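A note on the changed expectation above: with bits_per_word=0 there is no entropy to draw on, so random_ascii_words can only ever return the single most common ASCII-safe Japanese token, repeated nwords times. The expected string presumably changes from 'e e e e' to 'rt rt rt rt' because the wordlists rebuilt with case-folding (the binary files in this diff) have a different token at the top. A sketch of the deterministic call:

    from wordfreq import random_ascii_words

    # Zero bits of randomness per word: always the most common eligible token.
    print(random_ascii_words(nwords=4, lang='ja', bits_per_word=0))  # 'rt rt rt rt'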

wordfreq/tokens.py

@@ -149,7 +149,7 @@ def simple_tokenize(text):
     sequence, but they are if they appear internally. "cats'" is not a token,
     but "cat's" is.
     """
-    return [token.lower() for token in TOKEN_RE.findall(text)]
+    return [token.casefold() for token in TOKEN_RE.findall(text)]
 
 mecab_tokenize = None
 def tokenize(text, lang):
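To see the change end to end, here is a minimal runnable sketch of simple_tokenize. The regex below is a simplified stand-in for wordfreq's actual TOKEN_RE (an assumption for illustration; the real pattern is more careful about apostrophes and Unicode categories); only the .casefold() call reflects this diff:

    import re

    # Simplified stand-in for wordfreq's TOKEN_RE (illustrative assumption).
    TOKEN_RE = re.compile(r"\w+(?:'\w+)?")

    def simple_tokenize(text):
        # Case-fold rather than lowercase, so 'WEISS' and 'weiß'
        # both normalize to 'weiss'.
        return [token.casefold() for token in TOKEN_RE.findall(text)]

    print(simple_tokenize('WEISS weiß'))  # ['weiss', 'weiss']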

15 binary files not shown.