commit 3bcb3e84a1
parent 35a80e5f50

updated tests for emoji splitting
@@ -1,6 +1,6 @@
 from wordfreq import (
     word_frequency, available_languages, cB_to_freq, iter_wordlist,
-    top_n_list, random_words, random_ascii_words
+    top_n_list, random_words, random_ascii_words, tokenize
 )
 from nose.tools import (
     eq_, assert_almost_equal, assert_greater, assert_less, raises
@@ -84,12 +84,16 @@ def test_failed_cB_conversion():
 def test_tokenization():
     # We preserve apostrophes within words, so "can't" is a single word in the
     # data, while the fake word "plan't" can't be found.
-    assert_greater(word_frequency("can't", 'en'), 0)
-    eq_(word_frequency("plan't", 'en'), 0)
+    eq_(tokenize("can't", 'en'), ["can't"])
+    eq_(tokenize("plan't", 'en'), ["plan't"])
 
+    eq_(tokenize('😂test', 'en'), ['😂', 'test'])
+
     # We do split at other punctuation, causing the word-combining rule to
     # apply.
-    assert_greater(word_frequency("can.t", 'en'), 0)
+    eq_(tokenize("can.t", 'en'), ['can', 't'])
+
+def test_phrase_freq():
     plant = word_frequency("plan.t", 'en')
     assert_greater(plant, 0)
     assert_less(plant, word_frequency('plan', 'en'))
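
For anyone re-running these assertions today: nose is unmaintained, so below is a plain-assert (pytest-style) sketch of the same checks. It is a sketch under assumptions, not part of the commit: it assumes a current wordfreq release, where tokenizer details (especially emoji handling) may differ from the 2015 behavior tested here. The phrase test exercises wordfreq's word-combining rule, which combines the pieces' frequencies roughly as the reciprocal of the sum of their reciprocals, so "plan.t" necessarily comes out rarer than "plan" alone.

    # Sketch of the updated tests with plain asserts, since nose is no
    # longer maintained. Assumes a current wordfreq release; exact token
    # output for the emoji case may vary across versions.
    from wordfreq import tokenize, word_frequency


    def test_tokenization():
        # Apostrophes are kept inside words: "can't" is one token.
        assert tokenize("can't", 'en') == ["can't"]
        assert tokenize("plan't", 'en') == ["plan't"]

        # Emoji are split off from adjacent text.
        assert tokenize('😂test', 'en') == ['😂', 'test']

        # Other punctuation splits the text into separate tokens.
        assert tokenize('can.t', 'en') == ['can', 't']


    def test_phrase_freq():
        # A multi-token phrase gets a combined frequency that is positive
        # but smaller than the frequency of either of its parts.
        plant = word_frequency('plan.t', 'en')
        assert plant > 0
        assert plant < word_frequency('plan', 'en')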