From 3bcb3e84a111ecba5b663ce18697109641b6a185 Mon Sep 17 00:00:00 2001
From: Joshua Chin
Date: Thu, 25 Jun 2015 11:25:51 -0400
Subject: [PATCH] updated tests for emoji splitting

---
 tests/test.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index 91f990a..470d4fe 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -1,6 +1,6 @@
 from wordfreq import (
     word_frequency, available_languages, cB_to_freq, iter_wordlist,
-    top_n_list, random_words, random_ascii_words
+    top_n_list, random_words, random_ascii_words, tokenize
 )
 from nose.tools import (
     eq_, assert_almost_equal, assert_greater, assert_less, raises
@@ -84,12 +84,16 @@ def test_failed_cB_conversion():
 def test_tokenization():
     # We preserve apostrophes within words, so "can't" is a single word in the
     # data, while the fake word "plan't" can't be found.
-    assert_greater(word_frequency("can't", 'en'), 0)
-    eq_(word_frequency("plan't", 'en'), 0)
+    eq_(tokenize("can't", 'en'), ["can't"])
+    eq_(tokenize("plan't", 'en'), ["plan't"])
+
+    eq_(tokenize('😂test', 'en'), ['😂', 'test'])
 
     # We do split at other punctuation, causing the word-combining rule to
     # apply.
-    assert_greater(word_frequency("can.t", 'en'), 0)
+    eq_(tokenize("can.t", 'en'), ['can', 't'])
+
+def test_phrase_freq():
     plant = word_frequency("plan.t", 'en')
     assert_greater(plant, 0)
     assert_less(plant, word_frequency('plan', 'en'))
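
Note: the following is a minimal interactive sketch of the behavior these tests pin down, not part of the patch. The expected return values are taken directly from the assertions above and assume the wordfreq version in this PR.

# Sketch of the tokenization behavior exercised by the updated tests.
from wordfreq import tokenize, word_frequency

# Apostrophes are preserved inside words, so "can't" stays a single token.
print(tokenize("can't", 'en'))    # ["can't"]

# An emoji is split off from adjacent text into its own token.
print(tokenize('😂test', 'en'))   # ['😂', 'test']

# Other punctuation splits the word, so "can.t" becomes two tokens.
print(tokenize("can.t", 'en'))    # ['can', 't']

# Phrase frequency: "plan.t" is treated as the phrase "plan t", giving a
# nonzero frequency that is lower than the frequency of "plan" alone.
print(0 < word_frequency("plan.t", 'en') < word_frequency('plan', 'en'))  # True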