From 3bcb3e84a111ecba5b663ce18697109641b6a185 Mon Sep 17 00:00:00 2001
From: Joshua Chin
Date: Thu, 25 Jun 2015 11:25:51 -0400
Subject: [PATCH] updated tests for emoji splitting

---
 tests/test.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index 91f990a..470d4fe 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -1,6 +1,6 @@
 from wordfreq import (
     word_frequency, available_languages, cB_to_freq, iter_wordlist,
-    top_n_list, random_words, random_ascii_words
+    top_n_list, random_words, random_ascii_words, tokenize
 )
 from nose.tools import (
     eq_, assert_almost_equal, assert_greater, assert_less, raises
@@ -84,12 +84,16 @@ def test_failed_cB_conversion():
 def test_tokenization():
     # We preserve apostrophes within words, so "can't" is a single word in the
     # data, while the fake word "plan't" can't be found.
-    assert_greater(word_frequency("can't", 'en'), 0)
-    eq_(word_frequency("plan't", 'en'), 0)
+    eq_(tokenize("can't", 'en'), ["can't"])
+    eq_(tokenize("plan't", 'en'), ["plan't"])
+
+    eq_(tokenize('😂test', 'en'), ['😂', 'test'])
 
     # We do split at other punctuation, causing the word-combining rule to
     # apply.
-    assert_greater(word_frequency("can.t", 'en'), 0)
+    eq_(tokenize("can.t", 'en'), ['can', 't'])
+
+def test_phrase_freq():
     plant = word_frequency("plan.t", 'en')
     assert_greater(plant, 0)
     assert_less(plant, word_frequency('plan', 'en'))
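
Note: the following is a minimal interactive sketch of the behavior these tests pin down, not part of the patch. The expected return values are taken directly from the assertions above and assume the wordfreq version in this PR.

# Sketch of the tokenization behavior exercised by the updated tests.
from wordfreq import tokenize, word_frequency

# Apostrophes are preserved inside words, so "can't" stays a single token.
print(tokenize("can't", 'en'))    # ["can't"]

# An emoji is split off from adjacent text into its own token.
print(tokenize('😂test', 'en'))   # ['😂', 'test']

# Other punctuation splits the word, so "can.t" becomes two tokens.
print(tokenize("can.t", 'en'))    # ['can', 't']

# Phrase frequency: "plan.t" is treated as the phrase "plan t", giving a
# nonzero frequency that is lower than the frequency of "plan" alone.
print(0 < word_frequency("plan.t", 'en') < word_frequency('plan', 'en'))  # True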