diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py
index 7f1ce9f..4391f3a 100644
--- a/scripts/gen_regex.py
+++ b/scripts/gen_regex.py
@@ -1,5 +1,22 @@
 import argparse
 import unicodedata
+import chardata
+
+def _emoji_char_class():
+    """
+    Build a regex character set (like "[a-cv-z]") matching characters we
+    consider emoji: indices i >= 0x2600 (except 0xfffd) whose entry in
+    chardata.CHAR_CLASS_STRING is '3', with consecutive runs merged into "a-b"
+    ranges.  Only the character set is returned, not a full substitution regex.
+    """
+    ranges = []  # list of [first, last] index pairs, in increasing order
+    for i, c in enumerate(chardata.CHAR_CLASS_STRING):
+        if c == '3' and i >= 0x2600 and i != 0xfffd:
+            if ranges and i == ranges[-1][1] + 1:  # adjacent: extend last range
+                ranges[-1][1] = i
+            else:
+                ranges.append([i, i])
+    return '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges)
 
 def func_to_regex(accept):
     """
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index 9697238..a3d0cd0 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -87,6 +87,7 @@ def tokenize(text, lang):
     """
     if lang == 'ja':
         from wordfreq.mecab import mecab_tokenize
+        return mecab_tokenize(text)
 
     if lang == 'ar':
         text = COMBINING_MARK_RE.sub('', text.replace('ـ', ''))