diff --git a/MANIFEST.in b/MANIFEST.in
index 4f20a26..012f4ca 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,2 +1,3 @@
 recursive-include wordfreq/data *.gz
 include README.md
+recursive-include wordfreq/data *.txt
diff --git a/tests/test.py b/tests/test.py
index 91f990a..470d4fe 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -1,6 +1,6 @@
 from wordfreq import (
     word_frequency, available_languages, cB_to_freq, iter_wordlist,
-    top_n_list, random_words, random_ascii_words
+    top_n_list, random_words, random_ascii_words, tokenize
 )
 from nose.tools import (
     eq_, assert_almost_equal, assert_greater, assert_less, raises
 )
@@ -84,12 +84,16 @@ def test_failed_cB_conversion():
 def test_tokenization():
     # We preserve apostrophes within words, so "can't" is a single word in the
     # data, while the fake word "plan't" can't be found.
-    assert_greater(word_frequency("can't", 'en'), 0)
-    eq_(word_frequency("plan't", 'en'), 0)
+    eq_(tokenize("can't", 'en'), ["can't"])
+    eq_(tokenize("plan't", 'en'), ["plan't"])
+
+    eq_(tokenize('😂test', 'en'), ['😂', 'test'])
 
     # We do split at other punctuation, causing the word-combining rule to
     # apply.
-    assert_greater(word_frequency("can.t", 'en'), 0)
+    eq_(tokenize("can.t", 'en'), ['can', 't'])
+
+def test_phrase_freq():
     plant = word_frequency("plan.t", 'en')
     assert_greater(plant, 0)
     assert_less(plant, word_frequency('plan', 'en'))
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index 87e82f4..f861c89 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -1,6 +1,9 @@
 from pkg_resources import resource_filename
 from functools import lru_cache
+import unicodedata
+from ftfy import chardata
 import langcodes
+import itertools
 import msgpack
 import re
 import gzip
@@ -9,14 +12,128 @@ import random
 import logging
 logger = logging.getLogger(__name__)
 
-
-NON_PUNCT_RANGE = '[0-9A-Za-zª²³¹º\xc0-\u1fff\u2070-\u2fff\u301f-\ufeff0-9A-Za-z\uff66-\U0002ffff]'
-NON_PUNCT_RE = re.compile(NON_PUNCT_RANGE)
-TOKEN_RE = re.compile("{0}+(?:'{0}+)*".format(NON_PUNCT_RANGE))
 
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
 CACHE_SIZE = 100000
 
+def _emoji_char_class():
+    """
+    Build a regex character class (like "[a-cv-z]") matching the characters
+    we consider emoji. These are the characters that ftfy's chardata module
+    puts in class '3', excluding code points below U+2600 and the Unicode
+    replacement character U+FFFD.
+    """
+    ranges = []
+    for i, c in enumerate(chardata.CHAR_CLASS_STRING):
+        if c == '3' and i >= 0x2600 and i != 0xfffd:
+            if ranges and i == ranges[-1][1] + 1:
+                ranges[-1][1] = i
+            else:
+                ranges.append([i, i])
+    return '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges)
+
+EMOJI_RANGE = _emoji_char_class()
+
+def _non_punct_class():
+    """
+    Builds a regex that matches anything that is not one of the following
+    classes:
+    - P: punctuation
+    - S: symbols
+    - Z: separators
+    - C: control characters
+    This effectively treats symbols, including emoji, as punctuation; callers
+    that want to handle emoji separately should filter them out first.
+ """ + non_punct_file = DATA_PATH / 'non_punct.txt' + try: + with non_punct_file.open() as file: + return file.read() + except FileNotFoundError: + + out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC') + + with non_punct_file.open(mode='w') as file: + file.write(out) + + return out + +def _combining_mark_class(): + """ + Builds a regex that matches anything that is a combining mark + """ + _combining_mark_file = DATA_PATH / 'combining_mark.txt' + try: + with _combining_mark_file.open() as file: + return file.read() + except FileNotFoundError: + + out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M') + + with _combining_mark_file.open(mode='w') as file: + file.write(out) + + return out + + +def func_to_ranges(accept): + """ + Converts a function that accepts a single unicode character into a list of + ranges. Unassigned unicode are automatically accepted. + """ + ranges = [] + start = None + for x in range(0x110000): + cat = unicodedata.category(chr(x)) + if cat == 'Cn' or accept(chr(x)): + if start is None: + start = x + else: + if start is not None: + ranges.append((start, x-1)) + start = None + + if start is not None: + ranges.append((start, x)) + + return ranges + +unassigned_ranges = None + +def func_to_regex(accept): + """ + Converts a function that accepts a single unicode character into a regex. + Unassigned unicode characters are treated like their neighbors. + """ + ranges = [] + start = None + for x in range(0x110000): + cat = unicodedata.category(chr(x)) + if cat == 'Cn' or accept(chr(x)): + if start is None: + start = x + else: + if start is not None: + ranges.append((start, x-1)) + start = None + + if start is not None: + ranges.append((start, x)) + + global unassigned_ranges + if unassigned_ranges is None: + unassigned_ranges = set(func_to_ranges(lambda _: False)) + + ranges = [range for range in ranges if range not in unassigned_ranges] + + return '[%s]' % ''.join("%s-%s" % (chr(start), chr(end)) + for start, end in ranges) + + +COMBINING_MARK_RE = re.compile(_combining_mark_class()) +NON_PUNCT_RANGE = _non_punct_class() + +TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE)) def simple_tokenize(text): """ @@ -34,21 +151,29 @@ def simple_tokenize(text): """ return [token.lower() for token in TOKEN_RE.findall(text)] - +mecab_tokenize = None def tokenize(text, lang): """ Tokenize this text in a way that's straightforward but appropriate for the language. So far, this means that Japanese is handled by mecab_tokenize, and - everything else is handled by simple_tokenize. + everything else is handled by simple_tokenize. Additionally, Arabic commas + and combining marks are removed. Strings that are looked up in wordfreq will be run through this function first, so that they can be expected to match the data. 
""" if lang == 'ja': - from wordfreq.mecab import mecab_tokenize + global mecab_tokenize + if mecab_tokenize is None: + from wordfreq.mecab import mecab_tokenize return mecab_tokenize(text) + elif lang == 'ar': + tokens = simple_tokenize(text) + tokens = [token.replace('ـ', '') for token in tokens] # remove tatweel + tokens = [COMBINING_MARK_RE.sub('', token) for token in tokens] + return [token for token in tokens if token] # remove empty strings else: return simple_tokenize(text) diff --git a/wordfreq/data/combining_mark.txt b/wordfreq/data/combining_mark.txt new file mode 100644 index 0000000..ac83af1 --- /dev/null +++ b/wordfreq/data/combining_mark.txt @@ -0,0 +1 @@ +[̀-ͯ҃-҉֐-ֽֿ-ֿׁ-ׂׄ-ׇׅ-׏ؐ-ًؚ-ٰٟ-ٰۖ-ۜ۟-ۤۧ-۪ۨ-ܑۭ-ܑܰ-݌ަ-ް߫-߳ࠖ-࠙ࠛ-ࠣࠥ-ࠧࠩ-࠯࡙-࡝ࢭ-ःऺ-़ा-ॏ॑-ॗॢ-ॣঀ-঄঺-়া-্৏-৛ৢ-৥ৼ-਄਺-੘ੰ-ੱੵ-઄઺-઼ા-૏ૢ-૥૲-଄଺-଼ା-୛ୢ-୥୸-ஂ஺-௏௑-௥௻-ఄా-౗ౢ-౥ಀ-಄಺-಼ಾ-ೝೢ-೥ೳ-ഄാ-്൏-ൟൢ-൥඀-඄෇-ෳั-ัิ-฾็-๎ັ-ັິ-ຼ໇-໏༘-༙༵-༵༷-༹༷-༹༾-༿཭-྄྆-྇ྍ-྽࿆-࿆ါ-ှၖ-ၙၞ-ၠၢ-ၤၧ-ၭၱ-ၴႂ-ႍႏ-ႏႚ-ႝ፛-፟ᜒ-ᜟᜲ-᜴ᝒ-᝟᝱-᝿឴-៓៝-៟᠋-᠍ᢩ-ᢩᤝ-᤿᦬-ᧀᧈ-᧏ᨗ-᨝ᩕ-᩿᪮-ᬄ᬴-᭄᭫-᭳᭽-ᮂᮡ-ᮭ᯦-᯻ᰤ-᰺᳈-᳔᳒-᳨᳭-᳭ᳲ-᳴᷀-᷿₻-⃿⳯-⳱⵱-⵿⷟-〪ⷿ-〯゗-゚꙯-꙲ꙴ-꙽Ꚙ-ꚟ꛰-꛱ꠂ-ꠂ꠆-꠆ꠋ-ꠋꠣ-ꠧ꡸-ꢁꢴ-꣍꣚-꣱ꤦ-꤭ꥇ-꥞꥽-ꦃ꦳-꧀ꨩ-꨿ꩃ-ꩃꩌ-꩏ꩻ-ꩿꪰ-ꪰꪲ-ꪴꪷ-ꪸꪾ-꪿꫁-꫁ꫫ-ꫯꫵ-꬀ꯣ-ꯪ꯬-꯯ﬞ-ﬞ﷾-️︚-𐇽︯-𐉿𐨁-𐨏𐨴-𐨿𐹿-𑀂𑀸-𑁆𑁰-𑂂𑂰-𑂺𑃺-𑄂𑄧-𑄵𑅄-𑆂𑆳-𑇀𑚫-𑚿𖽑-𖾒𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄󠂀-󯿿] \ No newline at end of file diff --git a/wordfreq/data/non_punct.txt b/wordfreq/data/non_punct.txt new file mode 100644 index 0000000..85af3b5 --- /dev/null +++ b/wordfreq/data/non_punct.txt @@ -0,0 +1 @@ +[0-9A-Za-zª-ª²-³µ-µ¹-º¼-¾À-ÖØ-öø-ˁˆ-ˑˠ-ˤˬ-ˬˮ-ˮ̀-ʹͶ-ͽΆ-ΆΈ-ϵϷ-ҁ҃-ՙՠ-ֈ֐-ֽֿ-ֿׁ-ׂׄ-ׇׅ-ײؐ-ؚؠ-٩ٮ-ۓە-ۜ۟-۪ۨ-ۼۿ-ۿܐ-ߵߺ-࠯࠿-࡝࡟-ॣ०-९ॱ-ৱ৴-৹ৼ-૯૲-୯ୱ-௲௻-౾ಀ-൸ൺ-ෳ෵-฾เ-๎๐-๙๜-ༀ༘-༙༠-༳༵-༵༷-༹༷-༹༾-྄྆-྽࿆-࿆࿛-၉ၐ-ႝႠ-ჺჼ-፟፩-ᎏ᎚-᏿ᐁ-ᙬᙯ-ᙿᚁ-ᚚ᚝-ᛪᛮ-᜴᜷-៓ៗ-ៗៜ-៿᠋-᠍᠏-᤿᥆-᧝ᨀ-᨝ᨠ-᪟ᪧ-ᪧ᪮-᭙᭫-᭳᭽-᯻ᰀ-᰺᱀-ᱽ᳈-᳔᳒-ᾼι-ιῂ-ῌῐ-῜ῠ-Ῥ῰-ῼ⁰-⁹ⁿ-₉₏-₟₻-⃿ℂ-ℂℇ-ℇℊ-ℓℕ-ℕℙ-ℝℤ-ℤΩ-Ωℨ-ℨK-ℭℯ-ℹℼ-ℿⅅ-ⅉⅎ-ⅎ⅐-↏⑋-⒛⓪-⓿❶-➓⭚-ⳤⳫ-⳸⳽-⳽ⴀ-ⵯ⵱-ⷿⸯ-ⸯ々-〇〡-〯〱-〵〸-〼぀-゚ゝ-ゟァ-ヺー-㆏㆒-㆕ㆠ-ㆿ㇤-ㇿ㈟-㈩㉈-㉏㉑-㉟㊀-㊉㊱-㊿㐀-䶿一-꒏꓇-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱ꜗ-ꜟꜢ-ꞈꞋ-ꠧ꠬-꠵꠺-ꡳ꡸-꣍꣐-ꣷꣻ-꤭ꤰ-꥞ꥠ-꧀꧎-꧝ꧠ-꩛ꩠ-ꩶꩺ-ꫝꫠ-ꫯꫲ-ꯪ꯬-퟿豈-ﬨשׁ-ﮱ﯂-ﴽ﵀-ﷻ﷾-️︚-︯﹬-﻾0-9A-Za-zヲ-￟￾-𐃿𐄃-𐄶𐅀-𐅸𐆊-𐆏𐇽-𐎞𐎠-𐏏𐏑-𐡖𐡘-𐤞𐤠-𐤾𐥀-𐩏𐩙-𐩾𐪀-𐬸𐭀-𑁆𑁎-𑂺𑃂-𑄿𑅄-𑇄𑇉-𒑯𒑴-𜿿𝅥-𝅩𝅭-𝅲𝅻-𝆂𝆅-𝆋𝆪-𝆭𝉂-𝉄𝍗-𝛀𝛂-𝛚𝛜-𝛺𝛼-𝜔𝜖-𝜴𝜶-𝝎𝝐-𝝮𝝰-𝞈𝞊-𝞨𝞪-𝟂𝟄-𞻯🃠-🄏🝴-󠀀󠂀-󯿿] \ No newline at end of file