From f04ca8fc9e40b5d8bfb1563414fc4a15a8c8edb0 Mon Sep 17 00:00:00 2001
From: Joshua Chin <jchin@luminoso.com>
Date: Tue, 7 Jul 2015 15:43:34 -0400
Subject: [PATCH] updated emoji parser

---
 scripts/gen_regex.py    | 17 ++++++-----------
 wordfreq/data/emoji.txt |  2 +-
 2 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py
index ea50186..1a32ac7 100644
--- a/scripts/gen_regex.py
+++ b/scripts/gen_regex.py
@@ -15,18 +15,13 @@ def _emoji_char_class():
     """
     emoji_file = DATA_PATH / 'emoji.txt'
 
-    ranges = []
-    for i, c in enumerate(chardata.CHAR_CLASS_STRING):
-        # c represents the character class (3 corresponds to emoji)
-        if c == '3' and i >= 0x2600 and i != 0xfffd:
-            if ranges and i == ranges[-1][1] + 1:
-                ranges[-1][1] = i
-            else:
-                ranges.append([i, i])
-    out = '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges)
+    def accept(c):  # predicate passed to func_to_regex; class '3' is emoji
+        x = ord(c)  # code point of c, indexes CHAR_CLASS_STRING
+        return chardata.CHAR_CLASS_STRING[x] == '3' and \
+                x >= 0x2600 and x != 0xfffd
 
-    with emoji_file.open(mode='w') as file:
-        file.write(out)
+    with (DATA_PATH / 'emoji.txt').open(mode='w') as file:
+        file.write(func_to_regex(accept))
 
 
 def _non_punct_class():
diff --git a/wordfreq/data/emoji.txt b/wordfreq/data/emoji.txt
index f09f7b9..15c56fb 100644
--- a/wordfreq/data/emoji.txt
+++ b/wordfreq/data/emoji.txt
@@ -1 +1 @@
-[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⭳⭶-⮕⮘-⮹⮽-⯈⯊-⯑⳥-⳪⺀-⺙⺛-⻳⼀-⿕⿰-⿻〄-〄〒-〓〠-〠〶-〷〾-〿㆐-㆑㆖-㆟㇀-㇣㈀-㈞㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㋾㌀-㏿䷀-䷿꒐-꓆꠨-꠫꠶-꠷꠹-꠹꩷-꩹﷽-﷽￤-￤￨-￨￭-￮￼-￼𐄷-𐄿𐅹-𐆉𐆌-𐆌𐆐-𐆛𐆠-𐆠𐇐-𐇼𐡷-𐡸𐫈-𐫈𖬼-𖬿𖭅-𖭅𛲜-𛲜𝀀-𝃵𝄀-𝄦𝄩-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝇝𝈀-𝉁𝉅-𝉅𝌀-𝍖🀀-🃿🄍-🣿]
\ No newline at end of file
+[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⯿⳥-⳪⸼-⿿〄-〄〒-〓〠-〠〶-〷〾-぀㆏-㆑㆖-㆟ㆻ-㇯㈀-㈟㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㏿䶶-䷿꒍-꓏꠨-꠯꠶-꠷꠹-꠿꩷-꩹﷽-﷿￤-￤￧-￨￭-￸￼-￼𐄴-𐄿𐅹-𐆉𐆋-𐇼𐡠-𐣿𐪀-𐫿𖨹-𖻿𛀂-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝉁𝉅-𝍟𞻲-🃿🄋-🿿]
\ No newline at end of file