From 4389422958adb6ca0c9ca36036661ad87950bfe0 Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 15:43:34 -0400 Subject: [PATCH] updated emoji parser Former-commit-id: f04ca8fc9e40b5d8bfb1563414fc4a15a8c8edb0 --- scripts/gen_regex.py | 17 ++++++----------- wordfreq/data/emoji.txt | 2 +- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py index ea50186..1a32ac7 100644 --- a/scripts/gen_regex.py +++ b/scripts/gen_regex.py @@ -15,18 +15,13 @@ def _emoji_char_class(): """ emoji_file = DATA_PATH / 'emoji.txt' - ranges = [] - for i, c in enumerate(chardata.CHAR_CLASS_STRING): - # c represents the character class (3 corresponds to emoji) - if c == '3' and i >= 0x2600 and i != 0xfffd: - if ranges and i == ranges[-1][1] + 1: - ranges[-1][1] = i - else: - ranges.append([i, i]) - out = '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges) + def accept(c): + x = ord(c) + return chardata.CHAR_CLASS_STRING[x] == '3' and \ + x >= 0x2600 and x != 0xfffd - with emoji_file.open(mode='w') as file: - file.write(out) + with (DATA_PATH / 'emoji.txt').open(mode='w') as file: + file.write(func_to_regex(accept)) def _non_punct_class(): diff --git a/wordfreq/data/emoji.txt b/wordfreq/data/emoji.txt index f09f7b9..15c56fb 100644 --- a/wordfreq/data/emoji.txt +++ b/wordfreq/data/emoji.txt @@ -1 +1 @@ -[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⭳⭶-⮕⮘-⮹⮽-⯈⯊-⯑⳥-⳪⺀-⺙⺛-⻳⼀-⿕⿰-⿻〄-〄〒-〓〠-〠〶-〷〾-〿㆐-㆑㆖-㆟㇀-㇣㈀-㈞㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㋾㌀-㏿䷀-䷿꒐-꓆꠨-꠫꠶-꠷꠹-꠹꩷-꩹﷽-﷽¦-¦│-│■-○-𐄷-𐄿𐅹-𐆉𐆌-𐆌𐆐-𐆛𐆠-𐆠𐇐-𐇼𐡷-𐡸𐫈-𐫈𖬼-𖬿𖭅-𖭅𛲜-𛲜𝀀-𝃵𝄀-𝄦𝄩-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝇝𝈀-𝉁𝉅-𝉅𝌀-𝍖🀀-🃿🄍-🣿] \ No newline at end of file +[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⯿⳥-⳪⸼-⿿〄-〄〒-〓〠-〠〶-〷〾-぀㆏-㆑㆖-㆟ㆻ-㇯㈀-㈟㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㏿䶶-䷿꒍-꓏꠨-꠯꠶-꠷꠹-꠿꩷-꩹﷽-﷿¦-¦￧-│■-￸-𐄴-𐄿𐅹-𐆉𐆋-𐇼𐡠-𐣿𐪀-𐫿𖨹-𖻿𛀂-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝉁𝉅-𝍟𞻲-🃿🄋-🿿] \ No newline at end of file