updated emoji parser

2024-12-23 17:31:41 +00:00 · 2015-07-07 15:43:34 -04:00 · 2015-07-07 15:43:34 -04:00 · f04ca8fc9e
commit f04ca8fc9e
parent 9b851f3afe
2 changed files with 7 additions and 12 deletions
--- a/scripts/gen_regex.py
+++ b/scripts/gen_regex.py
@@ -15,18 +15,13 @@ def _emoji_char_class():
    """
    emoji_file = DATA_PATH / 'emoji.txt'
-    ranges = []
+    def accept(c):
-    for i, c in enumerate(chardata.CHAR_CLASS_STRING):
+        x = ord(c)
-        # c represents the character class (3 corresponds to emoji)
+        return chardata.CHAR_CLASS_STRING[x] == '3' and \
-        if c == '3' and i >= 0x2600 and i != 0xfffd:
+                x >= 0x2600 and x != 0xfffd
-            if ranges and i == ranges[-1][1] + 1:
-                ranges[-1][1] = i
-            else:
-                ranges.append([i, i])
-    out = '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges)
-    with emoji_file.open(mode='w') as file:
+    with (DATA_PATH / 'emoji.txt').open(mode='w') as file:
-        file.write(out)
+        file.write(func_to_regex(accept))
 def _non_punct_class():
--- a/wordfreq/data/emoji.txt
+++ b/wordfreq/data/emoji.txt
@@ -1 +1 @@
-[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⭳⭶-⮕⮘-⮹⮽-⯈⯊-⯑⳥-⳪⺀-⺙⺛-⻳⼀-⿕⿰-⿻〄-〄〒-〓〠-〠〶-〷〾-〿㆐-㆑㆖-㆟㇀-㇣㈀-㈞㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㋾㌀-㏿䷀-䷿꒐-꓆꠨-꠫꠶-꠷꠹-꠹꩷-꩹﷽-﷽￤-￤￨-￨￭-￮-𐄷-𐄿𐅹-𐆉𐆌-𐆌𐆐-𐆛𐆠-𐆠𐇐-𐇼𐡷-𐡸𐫈-𐫈𖬼-𖬿𖭅-𖭅𛲜-𛲜𝀀-𝃵𝄀-𝄦𝄩-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝇝𝈀-𝉁𝉅-𝉅𝌀-𝍖🀀-🃿🄍-🣿]
+[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⯿⳥-⳪⸼-⿿〄-〄〒-〓〠-〠〶-〷〾-぀㆏-㆑㆖-㆟ㆻ-㇯㈀-㈟㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㏿䶶-䷿꒍-꓏꠨-꠯꠶-꠷꠹-꠿꩷-꩹﷽-﷿￤-￤￧-￨￭-￸-𐄴-𐄿𐅹-𐆉𐆋-𐇼𐡠-𐣿𐪀-𐫿𖨹-𖻿𛀂-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝉁𝉅-𝍟𞻲-🃿🄋-🿿]
		`@@ -1 +1 @@`
			`[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⭳⭶-⮕⮘-⮹⮽-⯈⯊-⯑⳥-⳪⺀-⺙⺛-⻳⼀-⿕⿰-⿻〄-〄〒-〓〠-〠〶-〷〾-〿㆐-㆑㆖-㆟㇀-㇣㈀-㈞㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㋾㌀-㏿䷀-䷿꒐-꓆꠨-꠫꠶-꠷꠹-꠹꩷-꩹﷽-﷽￤-￤￨-￨￭-￮-𐄷-𐄿𐅹-𐆉𐆌-𐆌𐆐-𐆛𐆠-𐆠𐇐-𐇼𐡷-𐡸𐫈-𐫈𖬼-𖬿𖭅-𖭅𛲜-𛲜𝀀-𝃵𝄀-𝄦𝄩-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝇝𝈀-𝉁𝉅-𝉅𝌀-𝍖🀀-🃿🄍-🣿]`				`[☀-♮♰-❧➔-➿⠀-⣿⬀-⬯⭅-⭆⭍-⯿⳥-⳪⸼-⿿〄-〄〒-〓〠-〠〶-〷〾-぀㆏-㆑㆖-㆟ㆻ-㇯㈀-㈟㈪-㉇㉐-㉐㉠-㉿㊊-㊰㋀-㏿䶶-䷿꒍-꓏꠨-꠯꠶-꠷꠹-꠿꩷-꩹﷽-﷿￤-￤￧-￨￭-￸-𐄴-𐄿𐅹-𐆉𐆋-𐇼𐡠-𐣿𐪀-𐫿𖨹-𖻿𛀂-𝅘𝅥𝅲𝅪-𝅬𝆃-𝆄𝆌-𝆩𝆮-𝉁𝉅-𝍟𞻲-🃿🄋-🿿]`