def _emoji_char_class():
    """
    Return a regex character class (like "[a-cv-z]") for emoji characters.

    Scans `chardata.CHAR_CLASS_STRING` by index and selects every position
    whose entry is '3', whose index is at least 0x2600, and whose index is
    not 0xfffd. Consecutive selected indices are merged into a single
    "start-end" range; an isolated index is still written as a range whose
    two endpoints are the same character.

    Only the bracketed character class is returned. Any larger pattern
    (for example one that also matches following spaces or repeated
    characters) has to be assembled by the caller.
    """
    # NOTE(review): presumably the index into CHAR_CLASS_STRING is a Unicode
    # code point and '3' is the class code for emoji/symbols -- confirm
    # against the chardata module.
    pieces = []
    run_start = run_end = None
    for codepoint, char_class in enumerate(chardata.CHAR_CLASS_STRING):
        # Skip anything that is not class '3', lies below 0x2600, or is
        # 0xfffd (U+FFFD, the Unicode replacement character).
        if char_class != '3' or codepoint < 0x2600 or codepoint == 0xfffd:
            continue
        # Directly adjacent to the current run: just extend it.
        if run_end is not None and codepoint == run_end + 1:
            run_end = codepoint
            continue
        # A gap was found: flush the finished run before starting a new one.
        if run_start is not None:
            pieces.append('%s-%s' % (chr(run_start), chr(run_end)))
        run_start = run_end = codepoint
    # Flush the last run, if any character was selected at all.
    if run_start is not None:
        pieces.append('%s-%s' % (chr(run_start), chr(run_end)))
    return '[' + ''.join(pieces) + ']'