updated imports

2024-12-23 09:21:37 +00:00 · 2015-07-07 14:46:42 -04:00 · 2015-07-07 14:46:42 -04:00 · f2b615b0f0
commit f2b615b0f0
parent b1cd2e01d3
2 changed files with 18 additions and 0 deletions
--- a/scripts/gen_regex.py
+++ b/scripts/gen_regex.py
@@ -1,5 +1,22 @@
 import argparse
 import unicodedata
+import chardata
+
+def _emoji_char_class():
+    """
+    Build a regex for emoji substitution.  First we create a regex character set
+    (like "[a-cv-z]") matching characters we consider emoji.  The final regex
+    matches one such character followed by any number of spaces and identical
+    characters.
+    """
+    ranges = []
+    for i, c in enumerate(chardata.CHAR_CLASS_STRING):
+        if c == '3' and i >= 0x2600 and i != 0xfffd:
+            if ranges and i == ranges[-1][1] + 1:
+                ranges[-1][1] = i
+            else:
+                ranges.append([i, i])
+    return '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges)

 def func_to_regex(accept):
    """
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -87,6 +87,7 @@ def tokenize(text, lang):
    """
    if lang == 'ja':
        from wordfreq.mecab import mecab_tokenize
+        return mecab_tokenize(text)

    if lang == 'ar':
        text = COMBINING_MARK_RE.sub('', text.replace('ـ', ''))