remove obsolete gen_regex.py

Former-commit-id: 102bc715ae
2024-12-23 09:21:37 +00:00 · 2015-08-24 17:09:50 -04:00 · 2015-08-24 17:09:50 -04:00 · 3674d35501
commit 3674d35501
parent 8795525372
1 changed files with 0 additions and 76 deletions
--- a/scripts/gen_regex.py
+++ b/scripts/gen_regex.py
@ -1,76 +0,0 @@
-import unicodedata
-from ftfy import chardata
-import pathlib
-from pkg_resources import resource_filename
-
-
-CATEGORIES = [unicodedata.category(chr(i)) for i in range(0x110000)]
-DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
-
-
-def func_to_regex(accept_func):
-    """
-    Given a function that returns True or False for a numerical codepoint,
-    return a regex character class accepting the characters resulting in True.
-    Ranges separated only by unassigned characters are merged for efficiency.
-    """
-    # parsing_range is True if the current codepoint might be in a range that
-    # the regex will accept
-    parsing_range = False
-    ranges = []
-
-    for codepoint, category in enumerate(CATEGORIES):
-        if accept_func(codepoint):
-            if not parsing_range:
-                ranges.append([codepoint, codepoint])
-                parsing_range = True
-            else:
-                ranges[-1][1] = codepoint
-        elif category != 'Cn':
-            parsing_range = False
-
-    return '[%s]' % ''.join('%c-%c' % tuple(r) for r in ranges)
-
-
-def cache_regex_from_func(filename, func):
-    """
-    Generates a regex from a function that accepts a single unicode character,
-    and caches it in the data path at filename.
-    """
-    with (DATA_PATH / filename).open(mode='w') as file:
-        file.write(func_to_regex(func))
-
-
-def _is_emoji_codepoint(i):
-    """
-    Report whether a numerical codepoint is (likely) an emoji: a Unicode 'So'
-    character (as future-proofed by the ftfy chardata module) but excluding
-    symbols like © and ™ below U+2600 and the replacement character U+FFFD.
-    """
-    return chardata.CHAR_CLASS_STRING[i] == '3' and i >= 0x2600 and i != 0xfffd
-
-
-def _is_non_punct_codepoint(i):
-    """
-    Report whether a numerical codepoint is not one of the following classes:
-    - P: punctuation
-    - S: symbols
-    - Z: separators
-    - C: control characters
-    This will classify symbols, including emoji, as punctuation; users that
-    want to accept emoji should add them separately.
-    """
-    return CATEGORIES[i][0] not in 'PSZC'
-
-
-def _is_combining_mark_codepoint(i):
-    """
-    Report whether a numerical codepoint is a combining mark (Unicode 'M').
-    """
-    return CATEGORIES[i][0] == 'M'
-
-
-if __name__ == '__main__':
-    cache_regex_from_func('emoji.txt', _is_emoji_codepoint)
-    cache_regex_from_func('non_punct.txt', _is_non_punct_codepoint)
-    cache_regex_from_func('combining_mark.txt', _is_combining_mark_codepoint)