updated gen_regex to be run as script

2024-12-23 17:31:41 +00:00 · 2015-07-07 14:50:56 -04:00 · 2015-07-07 14:50:56 -04:00 · 22fbea4248
commit 22fbea4248
parent f3f9a654ea
1 changed files with 40 additions and 7 deletions
--- a/scripts/gen_regex.py
+++ b/scripts/gen_regex.py
@ -1,6 +1,9 @@
 import argparse
 import unicodedata
 import chardata
+import pathlib
+from pkg_resources import resource_filename
+
+# Filesystem location of the 'data' directory inside the wordfreq package.
+# The generator functions in this script write their regex files here.
+DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))

 def _emoji_char_class():
    """
@ -9,6 +12,8 @@ def _emoji_char_class():
    matches one such character followed by any number of spaces and identical
    characters.
    """
+    emoji_file = DATA_PATH / 'emoji.txt'
+
    ranges = []
    for i, c in enumerate(chardata.CHAR_CLASS_STRING):
        if c == '3' and i >= 0x2600 and i != 0xfffd:
@ -16,7 +21,38 @@ def _emoji_char_class():
                ranges[-1][1] = i
            else:
                ranges.append([i, i])
-    return '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges)
+    out = '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges)
+
+    with emoji_file.open(mode='w') as file:
+        file.write(out)
+
+def _non_punct_class():
+    """
+    Builds a regex that matches any character that is not in one of the
+    following Unicode major categories, and writes it to
+    DATA_PATH / 'non_punct.txt' (nothing is returned):
+    - P: punctuation
+    - S: symbols
+    - Z: separators
+    - C: control characters and the other "Other" categories
+    This will classify symbols, including emoji, as punctuation; callers that
+    want to treat emoji separately should filter them out first.
+    """
+    non_punct_file = DATA_PATH / 'non_punct.txt'
+
+    # category(c)[0] is the major-category letter, e.g. 'P' for 'Po'.
+    out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC')
+
+    with non_punct_file.open(mode='w') as file:
+        file.write(out)
+
+def _combining_mark_class():
+    """
+    Builds a regex that matches any combining mark (Unicode major category M)
+    and writes it to DATA_PATH / 'combining_mark.txt' (nothing is returned).
+    """
+    combining_mark_file = DATA_PATH / 'combining_mark.txt'
+    # category(c)[0] is the major-category letter, so this covers Mn, Mc, Me.
+    out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M')
+
+    with combining_mark_file.open(mode='w') as file:
+        file.write(out)

 def func_to_regex(accept):
    """
@ -48,9 +84,6 @@ def func_to_regex(accept):
    return '[%s]' % ''.join(ranges)

 if __name__ == '__main__':
-    import argparse
-
-    parser = argparse.ArgumentParser(description='Generate a regex matching a function')
-    parser.add_argument('acceptor', help='an python function that accepts a single char')
-    args = parser.parse_args()
-    print(func_to_regex(eval(args.acceptor)))
+    # Regenerate every regex data file; each call writes its own file under DATA_PATH.
+    _combining_mark_class()
+    _non_punct_class()
+    _emoji_char_class()