# scripts/gen_regex.py
#
# Recovered from a git patch whose line breaks had been collapsed:
#   commit:  476a909e4d68a7fe79244620441e3400124925e0
#   From:    Joshua Chin
#   Date:    Tue, 7 Jul 2015 14:38:21 -0400
#   Subject: [PATCH] factored out regex generation

import argparse
import unicodedata

# Characters that have a special meaning inside a regex character class and
# therefore must be backslash-escaped when used as a range endpoint.
_CLASS_SPECIALS = frozenset('\\]^-[')


def _format_range(first, last):
    """Return ``first-last`` with both endpoints escaped for use inside [...]."""
    def escape(char):
        return '\\' + char if char in _CLASS_SPECIALS else char

    return '-'.join([escape(first), escape(last)])


def func_to_regex(accept, limit=0x110000):
    """
    Converts a function that accepts a single unicode character into a regex.
    Unassigned unicode characters are treated like their neighbors.

    Every code point in ``range(limit)`` is passed to ``accept`` as a
    one-character string.  Consecutive accepted characters are merged into a
    single ``start-end`` range; unassigned code points (category ``Cn``) that
    touch an accepted run are absorbed into it, and runs consisting only of
    unassigned code points are discarded.

    Parameters:
        accept: callable taking a one-character ``str`` and returning a
            truthy value when the character should be matched.
        limit: exclusive upper bound of the code points to examine.  Defaults
            to ``0x110000``, i.e. the whole Unicode code space.

    Returns:
        A character class string such as ``'[0-9a-f]'``.  Range endpoints
        that are special inside a class (``\\``, ``]``, ``[``, ``^``, ``-``)
        are backslash-escaped.  If nothing is accepted the result is ``'[]'``,
        which is not a valid Python regex on its own.
    """
    ranges = []
    start = None            # first character of the run currently being built
    has_accepted = False    # True once the current run holds an accepted char

    for x in range(limit):
        c = chr(x)

        if accept(c):
            has_accepted = True
            if start is None:
                # Open a new run at this character.
                start = c
        elif unicodedata.category(c) == 'Cn':
            # Unassigned: may begin a run, and never terminates one.
            if start is None:
                start = c
        elif start is not None:
            # An assigned, rejected character closes the run.  Only keep the
            # run if it contained at least one accepted character.
            if has_accepted:
                ranges.append(_format_range(start, chr(x - 1)))
                has_accepted = False
            start = None

    # A run still open after the last code point extends through that code
    # point, which is limit - 1.
    if has_accepted and start is not None:
        ranges.append(_format_range(start, chr(limit - 1)))

    return '[%s]' % ''.join(ranges)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Generate a regex matching a function')
    parser.add_argument('acceptor', help='an python function that accepts a single char')
    args = parser.parse_args()
    # SECURITY NOTE(review): eval() executes arbitrary code taken from the
    # command line.  Acceptable for a developer-run script; never feed this
    # untrusted input.
    print(func_to_regex(eval(args.acceptor)))