factored out regex generation

This commit is contained in:
Joshua Chin 2015-07-07 14:38:21 -04:00
parent 781a072713
commit 476a909e4d

39
scripts/gen_regex.py Normal file
View File

@ -0,0 +1,39 @@
import argparse
import unicodedata
def _escape_in_class(char):
    """Backslash-escape `char` if it is special inside a regex `[...]` class."""
    return '\\' + char if char in '\\]^-[' else char


def func_to_regex(accept):
    """
    Convert a single-character predicate into a regex character class.

    Every Unicode code point from U+0000 to U+10FFFF is passed to `accept`
    as a one-character string. Consecutive accepted characters are merged
    into `start-end` ranges, and the ranges are joined inside `[...]`.

    Unassigned characters (Unicode category 'Cn') that `accept` rejects are
    treated like their neighbors: they extend an adjacent accepted run on
    either side, but a run made up only of unassigned characters is dropped.

    Range endpoints that are special inside a character class
    (backslash, `]`, `[`, `^`, `-`) are backslash-escaped.

    Returns the character class as a string. If no character is accepted,
    the result is the empty class '[]'.
    """
    last_code_point = 0x10FFFF

    ranges = []
    start = None          # first character of the run currently being built
    has_accepted = False  # True once the current run holds an accepted char

    def emit(end):
        # Record the finished run `start`..`end` as one escaped range.
        ranges.append('-'.join([_escape_in_class(start),
                                _escape_in_class(end)]))

    for x in range(last_code_point + 1):
        c = chr(x)

        if accept(c):
            has_accepted = True
            if start is None:
                # An accepted character opens a new run.
                start = c
        elif unicodedata.category(c) == 'Cn':
            # Unassigned: tentatively open a run, or silently extend one.
            if start is None:
                start = c
        elif start is not None:
            # An assigned, rejected character closes the run. Keep the run
            # only if it contained at least one accepted character.
            if has_accepted:
                emit(chr(x - 1))
                has_accepted = False
            start = None

    # A run still open here reaches the very last code point, inclusive.
    if has_accepted and start is not None:
        emit(chr(last_code_point))

    return '[%s]' % ''.join(ranges)
if __name__ == '__main__':
    # Command-line entry point: evaluate the given Python expression to get
    # the acceptor function, then print the regex generated from it.
    parser = argparse.ArgumentParser(description='Generate a regex matching a function')
    parser.add_argument('acceptor', help='a python function that accepts a single char')
    args = parser.parse_args()
    # SECURITY NOTE: eval() executes arbitrary code taken from the command
    # line. Only run this script with expressions you trust; never pass it
    # input that comes from an untrusted source.
    print(func_to_regex(eval(args.acceptor)))