mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-25 10:15:23 +00:00
40 lines
1.2 KiB
Python
40 lines
1.2 KiB
Python
|
import argparse
|
||
|
import unicodedata
|
||
|
|
||
|
def func_to_regex(accept):
    """
    Converts a function that accepts a single unicode character into a regex.

    Every code point from U+0000 through U+10FFFF is passed to `accept` as a
    one-character string. Each run of consecutive accepted characters becomes
    a `start-end` range, and all ranges are concatenated inside one character
    class, for example '[a-z]'.

    Unassigned unicode characters are treated like their neighbors: a
    character of category 'Cn' that `accept` rejects may begin or extend a
    run, but a run is only emitted if it contains at least one accepted
    character.

    The range endpoints are written into the class verbatim (no escaping),
    and the result is '[]' when nothing is accepted.

    :param accept: callable taking a one-character string and returning a
        truthy value for characters the regex should match.
    :return: the character class, as a string.
    """
    limit = 0x110000       # one past the highest unicode code point
    ranges = []
    start = None           # first character of the run being built, if any
    has_accepted = False   # whether that run holds an accepted character

    for x in range(limit):
        c = chr(x)

        if accept(c):
            has_accepted = True
            if start is None:
                # An accepted character opens a new run.
                start = c
        elif unicodedata.category(c) == 'Cn':
            # Unassigned: tentatively open a run, or silently extend one.
            if start is None:
                start = c
        elif start is not None:
            # An assigned, rejected character ends the run just before it.
            if has_accepted:
                ranges.append('-'.join([start, chr(x - 1)]))
                has_accepted = False
            start = None

    # A run still open here extends through the very last code point, so it
    # must be closed at `limit - 1` rather than one character earlier.
    if has_accepted:
        ranges.append('-'.join([start, chr(limit - 1)]))

    return '[%s]' % ''.join(ranges)
|
||
|
|
||
|
if __name__ == '__main__':
    # Command-line entry point: takes one positional argument containing
    # Python source for a single-character predicate, builds the matching
    # character class with func_to_regex, and prints it.
    parser = argparse.ArgumentParser(description='Generate a regex matching a function')
    parser.add_argument('acceptor', help='a Python function that accepts a single char')
    args = parser.parse_args()

    # NOTE(security): eval() runs the argument as arbitrary Python code.
    # Only invoke this script with expressions you wrote yourself; never
    # pass it input that comes from an untrusted source.
    print(func_to_regex(eval(args.acceptor)))
|