created alternate implementation of func-to-regex

Former-commit-id: 7c189ef129
This commit is contained in:
Joshua Chin 2015-07-10 11:03:57 -04:00
parent 8a3638bc59
commit e23a8c0dc6

View File

@@ -8,29 +8,30 @@ CATEGORIES = [unicodedata.category(chr(i)) for i in range(0x110000)]
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data')) DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
def func_to_regex(func): def func_to_regex(accept):
""" """
Given a function that returns True or False for a numerical codepoint, Given a function that returns True or False for a numerical codepoint,
return a regex character class accepting the characters resulting in True. return a regex character class accepting the characters resulting in True.
Ranges separated only by unassigned characters are merged for efficiency. Ranges separated only by unassigned characters are merged for efficiency.
""" """
# A list of [start, end (accepted), end (accepted or unassigned)] lists # start and end of the range we are currently parsing. `start` is None if
# we are not parsing a range.
start = end = None
ranges = [] ranges = []
for i, cat in enumerate(CATEGORIES): for codepoint, category in enumerate(CATEGORIES):
if func(i): if accept(codepoint):
# If the last range can be extended, do so; else start a new one if start is None:
if ranges and ranges[-1][2] == i - 1: start = codepoint
ranges[-1][1] = i end = codepoint
ranges[-1][2] = i elif category != 'Cn' and start is not None:
else: ranges.append((start, end))
ranges.append([i, i, i]) start = end = None
elif cat == 'Cn':
# If the last range can be extended, do so
if ranges and ranges[-1][2] == i - 1:
ranges[-1][2] = i
return '[%s]' % ''.join(chr(r[0]) + '-' + chr(r[1]) for r in ranges) if start is not None:
ranges.append((start, end))
return '[%s]' % ''.join('%s-%s' % (chr(r[0]), chr(r[1])) for r in ranges)
def cache_regex_from_func(filename, func): def cache_regex_from_func(filename, func):