Merge pull request #17 from LuminosoInsight/alternative-func-to-regex

Created an alternate implementation of func_to_regex.

Former-commit-id: 6efdaa308c
This commit is contained in:
Andrew Lin 2015-07-10 14:32:49 -04:00
commit 8439f14595

View File

@@ -14,23 +14,22 @@ def func_to_regex(accept_func):
return a regex character class accepting the characters resulting in True. return a regex character class accepting the characters resulting in True.
Ranges separated only by unassigned characters are merged for efficiency. Ranges separated only by unassigned characters are merged for efficiency.
""" """
# Where the last range would end if it also included unassigned codepoints. # parsing_range is True if the current codepoint might be in a range that
# If we need to add a codepoint right after this point, we extend the # the regex will accept
# range; otherwise we start a new one. parsing_range = False
tentative_end = None
ranges = [] ranges = []
for codepoint, category in enumerate(CATEGORIES): for codepoint, category in enumerate(CATEGORIES):
if accept_func(codepoint): if accept_func(codepoint):
if tentative_end == codepoint - 1: if not parsing_range:
ranges[-1][1] = codepoint
else:
ranges.append([codepoint, codepoint]) ranges.append([codepoint, codepoint])
tentative_end = codepoint parsing_range = True
elif category == 'Cn' and tentative_end == codepoint - 1: else:
tentative_end = codepoint ranges[-1][1] = codepoint
elif category != 'Cn':
parsing_range = False
return '[%s]' % ''.join(chr(r[0]) + '-' + chr(r[1]) for r in ranges) return '[%s]' % ''.join('%c-%c' % tuple(r) for r in ranges)
def cache_regex_from_func(filename, func): def cache_regex_from_func(filename, func):