mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
created alternate implementation of func-to-regex
Former-commit-id: 7c189ef129
This commit is contained in:
parent
8a3638bc59
commit
e23a8c0dc6
@@ -8,29 +8,30 @@ CATEGORIES = [unicodedata.category(chr(i)) for i in range(0x110000)]
|
|||||||
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
|
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
|
||||||
|
|
||||||
|
|
||||||
def func_to_regex(func):
|
def func_to_regex(accept):
|
||||||
"""
|
"""
|
||||||
Given a function that returns True or False for a numerical codepoint,
|
Given a function that returns True or False for a numerical codepoint,
|
||||||
return a regex character class accepting the characters resulting in True.
|
return a regex character class accepting the characters resulting in True.
|
||||||
Ranges separated only by unassigned characters are merged for efficiency.
|
Ranges separated only by unassigned characters are merged for efficiency.
|
||||||
"""
|
"""
|
||||||
# A list of [start, end (accepted), end (accepted or unassigned)] lists
|
# start and end of the range we are currently parsing. `start` is None if
|
||||||
|
# we are not parsing a range.
|
||||||
|
start = end = None
|
||||||
ranges = []
|
ranges = []
|
||||||
|
|
||||||
for i, cat in enumerate(CATEGORIES):
|
for codepoint, category in enumerate(CATEGORIES):
|
||||||
if func(i):
|
if accept(codepoint):
|
||||||
# If the last range can be extended, do so; else start a new one
|
if start is None:
|
||||||
if ranges and ranges[-1][2] == i - 1:
|
start = codepoint
|
||||||
ranges[-1][1] = i
|
end = codepoint
|
||||||
ranges[-1][2] = i
|
elif category != 'Cn' and start is not None:
|
||||||
else:
|
ranges.append((start, end))
|
||||||
ranges.append([i, i, i])
|
start = end = None
|
||||||
elif cat == 'Cn':
|
|
||||||
# If the last range can be extended, do so
|
|
||||||
if ranges and ranges[-1][2] == i - 1:
|
|
||||||
ranges[-1][2] = i
|
|
||||||
|
|
||||||
return '[%s]' % ''.join(chr(r[0]) + '-' + chr(r[1]) for r in ranges)
|
if start is not None:
|
||||||
|
ranges.append((start, end))
|
||||||
|
|
||||||
|
return '[%s]' % ''.join('%s-%s' % (chr(r[0]), chr(r[1])) for r in ranges)
|
||||||
|
|
||||||
|
|
||||||
def cache_regex_from_func(filename, func):
|
def cache_regex_from_func(filename, func):
|
||||||
|
Loading…
Reference in New Issue
Block a user