mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
parent
54eece5e8c
commit
2262088b5f
@ -8,7 +8,7 @@ CATEGORIES = [unicodedata.category(chr(i)) for i in range(0x110000)]
|
|||||||
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
|
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
|
||||||
|
|
||||||
|
|
||||||
def func_to_regex(func):
|
def func_to_regex(accept_func):
|
||||||
"""
|
"""
|
||||||
Given a function that returns True or False for a numerical codepoint,
|
Given a function that returns True or False for a numerical codepoint,
|
||||||
return a regex character class accepting the characters resulting in True.
|
return a regex character class accepting the characters resulting in True.
|
||||||
@ -20,15 +20,15 @@ def func_to_regex(func):
|
|||||||
tentative_end = None
|
tentative_end = None
|
||||||
ranges = []
|
ranges = []
|
||||||
|
|
||||||
for i, cat in enumerate(CATEGORIES):
|
for codepoint, category in enumerate(CATEGORIES):
|
||||||
if func(i):
|
if accept_func(codepoint):
|
||||||
if tentative_end == i - 1:
|
if tentative_end == codepoint - 1:
|
||||||
ranges[-1][1] = i
|
ranges[-1][1] = codepoint
|
||||||
else:
|
else:
|
||||||
ranges.append([i, i])
|
ranges.append([codepoint, codepoint])
|
||||||
tentative_end = i
|
tentative_end = codepoint
|
||||||
elif cat == 'Cn' and tentative_end == i - 1:
|
elif category == 'Cn' and tentative_end == codepoint - 1:
|
||||||
tentative_end = i
|
tentative_end = codepoint
|
||||||
|
|
||||||
return '[%s]' % ''.join(chr(r[0]) + '-' + chr(r[1]) for r in ranges)
|
return '[%s]' % ''.join(chr(r[0]) + '-' + chr(r[1]) for r in ranges)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user