cleaned up gen regex

Former-commit-id: 27ea107e6f
This commit is contained in:
Joshua Chin 2015-07-07 16:00:24 -04:00
parent a72b4abb48
commit 7e9338f87e

View File

@ -8,25 +8,31 @@ from pkg_resources import resource_filename
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data')) DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
def cache_regex_from_func(filename, func):
    """
    Generate a regex from a function that accepts a single unicode
    character, and cache it in the data path at `filename`.

    The text returned by `func_to_regex(func)` is written to
    `DATA_PATH / filename`, replacing any previous contents of that file.
    """
    # Build the regex before opening the file: opening in 'w' mode truncates
    # the file immediately, so a failure while generating the regex would
    # otherwise leave an empty cache file behind.
    regex = func_to_regex(func)

    # NOTE(review): no encoding is passed, so the platform default is used;
    # confirm that whatever reads these files decodes them the same way.
    with (DATA_PATH / filename).open(mode='w') as file:
        file.write(regex)
def _emoji_char_class(): def _emoji_char_class():
""" """
Build a regex for emoji substitution. We create a regex character set Build a regex for emoji substitution. We create a regex character set
(like "[a-cv-z]") matching characters we consider emoji. (like "[a-cv-z]") matching characters we consider emoji.
""" """
emoji_file = DATA_PATH / 'emoji.txt' cache_regex_from_func(
'emoji.txt',
def accept(c): lambda c:
x = ord(c) chardata.CHAR_CLASS_STRING[ord(c)] == '3' and
return chardata.CHAR_CLASS_STRING[x] == '3' and \ c >= '\u2600' and c != '\ufffd'
x >= 0x2600 and x != 0xfffd )
with (DATA_PATH / 'emoji.txt').open(mode='w') as file:
file.write(func_to_regex(accept))
def _non_punct_class(): def _non_punct_class():
""" """
Builds a regex that matches anything that is not a one of the following Builds a regex that matches anything that is not one of the following
classes: classes:
- P: punctuation - P: punctuation
- S: symbols - S: symbols
@ -35,23 +41,20 @@ def _non_punct_class():
This will classify symbols, including emoji, as punctuation; callers that This will classify symbols, including emoji, as punctuation; callers that
want to treat emoji separately should filter them out first. want to treat emoji separately should filter them out first.
""" """
non_punct_file = DATA_PATH / 'non_punct.txt' cache_regex_from_func(
'non_punct.txt',
out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC') lambda c: unicodedata.category(c)[0] not in 'PSZC'
)
with non_punct_file.open(mode='w') as file:
file.write(out)
def _combining_mark_class(): def _combining_mark_class():
""" """
Builds a regex that matches anything that is a combining mark Builds a regex that matches anything that is a combining mark
""" """
combining_mark_file = DATA_PATH / 'combining_mark.txt' cache_regex_from_func(
out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M') 'combining_mark.txt',
lambda c: unicodedata.category(c)[0] == 'M'
with combining_mark_file.open(mode='w') as file: )
file.write(out)
def func_to_regex(accept): def func_to_regex(accept):