wordfreq/scripts/gen_regex.py

77 lines
2.4 KiB
Python
Raw Normal View History

2015-07-07 18:38:21 +00:00
import unicodedata
2015-07-07 19:22:04 +00:00
from ftfy import chardata
2015-07-07 18:50:56 +00:00
import pathlib
2015-07-07 19:22:04 +00:00
from pkg_resources import resource_filename
2015-07-07 18:50:56 +00:00
2015-07-07 19:23:15 +00:00
# Unicode general category (e.g. 'Lu', 'Cn') of every codepoint, indexed by
# the codepoint's numerical value, covering U+0000 through U+10FFFF.
CATEGORIES = list(map(unicodedata.category, map(chr, range(0x110000))))
2015-07-07 18:50:56 +00:00
# Path of the 'data' resource of the 'wordfreq' package; the regex files
# generated by this script are written into this directory.
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
2015-07-07 18:46:42 +00:00
2015-07-07 19:23:15 +00:00
2015-07-10 18:02:33 +00:00
def func_to_regex(accept_func):
    """
    Build a regex character class from a predicate on codepoints.

    `accept_func` is called with each numerical codepoint in turn and should
    return a true value for the codepoints the class must accept. The result
    is a string of the form '[a-bc-d...]', one 'start-end' range per run of
    accepted codepoints.

    Ranges separated only by unassigned characters (category 'Cn') are merged
    for efficiency, so the class may also match unassigned codepoints that
    lie between accepted ones.

    Range endpoints that have a special meaning inside a character class
    (backslash, ']', '[', '^' and '-') are backslash-escaped so that they are
    matched literally. If no codepoint is accepted, a class that can never
    match is returned, because an empty '[]' is not a valid pattern.
    """
    # Characters that need a backslash to stand for themselves in a class.
    special_chars = '\\]^-['

    def as_literal(codepoint):
        # Render one range endpoint, escaping it if the regex syntax would
        # otherwise interpret it.
        char = chr(codepoint)
        if char in special_chars:
            return '\\' + char
        return char

    # parsing_range is True if the current codepoint might be in a range that
    # the regex will accept
    parsing_range = False
    ranges = []
    for codepoint, category in enumerate(CATEGORIES):
        if accept_func(codepoint):
            if parsing_range:
                # Extend the open range; this also swallows any unassigned
                # codepoints skipped since the last accepted one.
                ranges[-1][1] = codepoint
            else:
                ranges.append([codepoint, codepoint])
                parsing_range = True
        elif category != 'Cn':
            # A rejected, assigned codepoint ends the range. Unassigned ones
            # leave it open so that neighbouring ranges can be merged.
            parsing_range = False

    if not ranges:
        # '[]' would not compile; this class excludes every character.
        return r'[^\s\S]'

    return '[%s]' % ''.join(
        '%s-%s' % (as_literal(start), as_literal(end))
        for start, end in ranges
    )
2015-07-07 20:00:24 +00:00
def cache_regex_from_func(filename, func):
    """
    Generate a regex character class from `func` and cache it in a file.

    `func` is a function that accepts a numerical codepoint and returns a
    true or false value; it is passed to func_to_regex. The resulting regex
    is written to `filename` inside DATA_PATH, replacing any existing file.

    The file is always written as UTF-8, because the regex generally contains
    non-ASCII characters and the platform's default encoding may be unable to
    represent them.
    """
    # Build the regex before opening the file, so that a failure in `func`
    # does not leave a truncated, empty cache file behind.
    regex = func_to_regex(func)
    with (DATA_PATH / filename).open(mode='w', encoding='utf-8') as file:
        file.write(regex)
2015-07-07 20:00:24 +00:00
def _is_emoji_codepoint(i):
    """
    Report whether the numerical codepoint `i` is (likely) an emoji.

    A codepoint qualifies when the ftfy chardata module puts it in class '3'
    (its future-proofed version of the Unicode 'So' category), it is at or
    above U+2600 (which leaves out lower symbols such as the copyright sign),
    and it is not the replacement character U+FFFD.
    """
    if chardata.CHAR_CLASS_STRING[i] != '3':
        return False
    if i < 0x2600:
        return False
    return i != 0xfffd
2015-07-07 18:50:56 +00:00
2015-07-07 19:23:15 +00:00
def _is_non_punct_codepoint(i):
    """
    Report whether the numerical codepoint `i` falls outside all of these
    major Unicode classes:

    - P: punctuation
    - S: symbols
    - Z: separators
    - C: control characters

    Symbols, and therefore emoji, count as punctuation here; callers that
    want to accept emoji have to add them separately.
    """
    # The first letter of a general category is its major class.
    major_class = CATEGORIES[i][0]
    return not (major_class in 'PSZC')
2015-07-07 18:38:21 +00:00
2015-07-07 19:23:15 +00:00
def _is_combining_mark_codepoint(i):
    """
    Report whether the numerical codepoint `i` is a combining mark, that is,
    whether its Unicode general category belongs to the major class 'M'.
    """
    # The first letter of a general category is its major class.
    major_class = CATEGORIES[i][0]
    return 'M' == major_class
2015-07-07 18:38:21 +00:00
2015-07-07 19:23:15 +00:00
2015-07-07 18:38:21 +00:00
if __name__ == '__main__':
    # Regenerate every cached character class, pairing each output file
    # with the predicate that decides which codepoints it accepts.
    for filename, predicate in (
        ('emoji.txt', _is_emoji_codepoint),
        ('non_punct.txt', _is_non_punct_codepoint),
        ('combining_mark.txt', _is_combining_mark_codepoint),
    ):
        cache_regex_from_func(filename, predicate)