2015-07-07 18:38:21 +00:00
|
|
|
import unicodedata
|
2015-07-07 19:22:04 +00:00
|
|
|
from ftfy import chardata
|
2015-07-07 18:50:56 +00:00
|
|
|
import pathlib
|
2015-07-07 19:22:04 +00:00
|
|
|
from pkg_resources import resource_filename
|
2015-07-07 18:50:56 +00:00
|
|
|
|
2015-07-07 19:23:15 +00:00
|
|
|
|
2015-07-08 19:29:31 +00:00
|
|
|
# Unicode general category (e.g. 'Lu', 'Mn', 'Cn') of every codepoint,
# indexed by the numerical codepoint itself.
CATEGORIES = list(map(unicodedata.category, map(chr, range(0x110000))))
|
2015-07-07 18:50:56 +00:00
|
|
|
# Location of the 'data' resource of the 'wordfreq' package, as a
# pathlib.Path; the generated regex files are written into this directory.
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
|
2015-07-07 18:46:42 +00:00
|
|
|
|
2015-07-07 19:23:15 +00:00
|
|
|
|
2015-07-08 19:29:31 +00:00
|
|
|
def func_to_regex(func):
    """
    Given a function that returns True or False for a numerical codepoint,
    return a regex character class accepting the characters resulting in True.
    Ranges separated only by unassigned characters are merged for efficiency.

    `func` is called once for every index of CATEGORIES, in increasing order.

    The result is a string of the form '[a-bc-d...]'. Range endpoints that
    are syntax inside a character class (backslash, '[', ']', '^', '-') are
    backslash-escaped so that they are matched literally. If `func` accepts
    no codepoint at all, a class that can never match is returned, because
    an empty '[]' is not a valid regular expression.
    """
    # A list of [start, end (accepted), end (accepted or unassigned)] lists
    ranges = []

    for i, cat in enumerate(CATEGORIES):
        if func(i):
            # If the last range can be extended, do so; else start a new one
            if ranges and ranges[-1][2] == i - 1:
                ranges[-1][1] = i
                ranges[-1][2] = i
            else:
                ranges.append([i, i, i])
        elif cat == 'Cn':
            # If the last range can be extended, do so. Only the tentative
            # end moves: the emitted range still ends on an accepted
            # character, so trailing unassigned codepoints are never included.
            if ranges and ranges[-1][2] == i - 1:
                ranges[-1][2] = i

    if not ranges:
        # Nothing was accepted: "neither whitespace nor non-whitespace"
        # is a well-formed class that matches no character.
        return r'[^\s\S]'

    def class_char(codepoint):
        # Backslash-escape the characters that are syntax inside [...]
        char = chr(codepoint)
        return '\\' + char if char in '\\]^-[' else char

    return '[%s]' % ''.join(
        class_char(r[0]) + '-' + class_char(r[1]) for r in ranges
    )
|
|
|
|
|
|
|
|
|
2015-07-07 20:00:24 +00:00
|
|
|
def cache_regex_from_func(filename, func):
    """
    Generates a regex from a function that accepts a single unicode character,
    and caches it in the data path at filename.

    `func` is passed straight to func_to_regex; the resulting character
    class is written to DATA_PATH / filename, replacing any existing file.
    """
    # The character class contains arbitrary non-ASCII characters, so the
    # encoding is fixed explicitly instead of depending on the locale.
    with (DATA_PATH / filename).open(mode='w', encoding='utf-8') as out:
        out.write(func_to_regex(func))
|
2015-07-07 20:00:24 +00:00
|
|
|
|
|
|
|
|
2015-07-08 19:29:31 +00:00
|
|
|
def _is_emoji_codepoint(i):
    """
    Tell whether the numerical codepoint `i` is (probably) an emoji.

    A codepoint qualifies when the ftfy chardata table marks it with class
    '3' (Unicode 'So', future-proofed by ftfy), it lies at or above U+2600
    (which leaves out symbols such as © and ™), and it is not the
    replacement character U+FFFD.
    """
    is_other_symbol = chardata.CHAR_CLASS_STRING[i] == '3'
    if not is_other_symbol:
        return False
    return not (i < 0x2600 or i == 0xfffd)
|
2015-07-07 18:50:56 +00:00
|
|
|
|
2015-07-07 19:23:15 +00:00
|
|
|
|
2015-07-08 19:29:31 +00:00
|
|
|
def _is_non_punct_codepoint(i):
    """
    Tell whether the numerical codepoint `i` falls outside all of these
    major Unicode classes:

    - P: punctuation
    - S: symbols
    - Z: separators
    - C: control characters

    Symbols, and therefore emoji, are treated as punctuation here; callers
    that want to accept emoji have to add them separately.
    """
    major_class = CATEGORIES[i][0]
    return major_class not in ('P', 'S', 'Z', 'C')
|
2015-07-07 18:38:21 +00:00
|
|
|
|
2015-07-07 19:23:15 +00:00
|
|
|
|
2015-07-08 19:29:31 +00:00
|
|
|
def _is_combining_mark_codepoint(i):
    """
    Tell whether the numerical codepoint `i` is a combining mark, i.e. its
    Unicode category belongs to the major class 'M'.
    """
    return CATEGORIES[i].startswith('M')
|
2015-07-07 18:38:21 +00:00
|
|
|
|
2015-07-07 19:23:15 +00:00
|
|
|
|
2015-07-07 18:38:21 +00:00
|
|
|
if __name__ == '__main__':
    # Regenerate each cached character-class file from its predicate.
    for cache_name, predicate in (
        ('emoji.txt', _is_emoji_codepoint),
        ('non_punct.txt', _is_non_punct_codepoint),
        ('combining_mark.txt', _is_combining_mark_codepoint),
    ):
        cache_regex_from_func(cache_name, predicate)
|