wordfreq/scripts/gen_regex.py

98 lines
2.8 KiB
Python
Raw Normal View History

2015-07-07 18:38:21 +00:00
import argparse
import unicodedata
2015-07-07 19:22:04 +00:00
from ftfy import chardata
2015-07-07 18:50:56 +00:00
import pathlib
2015-07-07 19:22:04 +00:00
from pkg_resources import resource_filename
2015-07-07 18:50:56 +00:00
2015-07-07 19:23:15 +00:00
2015-07-07 18:50:56 +00:00
# Location of the 'data' resource inside the 'wordfreq' package. The
# generator functions in this script write their output files here.
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
2015-07-07 18:46:42 +00:00
2015-07-07 19:23:15 +00:00
2015-07-07 18:46:42 +00:00
def _emoji_char_class():
    """
    Build a regex character set (like "[a-cv-z]") matching the characters we
    consider emoji, and write it to 'emoji.txt' under DATA_PATH.

    A code point is included when ftfy's `chardata.CHAR_CLASS_STRING` marks
    it with class '3', it is at or above U+2600, and it is not U+FFFD.
    Consecutive code points are collapsed into "first-last" ranges.

    NOTE(review): this function only writes the character set itself. Any
    larger pattern (for example, one such character followed by spaces and
    identical characters) has to be assembled by whatever loads the file.
    """
    emoji_file = DATA_PATH / 'emoji.txt'

    ranges = []
    for i, c in enumerate(chardata.CHAR_CLASS_STRING):
        # c represents the character class (3 corresponds to emoji)
        if c == '3' and i >= 0x2600 and i != 0xfffd:
            if ranges and i == ranges[-1][1] + 1:
                # Directly follows the previous code point: extend that range.
                ranges[-1][1] = i
            else:
                ranges.append([i, i])

    out = '[%s]' % ''.join(chr(a) + '-' + chr(b) for a, b in ranges)

    # Every range endpoint is non-ASCII, so pin the encoding instead of
    # relying on the platform default, which may be unable to encode them.
    with emoji_file.open(mode='w', encoding='utf-8') as file:
        file.write(out)
2015-07-07 19:23:15 +00:00
2015-07-07 18:50:56 +00:00
def _non_punct_class():
    """
    Build a regex character set that matches anything that is not in one of
    the following Unicode general-category classes, and write it to
    'non_punct.txt' under DATA_PATH:

    - P: punctuation
    - S: symbols
    - Z: separators
    - C: control characters

    This will classify symbols, including emoji, as punctuation; callers that
    want to treat emoji separately should filter them out first.
    """
    non_punct_file = DATA_PATH / 'non_punct.txt'

    # Only the first letter of the category (the major class) matters here.
    out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC')

    # The character set contains non-ASCII range endpoints, so pin the
    # encoding instead of relying on the platform default.
    with non_punct_file.open(mode='w', encoding='utf-8') as file:
        file.write(out)
2015-07-07 19:23:15 +00:00
2015-07-07 18:50:56 +00:00
def _combining_mark_class():
    """
    Build a regex character set that matches anything that is a combining
    mark (Unicode major category 'M'), and write it to 'combining_mark.txt'
    under DATA_PATH.
    """
    combining_mark_file = DATA_PATH / 'combining_mark.txt'

    out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M')

    # Combining marks are all non-ASCII, so pin the encoding instead of
    # relying on the platform default.
    with combining_mark_file.open(mode='w', encoding='utf-8') as file:
        file.write(out)
2015-07-07 18:38:21 +00:00
2015-07-07 19:23:15 +00:00
2015-07-07 18:38:21 +00:00
def func_to_regex(accept):
    """
    Convert a function that accepts a single unicode character into a regex
    character set.

    Every code point from U+0000 through U+10FFFF is tested in order, and
    each maximal run of accepted characters is emitted as a "first-last"
    range inside square brackets, e.g. "[a-cx-z]".

    Unassigned unicode characters (category 'Cn') that `accept` rejects are
    treated like their neighbors: they never cause a range to be emitted on
    their own, but they do not break a run either, so an emitted range may
    begin or end on unassigned code points adjacent to accepted ones.

    Range endpoints are written as raw characters, with no regex escaping.

    `accept` is called with a one-character string and should return a true
    value for characters the set should match. Returns the character set as
    a string, which is "[]" when nothing was accepted.
    """
    last_codepoint = 0x10ffff

    ranges = []
    start = None
    has_accepted = False
    for x in range(last_codepoint + 1):
        c = chr(x)

        if accept(c):
            has_accepted = True
            if start is None:
                start = c
        elif unicodedata.category(c) == 'Cn':
            # Unassigned: tentatively open a run, or silently extend the
            # current one. It is only emitted if something in it is accepted.
            if start is None:
                start = c
        elif start is not None:
            # A rejected, assigned character closes the run, which ended on
            # the previous code point.
            if has_accepted:
                ranges.append('-'.join([start, chr(x - 1)]))
                has_accepted = False
            start = None

    # A run still open after the loop extends through the very last code
    # point, so it must end at U+10FFFF itself rather than one before it.
    if has_accepted and start is not None:
        ranges.append('-'.join([start, chr(last_codepoint)]))

    return '[%s]' % ''.join(ranges)
2015-07-07 19:23:15 +00:00
2015-07-07 18:38:21 +00:00
if __name__ == '__main__':
    # When run as a script, regenerate every character-class data file.
    for build in (_combining_mark_class, _non_punct_class, _emoji_char_class):
        build()