2015-07-07 18:38:21 +00:00
|
|
|
import argparse
|
|
|
|
import unicodedata
|
2015-07-07 19:22:04 +00:00
|
|
|
from ftfy import chardata
|
2015-07-07 18:50:56 +00:00
|
|
|
import pathlib
|
2015-07-07 19:22:04 +00:00
|
|
|
from pkg_resources import resource_filename
|
2015-07-07 18:50:56 +00:00
|
|
|
|
2015-07-07 19:23:15 +00:00
|
|
|
|
2015-07-07 18:50:56 +00:00
|
|
|
# Path to the 'data' resource directory of the installed 'wordfreq' package.
# The generator functions in this file write their output files under it.
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
|
2015-07-07 18:46:42 +00:00
|
|
|
|
2015-07-07 19:23:15 +00:00
|
|
|
|
2015-07-07 18:46:42 +00:00
|
|
|
def _emoji_char_class():
    """
    Build a regex for emoji substitution and write it to 'emoji.txt'.

    We create a regex character set (like "[a-cv-z]") matching characters we
    consider emoji: code points at or above U+2600, other than U+FFFD, whose
    entry in ftfy's `chardata.CHAR_CLASS_STRING` is '3'.

    The character set is written to DATA_PATH / 'emoji.txt'. Nothing is
    returned.
    """
    emoji_file = DATA_PATH / 'emoji.txt'

    def accept(c):
        x = ord(c)
        return chardata.CHAR_CLASS_STRING[x] == '3' and \
            x >= 0x2600 and x != 0xfffd

    # Build the regex before opening the file, so that a failure while
    # scanning the code points does not leave a truncated, empty file behind.
    out = func_to_regex(accept)

    # The output is non-ASCII, so encode it explicitly instead of relying on
    # the locale's default encoding (which may be unable to represent it).
    with emoji_file.open(mode='w', encoding='utf-8') as file:
        file.write(out)
|
2015-07-07 18:50:56 +00:00
|
|
|
|
2015-07-07 19:23:15 +00:00
|
|
|
|
2015-07-07 18:50:56 +00:00
|
|
|
def _non_punct_class():
    """
    Builds a regex that matches anything that is not one of the following
    classes:
    - P: punctuation
    - S: symbols
    - Z: separators
    - C: control characters
    This will classify symbols, including emoji, as punctuation; callers that
    want to treat emoji separately should filter them out first.

    The character set is written to DATA_PATH / 'non_punct.txt'. Nothing is
    returned.
    """
    non_punct_file = DATA_PATH / 'non_punct.txt'

    # The first letter of a Unicode general category is its major class.
    out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC')

    # The output is non-ASCII, so encode it explicitly instead of relying on
    # the locale's default encoding (which may be unable to represent it).
    with non_punct_file.open(mode='w', encoding='utf-8') as file:
        file.write(out)
|
|
|
|
|
2015-07-07 19:23:15 +00:00
|
|
|
|
2015-07-07 18:50:56 +00:00
|
|
|
def _combining_mark_class():
    """
    Builds a regex that matches anything that is a combining mark, that is,
    any character whose Unicode general category starts with 'M'.

    The character set is written to DATA_PATH / 'combining_mark.txt'. Nothing
    is returned.
    """
    combining_mark_file = DATA_PATH / 'combining_mark.txt'
    out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M')

    # The output is non-ASCII, so encode it explicitly instead of relying on
    # the locale's default encoding (which may be unable to represent it).
    with combining_mark_file.open(mode='w', encoding='utf-8') as file:
        file.write(out)
|
2015-07-07 18:38:21 +00:00
|
|
|
|
2015-07-07 19:23:15 +00:00
|
|
|
|
2015-07-07 18:38:21 +00:00
|
|
|
def func_to_regex(accept):
    """
    Converts a function that accepts a single unicode character into a regex.
    Unassigned unicode characters are treated like their neighbors.

    `accept` is called once for every code point from U+0000 to U+10FFFF with
    a one-character string, and should return a true value for characters the
    regex should match.

    Returns a string of the form '[a-cv-z]': a character set made of
    'first-last' ranges. Every range is written with both endpoints, even when
    it covers a single character. Range endpoints are not escaped.
    """
    last_codepoint = 0x10FFFF
    ranges = []

    # `start` is the first character of the run currently being collected
    # (accepted or unassigned characters); `has_accepted` records whether the
    # run contains at least one accepted character, because a run of only
    # unassigned characters must not be emitted.
    start = None
    has_accepted = False
    for x in range(last_codepoint + 1):
        c = chr(x)

        if accept(c):
            has_accepted = True
            if start is None:
                start = c
        elif unicodedata.category(c) == 'Cn':
            # Unassigned: may begin or extend a run, but doesn't justify one.
            if start is None:
                start = c
        elif start is not None:
            # An assigned, rejected character ends the current run, which
            # therefore stops at the previous code point.
            if has_accepted:
                ranges.append('-'.join([start, chr(x - 1)]))
                has_accepted = False
            start = None

    # A run still open after the loop extends through the very last code
    # point, so it must end at U+10FFFF itself, not one character earlier.
    if has_accepted:
        ranges.append('-'.join([start, chr(last_codepoint)]))

    return '[%s]' % ''.join(ranges)
|
|
|
|
|
2015-07-07 19:23:15 +00:00
|
|
|
|
2015-07-07 18:38:21 +00:00
|
|
|
if __name__ == '__main__':
    # Regenerate each character-class data file, one generator at a time.
    _generators = (
        _combining_mark_class,
        _non_punct_class,
        _emoji_char_class,
    )
    for _generate in _generators:
        _generate()
|