mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
parent
a72b4abb48
commit
7e9338f87e
@ -8,25 +8,31 @@ from pkg_resources import resource_filename
|
|||||||
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
|
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
|
||||||
|
|
||||||
|
|
||||||
|
def cache_regex_from_func(filename, func):
|
||||||
|
"""
|
||||||
|
Generates a regex from a function that accepts a single unicode character,
|
||||||
|
and caches it in the data path at filename.
|
||||||
|
"""
|
||||||
|
with (DATA_PATH / filename).open(mode='w') as file:
|
||||||
|
file.write(func_to_regex(func))
|
||||||
|
|
||||||
|
|
||||||
def _emoji_char_class():
|
def _emoji_char_class():
|
||||||
"""
|
"""
|
||||||
Build a regex for emoji substitution. We create a regex character set
|
Build a regex for emoji substitution. We create a regex character set
|
||||||
(like "[a-cv-z]") matching characters we consider emoji.
|
(like "[a-cv-z]") matching characters we consider emoji.
|
||||||
"""
|
"""
|
||||||
emoji_file = DATA_PATH / 'emoji.txt'
|
cache_regex_from_func(
|
||||||
|
'emoji.txt',
|
||||||
def accept(c):
|
lambda c:
|
||||||
x = ord(c)
|
chardata.CHAR_CLASS_STRING[ord(c)] == '3' and
|
||||||
return chardata.CHAR_CLASS_STRING[x] == '3' and \
|
c >= '\u2600' and c != '\ufffd'
|
||||||
x >= 0x2600 and x != 0xfffd
|
)
|
||||||
|
|
||||||
with (DATA_PATH / 'emoji.txt').open(mode='w') as file:
|
|
||||||
file.write(func_to_regex(accept))
|
|
||||||
|
|
||||||
|
|
||||||
def _non_punct_class():
|
def _non_punct_class():
|
||||||
"""
|
"""
|
||||||
Builds a regex that matches anything that is not one of the following
|
Builds a regex that matches anything that is not one of the following
|
||||||
classes:
|
classes:
|
||||||
- P: punctuation
|
- P: punctuation
|
||||||
- S: symbols
|
- S: symbols
|
||||||
@ -35,23 +41,20 @@ def _non_punct_class():
|
|||||||
This will classify symbols, including emoji, as punctuation; callers that
|
This will classify symbols, including emoji, as punctuation; callers that
|
||||||
want to treat emoji separately should filter them out first.
|
want to treat emoji separately should filter them out first.
|
||||||
"""
|
"""
|
||||||
non_punct_file = DATA_PATH / 'non_punct.txt'
|
cache_regex_from_func(
|
||||||
|
'non_punct.txt',
|
||||||
out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC')
|
lambda c: unicodedata.category(c)[0] not in 'PSZC'
|
||||||
|
)
|
||||||
with non_punct_file.open(mode='w') as file:
|
|
||||||
file.write(out)
|
|
||||||
|
|
||||||
|
|
||||||
def _combining_mark_class():
|
def _combining_mark_class():
|
||||||
"""
|
"""
|
||||||
Builds a regex that matches anything that is a combining mark
|
Builds a regex that matches anything that is a combining mark
|
||||||
"""
|
"""
|
||||||
combining_mark_file = DATA_PATH / 'combining_mark.txt'
|
cache_regex_from_func(
|
||||||
out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M')
|
'combining_mark.txt',
|
||||||
|
lambda c: unicodedata.category(c)[0] == 'M'
|
||||||
with combining_mark_file.open(mode='w') as file:
|
)
|
||||||
file.write(out)
|
|
||||||
|
|
||||||
|
|
||||||
def func_to_regex(accept):
|
def func_to_regex(accept):
|
||||||
|
Loading…
Reference in New Issue
Block a user