mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
fixed gen_regex
This commit is contained in:
parent
f83d31a357
commit
5510fce675
@@ -1,7 +1,8 @@
|
|||||||
import argparse
|
import argparse
|
||||||
import unicodedata
|
import unicodedata
|
||||||
import chardata
|
from ftfy import chardata
|
||||||
import pathlib
|
import pathlib
|
||||||
|
from pkg_resources import resource_filename
|
||||||
|
|
||||||
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
|
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
|
||||||
|
|
||||||
@@ -38,7 +39,7 @@ def _non_punct_class():
|
|||||||
This will classify symbols, including emoji, as punctuation; callers that
|
This will classify symbols, including emoji, as punctuation; callers that
|
||||||
want to treat emoji separately should filter them out first.
|
want to treat emoji separately should filter them out first.
|
||||||
"""
|
"""
|
||||||
non_punct_file = DATA_PATH / 'non_punct.txt
|
non_punct_file = DATA_PATH / 'non_punct.txt'
|
||||||
|
|
||||||
out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC')
|
out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC')
|
||||||
|
|
||||||
@@ -52,7 +53,7 @@ def _combining_mark_class():
|
|||||||
combining_mark_file = DATA_PATH / 'combining_mark.txt'
|
combining_mark_file = DATA_PATH / 'combining_mark.txt'
|
||||||
out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M')
|
out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M')
|
||||||
|
|
||||||
with _combining_mark_file.open(mode='w') as file:
|
with combining_mark_file.open(mode='w') as file:
|
||||||
file.write(out)
|
file.write(out)
|
||||||
|
|
||||||
def func_to_regex(accept):
|
def func_to_regex(accept):
|
||||||
@@ -69,7 +70,7 @@ def func_to_regex(accept):
|
|||||||
if accept(c):
|
if accept(c):
|
||||||
has_accepted = True
|
has_accepted = True
|
||||||
if start is None:
|
if start is None:
|
||||||
start = None
|
start = c
|
||||||
elif unicodedata.category(c) == 'Cn':
|
elif unicodedata.category(c) == 'Cn':
|
||||||
if start is None:
|
if start is None:
|
||||||
start = c
|
start = c
|
||||||
|
Loading…
Reference in New Issue
Block a user