From 5510fce675c8008ddd28b3070557b5669ab27b5e Mon Sep 17 00:00:00 2001 From: Joshua Chin Date: Tue, 7 Jul 2015 15:22:04 -0400 Subject: [PATCH] fixed gen_regex --- scripts/gen_regex.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py index 5340933..fb94f17 100644 --- a/scripts/gen_regex.py +++ b/scripts/gen_regex.py @@ -1,7 +1,8 @@ import argparse import unicodedata -import chardata +from ftfy import chardata import pathlib +from pkg_resources import resource_filename DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data')) @@ -38,7 +39,7 @@ def _non_punct_class(): This will classify symbols, including emoji, as punctuation; callers that want to treat emoji separately should filter them out first. """ - non_punct_file = DATA_PATH / 'non_punct.txt + non_punct_file = DATA_PATH / 'non_punct.txt' out = func_to_regex(lambda c: unicodedata.category(c)[0] not in 'PSZC') @@ -52,7 +53,7 @@ def _combining_mark_class(): combining_mark_file = DATA_PATH / 'combining_mark.txt' out = func_to_regex(lambda c: unicodedata.category(c)[0] == 'M') - with _combining_mark_file.open(mode='w') as file: + with combining_mark_file.open(mode='w') as file: file.write(out) def func_to_regex(accept): @@ -69,7 +70,7 @@ def func_to_regex(accept): if accept(c): has_accepted = True if start is None: - start = None + start = c elif unicodedata.category(c) == 'Cn': if start is None: start = c