diff --git a/scripts/gen_regex.py b/scripts/gen_regex.py
index fb94f17..1c5f6f5 100644
--- a/scripts/gen_regex.py
+++ b/scripts/gen_regex.py
@@ -4,8 +4,10 @@ from ftfy import chardata
 import pathlib
 from pkg_resources import resource_filename
 
+
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
+
 def _emoji_char_class():
     """
     Build a regex for emoji substitution. First we create a regex character set
@@ -28,6 +30,7 @@ def _emoji_char_class():
     with emoji_file.open(mode='w') as file:
         file.write(out)
 
+
 def _non_punct_class():
     """
     Builds a regex that matches anything that is not a one of the following
@@ -46,6 +49,7 @@ def _non_punct_class():
     with non_punct_file.open(mode='w') as file:
         file.write(out)
 
+
 def _combining_mark_class():
     """
     Builds a regex that matches anything that is a combining mark
@@ -56,6 +60,7 @@ def _combining_mark_class():
     with combining_mark_file.open(mode='w') as file:
         file.write(out)
 
+
 def func_to_regex(accept):
     """
     Converts a function that accepts a single unicode character into a regex.
@@ -85,6 +90,7 @@ def func_to_regex(accept):
 
     return '[%s]' % ''.join(ranges)
 
+
 if __name__ == '__main__':
     _combining_mark_class()
     _non_punct_class()
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index f7a1948..9cc1b8d 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -14,6 +14,7 @@ DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
 CACHE_SIZE = 100000
 
+
 def _emoji_char_class():
     """
     Build a regex for emoji substitution. First we create a regex character set
@@ -25,6 +26,7 @@ def _emoji_char_class():
     with non_punct_file.open() as file:
         return file.read()
 
+
 def _non_punct_class():
     """
     Builds a regex that matches anything that is not a one of the following
@@ -40,6 +42,7 @@ def _non_punct_class():
     with non_punct_file.open() as file:
         return file.read()
 
+
 def _combining_mark_class():
     """
     Builds a regex that matches anything that is a combining mark
@@ -55,6 +58,7 @@ NON_PUNCT_RANGE = _non_punct_class()
 
 TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
 
+
 def simple_tokenize(text):
     """
     A simple tokenizer that can be applied to most languages.
@@ -71,6 +75,7 @@ def simple_tokenize(text):
     """
     return [token.casefold() for token in TOKEN_RE.findall(text)]
 
+
 def tokenize(text, lang):
     """
     Tokenize this text in a way that's straightforward but appropriate for
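
For reference, here is a minimal sketch of the technique that func_to_regex's docstring describes: converting a predicate over single Unicode characters into a regex character class. Only the final return '[%s]' % ''.join(ranges) line appears in the diff; the scanning loop and the _format_range helper below are assumptions about how such ranges could be collected, not the library's actual implementation.

# Sketch: build a '[...]' character class from a per-character predicate by
# scanning every codepoint and collapsing consecutive accepted codepoints
# into lo-hi ranges. _format_range is a hypothetical helper, not wordfreq's.
import re
import unicodedata


def _format_range(lo, hi):
    # One escaped character, or an escaped "lo-hi" span.
    if lo == hi:
        return re.escape(chr(lo))
    return re.escape(chr(lo)) + '-' + re.escape(chr(hi))


def func_to_regex(accept):
    ranges = []
    start = None
    for codepoint in range(0x110000):   # every Unicode codepoint
        if accept(chr(codepoint)):
            if start is None:
                start = codepoint       # open a new run
        elif start is not None:
            ranges.append(_format_range(start, codepoint - 1))
            start = None                # close the run
    if start is not None:
        ranges.append(_format_range(start, 0x10FFFF))
    return '[%s]' % ''.join(ranges)


# e.g. the kind of class _combining_mark_class() generates:
# combining = func_to_regex(lambda c: unicodedata.category(c).startswith('M'))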
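And a small, self-contained illustration of how TOKEN_RE composes: it matches either a single emoji or a run of non-punctuation characters optionally joined by apostrophes, so contractions like "don't" stay whole. The two character classes below are toy stand-ins; the real EMOJI_RANGE and NON_PUNCT_RANGE are generated from Unicode data by scripts/gen_regex.py and read back from wordfreq/data.

# Toy stand-ins for the generated character classes (assumptions, not the
# real data-driven classes shipped in wordfreq/data):
import re

EMOJI_RANGE = '[\U0001F300-\U0001FAFF]'   # a rough emoji block
NON_PUNCT_RANGE = r'[\w]'                 # \w as a crude non-punctuation proxy

# Same composition as in wordfreq/__init__.py: one emoji, or word characters
# possibly joined by internal apostrophes.
TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))


def simple_tokenize(text):
    # Case-fold each match, as the diffed simple_tokenize does.
    return [token.casefold() for token in TOKEN_RE.findall(text)]


print(simple_tokenize("Don't Panic \U0001F600"))
# ["don't", 'panic', '😀']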