fixed spacing

Former-commit-id: ae4699029d
This commit is contained in:
Joshua Chin 2015-07-07 15:23:15 -04:00
parent cb4e444723
commit a87d84b796
2 changed files with 11 additions and 0 deletions

View File

@ -4,8 +4,10 @@ from ftfy import chardata
import pathlib import pathlib
from pkg_resources import resource_filename from pkg_resources import resource_filename
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data')) DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
def _emoji_char_class(): def _emoji_char_class():
""" """
Build a regex for emoji substitution. First we create a regex character set Build a regex for emoji substitution. First we create a regex character set
@ -28,6 +30,7 @@ def _emoji_char_class():
with emoji_file.open(mode='w') as file: with emoji_file.open(mode='w') as file:
file.write(out) file.write(out)
def _non_punct_class(): def _non_punct_class():
""" """
Builds a regex that matches anything that is not one of the following Builds a regex that matches anything that is not one of the following
@ -46,6 +49,7 @@ def _non_punct_class():
with non_punct_file.open(mode='w') as file: with non_punct_file.open(mode='w') as file:
file.write(out) file.write(out)
def _combining_mark_class(): def _combining_mark_class():
""" """
Builds a regex that matches anything that is a combining mark Builds a regex that matches anything that is a combining mark
@ -56,6 +60,7 @@ def _combining_mark_class():
with combining_mark_file.open(mode='w') as file: with combining_mark_file.open(mode='w') as file:
file.write(out) file.write(out)
def func_to_regex(accept): def func_to_regex(accept):
""" """
Converts a function that accepts a single unicode character into a regex. Converts a function that accepts a single unicode character into a regex.
@ -85,6 +90,7 @@ def func_to_regex(accept):
return '[%s]' % ''.join(ranges) return '[%s]' % ''.join(ranges)
if __name__ == '__main__': if __name__ == '__main__':
_combining_mark_class() _combining_mark_class()
_non_punct_class() _non_punct_class()

View File

@ -14,6 +14,7 @@ DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
CACHE_SIZE = 100000 CACHE_SIZE = 100000
def _emoji_char_class(): def _emoji_char_class():
""" """
Build a regex for emoji substitution. First we create a regex character set Build a regex for emoji substitution. First we create a regex character set
@ -25,6 +26,7 @@ def _emoji_char_class():
with non_punct_file.open() as file: with non_punct_file.open() as file:
return file.read() return file.read()
def _non_punct_class(): def _non_punct_class():
""" """
Builds a regex that matches anything that is not one of the following Builds a regex that matches anything that is not one of the following
@ -40,6 +42,7 @@ def _non_punct_class():
with non_punct_file.open() as file: with non_punct_file.open() as file:
return file.read() return file.read()
def _combining_mark_class(): def _combining_mark_class():
""" """
Builds a regex that matches anything that is a combining mark Builds a regex that matches anything that is a combining mark
@ -55,6 +58,7 @@ NON_PUNCT_RANGE = _non_punct_class()
TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE)) TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
def simple_tokenize(text): def simple_tokenize(text):
""" """
A simple tokenizer that can be applied to most languages. A simple tokenizer that can be applied to most languages.
@ -71,6 +75,7 @@ def simple_tokenize(text):
""" """
return [token.casefold() for token in TOKEN_RE.findall(text)] return [token.casefold() for token in TOKEN_RE.findall(text)]
def tokenize(text, lang): def tokenize(text, lang):
""" """
Tokenize this text in a way that's straightforward but appropriate for Tokenize this text in a way that's straightforward but appropriate for