mirror of https://github.com/rspeer/wordfreq.git
parent cb4e444723
commit a87d84b796
@@ -4,8 +4,10 @@ from ftfy import chardata
 import pathlib
 from pkg_resources import resource_filename
 
+
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
+
 def _emoji_char_class():
     """
     Build a regex for emoji substitution. First we create a regex character set
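For context on the lines above: resource_filename locates the data directory inside the installed wordfreq package, and wrapping it in pathlib.Path lets file names be joined onto it with the / operator. A minimal sketch; the joined file name is hypothetical, not taken from the diff:

import pathlib
from pkg_resources import resource_filename

# DATA_PATH points at the package's bundled data directory.
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
emoji_file = DATA_PATH / 'emoji.txt'   # hypothetical file name for a generated class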
@@ -28,6 +30,7 @@ def _emoji_char_class():
     with emoji_file.open(mode='w') as file:
         file.write(out)
 
+
 def _non_punct_class():
     """
     Builds a regex that matches anything that is not a one of the following
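The hunk above shows the tail of the generate-and-write pattern: each builder renders a character class to regex source (out) and writes it into the data directory. A sketch of the whole shape, under assumptions: the real script draws its character set from ftfy's chardata tables, which this diff does not show, so unicodedata's 'So' (Symbol, other) category stands in below.

import unicodedata

def _emoji_char_class_sketch():
    # Stand-in predicate: the real selection logic is elided from this diff.
    accept = lambda c: unicodedata.category(c) == 'So'
    out = func_to_regex(accept)            # func_to_regex appears later in the diff
    emoji_file = DATA_PATH / 'emoji.txt'   # hypothetical file name
    with emoji_file.open(mode='w') as file:
        file.write(out)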
@@ -46,6 +49,7 @@ def _non_punct_class():
     with non_punct_file.open(mode='w') as file:
         file.write(out)
 
+
 def _combining_mark_class():
     """
     Builds a regex that matches anything that is a combining mark
@@ -56,6 +60,7 @@ def _combining_mark_class():
     with combining_mark_file.open(mode='w') as file:
         file.write(out)
 
+
 def func_to_regex(accept):
     """
     Converts a function that accepts a single unicode character into a regex.
@@ -85,6 +90,7 @@ def func_to_regex(accept):
 
     return '[%s]' % ''.join(ranges)
 
+
 if __name__ == '__main__':
     _combining_mark_class()
     _non_punct_class()
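The func_to_regex hunk shows only the final return; the body that builds ranges is elided. A plausible reconstruction, assuming the function scans every code point and merges accepted ones into contiguous ranges (which is what joining ranges into '[%s]' implies) — a sketch, not the author's actual code:

import re
import sys
import unicodedata

def func_to_regex_sketch(accept):
    # Walk every code point; open a range when accept() flips from False to
    # True, and close it when it flips back. Each range becomes an escaped
    # 'start-end' fragment of a regex character class.
    ranges = []
    start = None
    for codepoint in range(sys.maxunicode + 1):
        if accept(chr(codepoint)):
            if start is None:
                start = codepoint
        elif start is not None:
            ranges.append('%s-%s' % (re.escape(chr(start)),
                                     re.escape(chr(codepoint - 1))))
            start = None
    if start is not None:
        ranges.append('%s-%s' % (re.escape(chr(start)),
                                 re.escape(chr(sys.maxunicode))))
    return '[%s]' % ''.join(ranges)

# For example, the combining-mark class could plausibly be built as:
#   func_to_regex_sketch(lambda c: unicodedata.category(c).startswith('M'))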
@@ -14,6 +14,7 @@ DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
 CACHE_SIZE = 100000
 
+
 def _emoji_char_class():
     """
     Build a regex for emoji substitution. First we create a regex character set
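The hunk numbering resets above, so the commit touches a second file — evidently the runtime module that consumes the generated data (the diff page does not show file names). CACHE_SIZE appears without its use site; a hypothetical illustration of how such a constant is commonly wired up, not shown anywhere in this diff:

from functools import lru_cache

@lru_cache(maxsize=CACHE_SIZE)
def _cached_lookup(word, lang):   # hypothetical helper, not in the diff
    ...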
@@ -25,6 +26,7 @@ def _emoji_char_class():
     with non_punct_file.open() as file:
         return file.read()
 
+
 def _non_punct_class():
     """
     Builds a regex that matches anything that is not a one of the following
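Note the contrast with the first file: here each character class is read back rather than rebuilt, so the expensive scan over all code points happens once at build time, not at import time. The pattern in this hunk as a self-contained sketch; the filename parameter is illustrative:

def _read_cached_class(filename):
    # Read the regex source that the build script wrote into the
    # package's data directory.
    cached_file = DATA_PATH / filename
    with cached_file.open() as file:
        return file.read()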
@@ -40,6 +42,7 @@ def _non_punct_class():
     with non_punct_file.open() as file:
         return file.read()
 
+
 def _combining_mark_class():
     """
     Builds a regex that matches anything that is a combining mark
@@ -55,6 +58,7 @@ NON_PUNCT_RANGE = _non_punct_class()
 
 TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
 
+
 def simple_tokenize(text):
     """
     A simple tokenizer that can be applied to most languages.
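TOKEN_RE alternates between a single emoji-range character and a run of non-punctuation characters; the (?:'...)* tail lets apostrophes join runs, so "can't" survives as one token. A toy demonstration with deliberately small stand-in classes — the generated EMOJI_RANGE and NON_PUNCT_RANGE are far larger:

import re

EMOJI_RANGE = '[\U0001F600-\U0001F64F]'   # stand-in: just the Emoticons block
NON_PUNCT_RANGE = '[0-9A-Za-z]'           # stand-in: ASCII alphanumerics only
TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))

TOKEN_RE.findall("I can't 😀 believe it")
# ['I', "can't", '😀', 'believe', 'it']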
@@ -71,6 +75,7 @@ def simple_tokenize(text):
     """
     return [token.casefold() for token in TOKEN_RE.findall(text)]
 
+
 def tokenize(text, lang):
     """
     Tokenize this text in a way that's straightforward but appropriate for
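simple_tokenize lowercases each match with str.casefold, which folds more aggressively than lower() — German ß becomes ss, for instance. A usage sketch, assuming the real generated classes (the toy ASCII-only class above would not match ß):

simple_tokenize("I can't believe it's Straße")
# ['i', "can't", 'believe', "it's", 'strasse']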