mirror of https://github.com/rspeer/wordfreq.git
synced 2024-12-24 18:01:38 +00:00

commit f4cf46ab9c (parent 0721707d92)
Use the regex implementation of Unicode segmentation
Former-commit-id: 95998205ad

setup.py (2 lines changed)
@@ -26,7 +26,7 @@ classifiers = [
 current_dir = os.path.dirname(__file__)
 README_contents = open(os.path.join(current_dir, 'README.md')).read()
 doclines = README_contents.split("\n")
-dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes']
+dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes', 'regex >= 2015']
 if sys.version_info < (3, 4):
     dependencies.append('pathlib')
 
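For reference, here is a minimal sketch (not part of this commit) of what the new regex >= 2015 dependency provides over the standard library's re module: Unicode property classes such as \p{punct} and, under the V1 flag, character-class set operations such as subtraction, both of which the new tokenizer relies on.

# Minimal sketch, not part of this commit. The third-party 'regex' module
# understands Unicode properties and (with regex.V1) set operations inside
# character classes, which the standard 're' module does not support.
import regex

# "Any run of non-space characters that are not Unicode punctuation":
NON_PUNCT = regex.compile(r'[\S--\p{punct}]+', regex.V1)
print(NON_PUNCT.findall("Hello, world!"))   # should print ['Hello', 'world']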
@@ -95,13 +95,17 @@ def test_failed_cB_conversion():
 def test_tokenization():
     # We preserve apostrophes within words, so "can't" is a single word in the
     # data
-    eq_(tokenize("can't", 'en'), ["can't"])
+    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
+        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
 
+    # Certain punctuation does not inherently split a word.
+    eq_(tokenize("Anything is possible at zombo.com", 'en'),
+        ['anything', 'is', 'possible', 'at', 'zombo.com'])
+
+    # Splits occur after symbols, and at splitting punctuation such as hyphens.
     eq_(tokenize('😂test', 'en'), ['😂', 'test'])
 
-    # We do split at other punctuation, causing the word-combining rule to
-    # apply.
-    eq_(tokenize("can.t", 'en'), ['can', 't'])
+    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])
 
 
 def test_casefolding():
@@ -110,11 +114,11 @@ def test_casefolding():
 
 
 def test_phrase_freq():
-    plant = word_frequency("plan.t", 'en')
-    assert_greater(plant, 0)
+    ff = word_frequency("flip-flop", 'en')
+    assert_greater(ff, 0)
     assert_almost_equal(
-        1.0 / plant,
-        1.0 / word_frequency('plan', 'en') + 1.0 / word_frequency('t', 'en')
+        1.0 / ff,
+        1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
     )
 
 
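The updated test encodes the rule wordfreq applies to multi-token phrases: the reciprocal of the phrase frequency is the sum of the reciprocals of the token frequencies. A small illustrative sketch of that arithmetic follows; the numbers are invented, not real wordfreq data.

# Illustrative sketch only; the token frequencies here are made up.
def combined_frequency(token_freqs):
    # 1/f(phrase) = 1/f(token1) + 1/f(token2) + ..., as the test asserts.
    return 1.0 / sum(1.0 / f for f in token_freqs)

f_flip, f_flop = 2.0e-6, 1.5e-6                      # invented frequencies
f_phrase = combined_frequency([f_flip, f_flop])
assert abs(1.0 / f_phrase - (1.0 / f_flip + 1.0 / f_flop)) < 1e-9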
@@ -134,8 +138,8 @@ def test_not_really_random():
 def test_not_enough_ascii():
     random_ascii_words(lang='zh')
 
-def test_ar():
+
+def test_ar():
     # Remove tatweels
     eq_(
         tokenize('متــــــــعب', 'ar'),
@@ -152,3 +156,16 @@ def test_ar():
         tokenize('\ufefb', 'ar'),  # An Arabic ligature...
         ['\u0644\u0627']  # ...that is affected by NFKC normalization
     )
+
+
+def test_ideographic_fallback():
+    # Try tokenizing Chinese text -- it should remain stuck together.
+    eq_(tokenize('中国文字', 'zh'), ['中国文字'])
+
+    # When Japanese is tagged with the wrong language, it will be split
+    # at script boundaries.
+    ja_text = 'ひらがなカタカナromaji'
+    eq_(
+        tokenize(ja_text, 'en'),
+        ['ひらがな', 'カタカナ', 'romaji']
+    )
@@ -1,14 +1,13 @@
+from wordfreq.tokens import tokenize, simple_tokenize
 from pkg_resources import resource_filename
 from functools import lru_cache
 import langcodes
 import msgpack
-import re
 import gzip
 import itertools
 import pathlib
 import random
 import logging
-import unicodedata
 
 logger = logging.getLogger(__name__)
 
@@ -16,71 +15,10 @@ logger = logging.getLogger(__name__)
 CACHE_SIZE = 100000
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
-def load_range(filename):
-    """
-    Load a file from the data path.
-    """
-    with (DATA_PATH / filename).open() as file:
-        return file.read()
-
-EMOJI_RANGE = load_range('emoji.txt')
-NON_PUNCT_RANGE = load_range('non_punct.txt')
-COMBINING_MARK_RANGE = load_range('combining_mark.txt')
+# simple_tokenize is imported so that other things can import it from here.
+# Suppress the pyflakes warning.
+simple_tokenize = simple_tokenize
 
-COMBINING_MARK_RE = re.compile(COMBINING_MARK_RANGE)
-TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
-
-
-def simple_tokenize(text):
-    """
-    A simple tokenizer that can be applied to most languages.
-
-    It considers a word to be made of a sequence of 'token characters', an
-    overly inclusive range that includes letters, Han characters, emoji, and a
-    bunch of miscellaneous whatnot, but excludes most punctuation and
-    whitespace.
-
-    The single complication for the sake of English is that apostrophes are not
-    considered part of the token if they appear on the edge of the character
-    sequence, but they are if they appear internally. "cats'" is not a token,
-    but "cat's" is.
-    """
-    return [token.casefold() for token in TOKEN_RE.findall(text)]
-
-
-mecab_tokenize = None
-def tokenize(text, lang):
-    """
-    Tokenize this text in a way that's straightforward but appropriate for
-    the language.
-
-    So far, this means that Japanese is handled by mecab_tokenize, and
-    everything else is handled by simple_tokenize. Additionally, Arabic commas
-    and combining marks are removed.
-
-    Strings that are looked up in wordfreq will be run through this function
-    first, so that they can be expected to match the data.
-    """
-    if lang == 'ja':
-        global mecab_tokenize
-        if mecab_tokenize is None:
-            from wordfreq.mecab import mecab_tokenize
-        return mecab_tokenize(text)
-
-    if lang == 'ar':
-        text = standardize_arabic(text)
-
-    return simple_tokenize(text)
-
-
-def standardize_arabic(text):
-    """
-    Standardizes arabic text by removing combining marks and tatweels.
-    """
-    return unicodedata.normalize(
-        'NFKC',
-        COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
-    )
-
-
 def read_cBpack(filename):
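The `simple_tokenize = simple_tokenize` line above exists only so that the re-exported name counts as "used" and pyflakes stops flagging the import. A sketch of the idiom follows; the __all__ variant is a common alternative, not something this commit uses.

# Sketch of the re-export idiom above (the __all__ form is an alternative
# that this commit does not use).
from wordfreq.tokens import tokenize, simple_tokenize

# Re-binding the imported name marks it as used, silencing pyflakes:
simple_tokenize = simple_tokenize

# Alternative: declare the public names explicitly.
__all__ = ['tokenize', 'simple_tokenize']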
wordfreq/tokens.py (new file, 88 lines)
@@ -0,0 +1,88 @@
+import regex
+import unicodedata
+
+
+# Here's what the following regular expression is looking for:
+#
+# At the start, it looks for a character in the set [\S--\p{punct}]. \S
+# contains non-space characters, and then it subtracts the set of Unicode
+# punctuation characters from that set. This is slightly different from \w,
+# because it leaves symbols (such as emoji) as tokens.
+#
+# After it has found one such character, the rest of the token is (?:\B\S)*,
+# which continues to consume characters as long as the next character does not
+# cause a word break (\B) and is not a space (\S). The individual characters in
+# this portion can be punctuation, allowing tokens such as "can't" or
+# "google.com".
+#
+# As a complication, the rest of the token can match a glob of Han ideographs
+# (\p{IsIdeo}) and hiragana (\p{Script=Hiragana}). Chinese words are made of
+# Han ideographs (but we don't know how many). Japanese words are either made
+# of Han ideographs and hiragana (which will be matched by this expression), or
+# katakana (which will be matched by the standard Unicode rule).
+#
+# Without this special case for ideographs and hiragana, the standard Unicode
+# rule would put each character in its own token. This actually would be the
+# correct behavior for word-wrapping, but it's an ugly failure mode for NLP
+# tokenization.
+
+TOKEN_RE = regex.compile(r'[\S--\p{punct}](?:\B\S|[\p{IsIdeo}\p{Script=Hiragana}])*', regex.V1 | regex.WORD)
+ARABIC_MARK_RE = regex.compile(r'[[\p{Mn}&&\p{Block=Arabic}]\N{ARABIC TATWEEL}]', regex.V1)
+
+
+def simple_tokenize(text):
+    """
+    Tokenize the given text using a straightforward, Unicode-aware token
+    expression. It returns non-whitespace tokens that are split at the
+    word boundaries defined by Unicode Tech Report #29, as implemented
+    by the regex package, except that it leaves Chinese and Japanese
+    relatively untokenized.
+    """
+    text = unicodedata.normalize('NFKC', text)
+    return [token.casefold() for token in TOKEN_RE.findall(text)]
+
+
+def remove_arabic_marks(text):
+    """
+    Remove decorations from Arabic words:
+
+    - Combining marks of class Mn, which tend to represent non-essential
+      vowel markings.
+    - Tatweels, horizontal segments that are used to extend or justify a
+      word.
+    """
+    return ARABIC_MARK_RE.sub('', text)
+
+
+mecab_tokenize = None
+def tokenize(text, lang):
+    """
+    Tokenize this text in a way that's relatively simple but appropriate for
+    the language.
+
+    So far, this means:
+
+    - Chinese is presumed to already be tokenized. (Sorry. It's hard.)
+    - Japanese will be delegated to the external mecab-python module.
+    - Chinese or Japanese texts that aren't identified as the appropriate
+      language will only split on punctuation and script boundaries, giving
+      you untokenized globs of characters that probably represent many words.
+    - All other languages will be tokenized according to UTR #29.
+
+    Additionally, the text will be case-folded to lowercase, and text marked
+    as Arabic will have combining marks and tatweels removed.
+
+    Strings that are looked up in wordfreq will be run through this function
+    first, so that they can be expected to match the data.
+    """
+    if lang == 'ja':
+        global mecab_tokenize
+        if mecab_tokenize is None:
+            from wordfreq.mecab import mecab_tokenize
+        return mecab_tokenize(text)
+
+    if lang == 'ar':
+        text = remove_arabic_marks(text)
+
+    return simple_tokenize(text)
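Taken together with the tests above, the new module's public behavior looks like this; the calls and expected results below are the ones the updated tests assert.

# Usage sketch; expected results are taken from the tests in this commit.
from wordfreq.tokens import tokenize, simple_tokenize

tokenize("I don't split at apostrophes, you see.", 'en')
# -> ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']

tokenize('Anything is possible at zombo.com', 'en')
# -> ['anything', 'is', 'possible', 'at', 'zombo.com']

tokenize('中国文字', 'zh')                 # Chinese stays as one untokenized glob
# -> ['中国文字']

tokenize('ひらがなカタカナromaji', 'en')   # wrong language tag: split at script boundaries
# -> ['ひらがな', 'カタカナ', 'romaji']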