Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 17:31:41 +00:00

Use the regex implementation of Unicode segmentation

parent 2b8089e2b1
commit 95998205ad

setup.py (2 lines changed)
@@ -26,7 +26,7 @@ classifiers = [
current_dir = os.path.dirname(__file__)
README_contents = open(os.path.join(current_dir, 'README.md')).read()
doclines = README_contents.split("\n")
-dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes']
+dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes', 'regex >= 2015']
if sys.version_info < (3, 4):
    dependencies.append('pathlib')
@@ -95,13 +95,17 @@ def test_failed_cB_conversion():

def test_tokenization():
    # We preserve apostrophes within words, so "can't" is a single word in the
    # data
    eq_(tokenize("can't", 'en'), ["can't"])
    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])

    # Certain punctuation does not inherently split a word.
    eq_(tokenize("Anything is possible at zombo.com", 'en'),
        ['anything', 'is', 'possible', 'at', 'zombo.com'])

    # Splits occur after symbols, and at splitting punctuation such as hyphens.
    eq_(tokenize('😂test', 'en'), ['😂', 'test'])

    # We do split at other punctuation, causing the word-combining rule to
    # apply.
    eq_(tokenize("can.t", 'en'), ['can', 't'])
    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])


def test_casefolding():
@@ -110,11 +114,11 @@ def test_casefolding():


def test_phrase_freq():
-    plant = word_frequency("plan.t", 'en')
-    assert_greater(plant, 0)
+    ff = word_frequency("flip-flop", 'en')
+    assert_greater(ff, 0)
    assert_almost_equal(
-        1.0 / plant,
-        1.0 / word_frequency('plan', 'en') + 1.0 / word_frequency('t', 'en')
+        1.0 / ff,
+        1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
    )
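For reference, the assertion above encodes the rule the tests expect for combining word frequencies into a phrase frequency: the reciprocal of the phrase frequency is the sum of the reciprocals of the word frequencies. A minimal sketch of that rule, using a hypothetical helper name (not part of this commit):

# Hypothetical helper, illustration only; not code from this diff.
def combined_frequency(word_freqs):
    # 1/f(phrase) = 1/f(w1) + 1/f(w2) + ..., as asserted in test_phrase_freq
    return 1.0 / sum(1.0 / f for f in word_freqs)

# combined_frequency([word_frequency('flip', 'en'), word_frequency('flop', 'en')])
# should come out close to word_frequency('flip-flop', 'en').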
@@ -134,8 +138,8 @@ def test_not_really_random():

def test_not_enough_ascii():
    random_ascii_words(lang='zh')

-def test_ar():

+def test_ar():
    # Remove tatweels
    eq_(
        tokenize('متــــــــعب', 'ar'),

@@ -152,3 +156,16 @@ def test_ar():
        tokenize('\ufefb', 'ar'),  # An Arabic ligature...
        ['\u0644\u0627']  # ...that is affected by NFKC normalization
    )
+
+
+def test_ideographic_fallback():
+    # Try tokenizing Chinese text -- it should remain stuck together.
+    eq_(tokenize('中国文字', 'zh'), ['中国文字'])
+
+    # When Japanese is tagged with the wrong language, it will be split
+    # at script boundaries.
+    ja_text = 'ひらがなカタカナromaji'
+    eq_(
+        tokenize(ja_text, 'en'),
+        ['ひらがな', 'カタカナ', 'romaji']
+    )
wordfreq/__init__.py

@@ -1,14 +1,13 @@
+from wordfreq.tokens import tokenize, simple_tokenize
from pkg_resources import resource_filename
from functools import lru_cache
import langcodes
import msgpack
import re
import gzip
import itertools
import pathlib
import random
import logging
import unicodedata

logger = logging.getLogger(__name__)
@@ -16,71 +15,10 @@ logger = logging.getLogger(__name__)
CACHE_SIZE = 100000
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))

-def load_range(filename):
-    """
-    Load a file from the data path.
-    """
-    with (DATA_PATH / filename).open() as file:
-        return file.read()
-
-EMOJI_RANGE = load_range('emoji.txt')
-NON_PUNCT_RANGE = load_range('non_punct.txt')
-COMBINING_MARK_RANGE = load_range('combining_mark.txt')
-
-COMBINING_MARK_RE = re.compile(COMBINING_MARK_RANGE)
-TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
-
-
-def simple_tokenize(text):
-    """
-    A simple tokenizer that can be applied to most languages.
-
-    It considers a word to be made of a sequence of 'token characters', an
-    overly inclusive range that includes letters, Han characters, emoji, and a
-    bunch of miscellaneous whatnot, but excludes most punctuation and
-    whitespace.
-
-    The single complication for the sake of English is that apostrophes are not
-    considered part of the token if they appear on the edge of the character
-    sequence, but they are if they appear internally. "cats'" is not a token,
-    but "cat's" is.
-    """
-    return [token.casefold() for token in TOKEN_RE.findall(text)]
-
-
-mecab_tokenize = None
-def tokenize(text, lang):
-    """
-    Tokenize this text in a way that's straightforward but appropriate for
-    the language.
-
-    So far, this means that Japanese is handled by mecab_tokenize, and
-    everything else is handled by simple_tokenize. Additionally, Arabic commas
-    and combining marks are removed.
-
-    Strings that are looked up in wordfreq will be run through this function
-    first, so that they can be expected to match the data.
-    """
-    if lang == 'ja':
-        global mecab_tokenize
-        if mecab_tokenize is None:
-            from wordfreq.mecab import mecab_tokenize
-        return mecab_tokenize(text)
-
-    if lang == 'ar':
-        text = standardize_arabic(text)
-
-    return simple_tokenize(text)
-
-
-def standardize_arabic(text):
-    """
-    Standardizes arabic text by removing combining marks and tatweels.
-    """
-    return unicodedata.normalize(
-        'NFKC',
-        COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
-    )
+# simple_tokenize is imported so that other things can import it from here.
+# Suppress the pyflakes warning.
+simple_tokenize = simple_tokenize


def read_cBpack(filename):
wordfreq/tokens.py (new file, 88 lines)

@@ -0,0 +1,88 @@
import regex
import unicodedata


# Here's what the following regular expression is looking for:
#
# At the start, it looks for a character in the set [\S--\p{punct}]. \S
# contains non-space characters, and then it subtracts the set of Unicode
# punctuation characters from that set. This is slightly different from \w,
# because it leaves symbols (such as emoji) as tokens.
#
# After it has found one such character, the rest of the token is (?:\B\S)*,
# which continues to consume characters as long as the next character does not
# cause a word break (\B) and is not a space (\S). The individual characters in
# this portion can be punctuation, allowing tokens such as "can't" or
# "google.com".
#
# As a complication, the rest of the token can match a glob of Han ideographs
# (\p{IsIdeo}) and hiragana (\p{Script=Hiragana}). Chinese words are made of
# Han ideographs (but we don't know how many). Japanese words are either made
# of Han ideographs and hiragana (which will be matched by this expression), or
# katakana (which will be matched by the standard Unicode rule).
#
# Without this special case for ideographs and hiragana, the standard Unicode
# rule would put each character in its own token. This actually would be the
# correct behavior for word-wrapping, but it's an ugly failure mode for NLP
# tokenization.

TOKEN_RE = regex.compile(r'[\S--\p{punct}](?:\B\S|[\p{IsIdeo}\p{Script=Hiragana}])*', regex.V1 | regex.WORD)
ARABIC_MARK_RE = regex.compile(r'[[\p{Mn}&&\p{Block=Arabic}]\N{ARABIC TATWEEL}]', regex.V1)
def simple_tokenize(text):
    """
    Tokenize the given text using a straightforward, Unicode-aware token
    expression. It returns non-whitespace tokens that are split at the
    word boundaries defined by Unicode Tech Report #29, as implemented
    by the regex package, except that it leaves Chinese and Japanese
    relatively untokenized.
    """
    text = unicodedata.normalize('NFKC', text)
    return [token.casefold() for token in TOKEN_RE.findall(text)]
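As a quick illustration of the behavior described above (the expected outputs mirror the tests added in this commit; the snippet itself is not part of the diff):

# Illustration only, not code from this commit; outputs follow the new tests.
from wordfreq.tokens import simple_tokenize

print(simple_tokenize("Can't stop at zombo.com"))
# -> ["can't", 'stop', 'at', 'zombo.com']   (internal apostrophes and dots are kept)
print(simple_tokenize('😂test'))
# -> ['😂', 'test']   (symbols such as emoji become their own tokens)
print(simple_tokenize('flip-flop'))
# -> ['flip', 'flop']   (hyphens are splitting punctuation)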
def remove_arabic_marks(text):
    """
    Remove decorations from Arabic words:

    - Combining marks of class Mn, which tend to represent non-essential
      vowel markings.
    - Tatweels, horizontal segments that are used to extend or justify a
      word.
    """
    return ARABIC_MARK_RE.sub('', text)
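For illustration (not part of the diff), this is the decoration-stripping that the Arabic test above relies on; the stretched word is taken from that test:

# Illustration only, not code from this commit.
from wordfreq.tokens import remove_arabic_marks

stretched = 'متــــــــعب'   # padded with ARABIC TATWEEL (U+0640) characters
print(remove_arabic_marks(stretched))
# -> 'متعب'   (tatweels removed; Mn combining marks in the Arabic block would be removed too)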
mecab_tokenize = None
def tokenize(text, lang):
    """
    Tokenize this text in a way that's relatively simple but appropriate for
    the language.

    So far, this means:

    - Chinese is presumed to already be tokenized. (Sorry. It's hard.)
    - Japanese will be delegated to the external mecab-python module.
    - Chinese or Japanese texts that aren't identified as the appropriate
      language will only split on punctuation and script boundaries, giving
      you untokenized globs of characters that probably represent many words.
    - All other languages will be tokenized according to UTR #29.

    Additionally, the text will be case-folded to lowercase, and text marked
    as Arabic will have combining marks and tatweels removed.

    Strings that are looked up in wordfreq will be run through this function
    first, so that they can be expected to match the data.
    """
    if lang == 'ja':
        global mecab_tokenize
        if mecab_tokenize is None:
            from wordfreq.mecab import mecab_tokenize
        return mecab_tokenize(text)

    if lang == 'ar':
        text = remove_arabic_marks(text)

    return simple_tokenize(text)
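A short usage sketch of the new tokenize() entry point (illustration only, not part of the diff; expected values are taken from the tests added in this commit):

# Illustration only, not code from this commit.
from wordfreq.tokens import tokenize

print(tokenize("I don't split at apostrophes, you see.", 'en'))
# -> ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']
print(tokenize('中国文字', 'zh'))
# -> ['中国文字']   (Chinese stays as one untokenized glob)
print(tokenize('ひらがなカタカナromaji', 'en'))
# -> ['ひらがな', 'カタカナ', 'romaji']   (Japanese tagged with the wrong language splits at script boundaries)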