Use the regex implementation of Unicode segmentation

Former-commit-id: 95998205ad
Robyn Speer 2015-08-24 16:24:49 -04:00
parent e15fc14b8e
commit 8795525372
4 changed files with 119 additions and 76 deletions

setup.py

@@ -26,7 +26,7 @@ classifiers = [
 current_dir = os.path.dirname(__file__)
 README_contents = open(os.path.join(current_dir, 'README.md')).read()
 doclines = README_contents.split("\n")
-dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes']
+dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes', 'regex >= 2015']
 if sys.version_info < (3, 4):
     dependencies.append('pathlib')
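
The new requirement is the third-party regex package rather than the standard-library re. A minimal sketch of the two features the new tokenizer relies on (character-class set operations under the V1 flag, and Unicode TR #29 word boundaries via the WORD flag); the word_re pattern here is only a simplified stand-in for the TOKEN_RE defined in wordfreq/tokens.py below, and exact behavior can vary with the installed regex version:

import regex

# Set subtraction ('--') inside a character class needs regex's V1 behavior;
# the standard re module has no set operations and rejects \p{...} escapes.
non_punct = regex.compile(r'[\S--\p{punct}]', regex.V1)
print(non_punct.findall('a.b c'))      # ['a', 'b', 'c'] -- punctuation and spaces dropped

# The WORD flag makes \b and \B follow Unicode TR #29 word boundaries, so an
# apostrophe between letters does not end a token.
word_re = regex.compile(r"\w+(?:\B\S)*", regex.V1 | regex.WORD)
print(word_re.findall("can't stop"))   # ["can't", 'stop']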


@@ -95,13 +95,17 @@ def test_failed_cB_conversion():
 def test_tokenization():
     # We preserve apostrophes within words, so "can't" is a single word in the
     # data
-    eq_(tokenize("can't", 'en'), ["can't"])
+    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
+        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
+    # Certain punctuation does not inherently split a word.
+    eq_(tokenize("Anything is possible at zombo.com", 'en'),
+        ['anything', 'is', 'possible', 'at', 'zombo.com'])
+    # Splits occur after symbols, and at splitting punctuation such as hyphens.
+    eq_(tokenize('😂test', 'en'), ['😂', 'test'])
-    # We do split at other punctuation, causing the word-combining rule to
-    # apply.
-    eq_(tokenize("can.t", 'en'), ['can', 't'])
+    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])

 def test_casefolding():
@@ -110,11 +114,11 @@ def test_casefolding():
 def test_phrase_freq():
-    plant = word_frequency("plan.t", 'en')
-    assert_greater(plant, 0)
+    ff = word_frequency("flip-flop", 'en')
+    assert_greater(ff, 0)
     assert_almost_equal(
-        1.0 / plant,
-        1.0 / word_frequency('plan', 'en') + 1.0 / word_frequency('t', 'en')
+        1.0 / ff,
+        1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
     )
@@ -134,8 +138,8 @@ def test_not_really_random():
 def test_not_enough_ascii():
     random_ascii_words(lang='zh')

 def test_ar():
     # Remove tatweels
     eq_(
         tokenize('متــــــــعب', 'ar'),
@@ -152,3 +156,16 @@ def test_ar():
         tokenize('\ufefb', 'ar'),  # An Arabic ligature...
         ['\u0644\u0627']  # ...that is affected by NFKC normalization
     )
+
+
+def test_ideographic_fallback():
+    # Try tokenizing Chinese text -- it should remain stuck together.
+    eq_(tokenize('中国文字', 'zh'), ['中国文字'])
+
+    # When Japanese is tagged with the wrong language, it will be split
+    # at script boundaries.
+    ja_text = 'ひらがなカタカナromaji'
+    eq_(
+        tokenize(ja_text, 'en'),
+        ['ひらがな', 'カタカナ', 'romaji']
+    )
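
For reference, eq_, assert_greater, and assert_almost_equal in these tests are nose-style assertion helpers; a test module like this would need imports along these lines (the actual import block is not shown in this diff, so treat this as an assumption about the test file):

from nose.tools import eq_, assert_greater, assert_almost_equal
from wordfreq import tokenize, word_frequency, random_ascii_words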

wordfreq/__init__.py

@@ -1,14 +1,13 @@
+from wordfreq.tokens import tokenize, simple_tokenize
 from pkg_resources import resource_filename
 from functools import lru_cache
 import langcodes
 import msgpack
-import re
 import gzip
 import itertools
 import pathlib
 import random
 import logging
-import unicodedata

 logger = logging.getLogger(__name__)
@@ -16,71 +15,10 @@ logger = logging.getLogger(__name__)
 CACHE_SIZE = 100000
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))

-def load_range(filename):
-    """
-    Load a file from the data path.
-    """
-    with (DATA_PATH / filename).open() as file:
-        return file.read()
-
-EMOJI_RANGE = load_range('emoji.txt')
-NON_PUNCT_RANGE = load_range('non_punct.txt')
-COMBINING_MARK_RANGE = load_range('combining_mark.txt')
-
-COMBINING_MARK_RE = re.compile(COMBINING_MARK_RANGE)
-TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
-
-def simple_tokenize(text):
-    """
-    A simple tokenizer that can be applied to most languages.
-
-    It considers a word to be made of a sequence of 'token characters', an
-    overly inclusive range that includes letters, Han characters, emoji, and a
-    bunch of miscellaneous whatnot, but excludes most punctuation and
-    whitespace.
-
-    The single complication for the sake of English is that apostrophes are not
-    considered part of the token if they appear on the edge of the character
-    sequence, but they are if they appear internally. "cats'" is not a token,
-    but "cat's" is.
-    """
-    return [token.casefold() for token in TOKEN_RE.findall(text)]
-
-mecab_tokenize = None
-def tokenize(text, lang):
-    """
-    Tokenize this text in a way that's straightforward but appropriate for
-    the language.
-
-    So far, this means that Japanese is handled by mecab_tokenize, and
-    everything else is handled by simple_tokenize. Additionally, Arabic commas
-    and combining marks are removed.
-
-    Strings that are looked up in wordfreq will be run through this function
-    first, so that they can be expected to match the data.
-    """
-    if lang == 'ja':
-        global mecab_tokenize
-        if mecab_tokenize is None:
-            from wordfreq.mecab import mecab_tokenize
-        return mecab_tokenize(text)
-
-    if lang == 'ar':
-        text = standardize_arabic(text)
-
-    return simple_tokenize(text)
-
-def standardize_arabic(text):
-    """
-    Standardizes arabic text by removing combining marks and tatweels.
-    """
-    return unicodedata.normalize(
-        'NFKC',
-        COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
-    )
-
+# simple_tokenize is imported so that other things can import it from here.
+# Suppress the pyflakes warning.
+simple_tokenize = simple_tokenize

 def read_cBpack(filename):
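
As an aside, the apostrophe rule described in the removed simple_tokenize docstring can be illustrated with a hypothetical session against the old tokenizer; the new TR #29-based tokenizer in wordfreq/tokens.py happens to handle these particular inputs the same way:

simple_tokenize("cats'")   # -> ['cats']     edge apostrophe is not part of the token
simple_tokenize("cat's")   # -> ["cat's"]    internal apostrophe is kept
simple_tokenize("CAN'T")   # -> ["can't"]    tokens are also case-folded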

wordfreq/tokens.py (new file)

@@ -0,0 +1,88 @@
import regex
import unicodedata


# Here's what the following regular expression is looking for:
#
# At the start, it looks for a character in the set [\S--\p{punct}]. \S
# contains non-space characters, and then it subtracts the set of Unicode
# punctuation characters from that set. This is slightly different from \w,
# because it leaves symbols (such as emoji) as tokens.
#
# After it has found one such character, the rest of the token is (?:\B\S)*,
# which continues to consume characters as long as the next character does not
# cause a word break (\B) and is not a space (\S). The individual characters in
# this portion can be punctuation, allowing tokens such as "can't" or
# "google.com".
#
# As a complication, the rest of the token can match a glob of Han ideographs
# (\p{IsIdeo}) and hiragana (\p{Script=Hiragana}). Chinese words are made of
# Han ideographs (but we don't know how many). Japanese words are either made
# of Han ideographs and hiragana (which will be matched by this expression), or
# katakana (which will be matched by the standard Unicode rule).
#
# Without this special case for ideographs and hiragana, the standard Unicode
# rule would put each character in its own token. This actually would be the
# correct behavior for word-wrapping, but it's an ugly failure mode for NLP
# tokenization.
TOKEN_RE = regex.compile(r'[\S--\p{punct}](?:\B\S|[\p{IsIdeo}\p{Script=Hiragana}])*', regex.V1 | regex.WORD)
ARABIC_MARK_RE = regex.compile(r'[[\p{Mn}&&\p{Block=Arabic}]\N{ARABIC TATWEEL}]', regex.V1)


def simple_tokenize(text):
    """
    Tokenize the given text using a straightforward, Unicode-aware token
    expression. It returns non-whitespace tokens that are split at the
    word boundaries defined by Unicode Tech Report #29, as implemented
    by the regex package, except that it leaves Chinese and Japanese
    relatively untokenized.
    """
    text = unicodedata.normalize('NFKC', text)
    return [token.casefold() for token in TOKEN_RE.findall(text)]


def remove_arabic_marks(text):
    """
    Remove decorations from Arabic words:

    - Combining marks of class Mn, which tend to represent non-essential
      vowel markings.
    - Tatweels, horizontal segments that are used to extend or justify a
      word.
    """
    return ARABIC_MARK_RE.sub('', text)


mecab_tokenize = None
def tokenize(text, lang):
    """
    Tokenize this text in a way that's relatively simple but appropriate for
    the language.

    So far, this means:

    - Chinese is presumed to already be tokenized. (Sorry. It's hard.)
    - Japanese will be delegated to the external mecab-python module.
    - Chinese or Japanese texts that aren't identified as the appropriate
      language will only split on punctuation and script boundaries, giving
      you untokenized globs of characters that probably represent many words.
    - All other languages will be tokenized according to UTR #29.

    Additionally, the text will be case-folded to lowercase, and text marked
    as Arabic will have combining marks and tatweels removed.

    Strings that are looked up in wordfreq will be run through this function
    first, so that they can be expected to match the data.
    """
    if lang == 'ja':
        global mecab_tokenize
        if mecab_tokenize is None:
            from wordfreq.mecab import mecab_tokenize
        return mecab_tokenize(text)

    if lang == 'ar':
        text = remove_arabic_marks(text)

    return simple_tokenize(text)
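
Taken together, a brief usage sketch of the new module. The expected values mirror the tests added in this commit (the Arabic result is inferred from the "Remove tatweels" test, whose expected value is cut off in this diff), and lang='ja' would additionally require mecab-python:

from wordfreq.tokens import tokenize, simple_tokenize

# UTR #29 word boundaries, case-folded; punctuation inside a token is kept
simple_tokenize("Anything is possible at zombo.com")
# -> ['anything', 'is', 'possible', 'at', 'zombo.com']

# Chinese stays as one glob; mislabeled Japanese splits at script boundaries
tokenize('中国文字', 'zh')                  # -> ['中国文字']
tokenize('ひらがなカタカナromaji', 'en')    # -> ['ひらがな', 'カタカナ', 'romaji']

# Arabic text has tatweels and combining marks stripped before tokenizing
tokenize('متــــــــعب', 'ar')              # -> ['متعب']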