diff --git a/setup.py b/setup.py
index 4761eb3..d2ef0fb 100755
--- a/setup.py
+++ b/setup.py
@@ -26,7 +26,7 @@ classifiers = [
 current_dir = os.path.dirname(__file__)
 README_contents = open(os.path.join(current_dir, 'README.md')).read()
 doclines = README_contents.split("\n")
-dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes']
+dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes', 'regex >= 2015']
 
 if sys.version_info < (3, 4):
     dependencies.append('pathlib')
diff --git a/tests/test.py b/tests/test.py
index 679811c..0a8e212 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -95,13 +95,17 @@ def test_failed_cB_conversion():
 def test_tokenization():
     # We preserve apostrophes within words, so "can't" is a single word in the
     # data
-    eq_(tokenize("can't", 'en'), ["can't"])
+    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
+        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
 
+    # Certain punctuation does not inherently split a word.
+    eq_(tokenize("Anything is possible at zombo.com", 'en'),
+        ['anything', 'is', 'possible', 'at', 'zombo.com'])
+
+    # Splits occur after symbols, and at splitting punctuation such as hyphens.
     eq_(tokenize('😂test', 'en'), ['😂', 'test'])
 
-    # We do split at other punctuation, causing the word-combining rule to
-    # apply.
-    eq_(tokenize("can.t", 'en'), ['can', 't'])
+    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])
 
 
 def test_casefolding():
@@ -110,11 +114,11 @@ def test_casefolding():
 
 
 def test_phrase_freq():
-    plant = word_frequency("plan.t", 'en')
-    assert_greater(plant, 0)
+    ff = word_frequency("flip-flop", 'en')
+    assert_greater(ff, 0)
     assert_almost_equal(
-        1.0 / plant,
-        1.0 / word_frequency('plan', 'en') + 1.0 / word_frequency('t', 'en')
+        1.0 / ff,
+        1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
     )
@@ -134,8 +138,8 @@ def test_not_really_random():
 def test_not_enough_ascii():
     random_ascii_words(lang='zh')
 
 
-def test_ar():
+def test_ar():
     # Remove tatweels
     eq_(
         tokenize('متــــــــعب', 'ar'),
@@ -152,3 +156,16 @@ def test_ar():
         tokenize('\ufefb', 'ar'),  # An Arabic ligature...
         ['\u0644\u0627']  # ...that is affected by NFKC normalization
     )
+
+
+def test_ideographic_fallback():
+    # Try tokenizing Chinese text -- it should remain stuck together.
+    eq_(tokenize('中国文字', 'zh'), ['中国文字'])
+
+    # When Japanese is tagged with the wrong language, it will be split
+    # at script boundaries.
+    ja_text = 'ひらがなカタカナromaji'
+    eq_(
+        tokenize(ja_text, 'en'),
+        ['ひらがな', 'カタカナ', 'romaji']
+    )
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index cb085f7..e939127 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -1,14 +1,13 @@
+from wordfreq.tokens import tokenize, simple_tokenize
 from pkg_resources import resource_filename
 from functools import lru_cache
 import langcodes
 import msgpack
-import re
 import gzip
 import itertools
 import pathlib
 import random
 import logging
-import unicodedata
 
 
 logger = logging.getLogger(__name__)
@@ -16,71 +15,10 @@ logger = logging.getLogger(__name__)
 CACHE_SIZE = 100000
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
-def load_range(filename):
-    """
-    Load a file from the data path.
-    """
-    with (DATA_PATH / filename).open() as file:
-        return file.read()
-
-EMOJI_RANGE = load_range('emoji.txt')
-NON_PUNCT_RANGE = load_range('non_punct.txt')
-COMBINING_MARK_RANGE = load_range('combining_mark.txt')
-
-COMBINING_MARK_RE = re.compile(COMBINING_MARK_RANGE)
-TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
-
-
-def simple_tokenize(text):
-    """
-    A simple tokenizer that can be applied to most languages.
-
-    It considers a word to be made of a sequence of 'token characters', an
-    overly inclusive range that includes letters, Han characters, emoji, and a
-    bunch of miscellaneous whatnot, but excludes most punctuation and
-    whitespace.
-
-    The single complication for the sake of English is that apostrophes are not
-    considered part of the token if they appear on the edge of the character
-    sequence, but they are if they appear internally. "cats'" is not a token,
-    but "cat's" is.
-    """
-    return [token.casefold() for token in TOKEN_RE.findall(text)]
-
-
-mecab_tokenize = None
-def tokenize(text, lang):
-    """
-    Tokenize this text in a way that's straightforward but appropriate for
-    the language.
-
-    So far, this means that Japanese is handled by mecab_tokenize, and
-    everything else is handled by simple_tokenize. Additionally, Arabic commas
-    and combining marks are removed.
-
-    Strings that are looked up in wordfreq will be run through this function
-    first, so that they can be expected to match the data.
-    """
-    if lang == 'ja':
-        global mecab_tokenize
-        if mecab_tokenize is None:
-            from wordfreq.mecab import mecab_tokenize
-        return mecab_tokenize(text)
-
-    if lang == 'ar':
-        text = standardize_arabic(text)
-
-    return simple_tokenize(text)
-
-
-def standardize_arabic(text):
-    """
-    Standardizes arabic text by removing combining marks and tatweels.
-    """
-    return unicodedata.normalize(
-        'NFKC',
-        COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
-    )
+
+# simple_tokenize is imported so that other things can import it from here.
+# Suppress the pyflakes warning.
+simple_tokenize = simple_tokenize
 
 
 def read_cBpack(filename):
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
new file mode 100644
index 0000000..5130d0f
--- /dev/null
+++ b/wordfreq/tokens.py
@@ -0,0 +1,88 @@
+import regex
+import unicodedata
+
+
+# Here's what the following regular expression is looking for:
+#
+# At the start, it looks for a character in the set [\S--\p{punct}]. \S
+# contains non-space characters, and then it subtracts the set of Unicode
+# punctuation characters from that set. This is slightly different from \w,
+# because it leaves symbols (such as emoji) as tokens.
+#
+# After it has found one such character, the rest of the token is (?:\B\S)*,
+# which continues to consume characters as long as the next character does not
+# cause a word break (\B) and is not a space (\S). The individual characters in
+# this portion can be punctuation, allowing tokens such as "can't" or
+# "google.com".
+#
+# As a complication, the rest of the token can match a glob of Han ideographs
+# (\p{IsIdeo}) and hiragana (\p{Script=Hiragana}). Chinese words are made of
+# Han ideographs (but we don't know how many). Japanese words are either made
+# of Han ideographs and hiragana (which will be matched by this expression), or
+# katakana (which will be matched by the standard Unicode rule).
+#
+# Without this special case for ideographs and hiragana, the standard Unicode
+# rule would put each character in its own token. This actually would be the
+# correct behavior for word-wrapping, but it's an ugly failure mode for NLP
+# tokenization.
+
+TOKEN_RE = regex.compile(r'[\S--\p{punct}](?:\B\S|[\p{IsIdeo}\p{Script=Hiragana}])*', regex.V1 | regex.WORD)
+ARABIC_MARK_RE = regex.compile(r'[[\p{Mn}&&\p{Block=Arabic}]\N{ARABIC TATWEEL}]', regex.V1)
+
+
+def simple_tokenize(text):
+    """
+    Tokenize the given text using a straightforward, Unicode-aware token
+    expression. It returns non-whitespace tokens that are split at the
+    word boundaries defined by Unicode Tech Report #29, as implemented
+    by the regex package, except that it leaves Chinese and Japanese
+    relatively untokenized.
+    """
+    text = unicodedata.normalize('NFKC', text)
+    return [token.casefold() for token in TOKEN_RE.findall(text)]
+
+
+def remove_arabic_marks(text):
+    """
+    Remove decorations from Arabic words:
+
+    - Combining marks of class Mn, which tend to represent non-essential
+      vowel markings.
+    - Tatweels, horizontal segments that are used to extend or justify a
+      word.
+    """
+    return ARABIC_MARK_RE.sub('', text)
+
+
+mecab_tokenize = None
+def tokenize(text, lang):
+    """
+    Tokenize this text in a way that's relatively simple but appropriate for
+    the language.
+
+    So far, this means:
+
+    - Chinese is presumed to already be tokenized. (Sorry. It's hard.)
+    - Japanese will be delegated to the external mecab-python module.
+    - Chinese or Japanese texts that aren't identified as the appropriate
+      language will only split on punctuation and script boundaries, giving
+      you untokenized globs of characters that probably represent many words.
+    - All other languages will be tokenized according to UTR #29.
+
+    Additionally, the text will be case-folded to lowercase, and text marked
+    as Arabic will have combining marks and tatweels removed.
+
+    Strings that are looked up in wordfreq will be run through this function
+    first, so that they can be expected to match the data.
+    """
+    if lang == 'ja':
+        global mecab_tokenize
+        if mecab_tokenize is None:
+            from wordfreq.mecab import mecab_tokenize
+        return mecab_tokenize(text)
+
+    if lang == 'ar':
+        text = remove_arabic_marks(text)
+
+    return simple_tokenize(text)
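
A quick usage sketch, not part of the patch: the calls below mirror the test cases added in this diff, so the expected outputs are copied from tests/test.py rather than invented. Running it assumes wordfreq is installed with this change and the new regex dependency.

    # Mirrors the tests above; expected results come from tests/test.py.
    from wordfreq import tokenize

    # Apostrophes and word-internal punctuation stay inside the token.
    print(tokenize("I don't split at apostrophes, you see.", 'en'))
    # ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']
    print(tokenize('Anything is possible at zombo.com', 'en'))
    # ['anything', 'is', 'possible', 'at', 'zombo.com']

    # Symbols and splitting punctuation such as hyphens still separate tokens.
    print(tokenize('😂test', 'en'))     # ['😂', 'test']
    print(tokenize('flip-flop', 'en'))  # ['flip', 'flop']

    # Arabic text has tatweels and combining marks removed before tokenizing.
    print(tokenize('متــــــــعب', 'ar'))  # ['متعب']

    # Japanese tagged with the wrong language falls back to splitting at
    # script boundaries instead of one token per character.
    print(tokenize('ひらがなカタカナromaji', 'en'))
    # ['ひらがな', 'カタカナ', 'romaji']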