Use the regex implementation of Unicode segmentation

Rob Speer 2015-08-24 16:24:49 -04:00
parent 2b8089e2b1
commit 95998205ad
4 changed files with 119 additions and 76 deletions


@@ -26,7 +26,7 @@ classifiers = [
current_dir = os.path.dirname(__file__)
README_contents = open(os.path.join(current_dir, 'README.md')).read()
doclines = README_contents.split("\n")
dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes']
dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes', 'regex >= 2015']
if sys.version_info < (3, 4):
dependencies.append('pathlib')


@@ -95,13 +95,17 @@ def test_failed_cB_conversion():
def test_tokenization():
# We preserve apostrophes within words, so "can't" is a single word in the
# data
eq_(tokenize("can't", 'en'), ["can't"])
eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
# Certain punctuation does not inherently split a word.
eq_(tokenize("Anything is possible at zombo.com", 'en'),
['anything', 'is', 'possible', 'at', 'zombo.com'])
# Splits occur after symbols, and at splitting punctuation such as hyphens.
eq_(tokenize('😂test', 'en'), ['😂', 'test'])
# We do split at other punctuation, causing the word-combining rule to
# apply.
eq_(tokenize("can.t", 'en'), ['can', 't'])
eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])
def test_casefolding():
@@ -110,11 +114,11 @@ def test_casefolding():
def test_phrase_freq():
plant = word_frequency("plan.t", 'en')
assert_greater(plant, 0)
ff = word_frequency("flip-flop", 'en')
assert_greater(ff, 0)
assert_almost_equal(
1.0 / plant,
1.0 / word_frequency('plan', 'en') + 1.0 / word_frequency('t', 'en')
1.0 / ff,
1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
)
@@ -134,8 +138,8 @@ def test_not_really_random():
def test_not_enough_ascii():
random_ascii_words(lang='zh')
def test_ar():
# Remove tatweels
eq_(
tokenize('متــــــــعب', 'ar'),
@@ -152,3 +156,16 @@ def test_ar():
tokenize('\ufefb', 'ar'), # An Arabic ligature...
['\u0644\u0627'] # ...that is affected by NFKC normalization
)
def test_ideographic_fallback():
# Try tokenizing Chinese text -- it should remain stuck together.
eq_(tokenize('中国文字', 'zh'), ['中国文字'])
# When Japanese is tagged with the wrong language, it will be split
# at script boundaries.
ja_text = 'ひらがなカタカナromaji'
eq_(
tokenize(ja_text, 'en'),
['ひらがな', 'カタカナ', 'romaji']
)


@@ -1,14 +1,13 @@
from wordfreq.tokens import tokenize, simple_tokenize
from pkg_resources import resource_filename
from functools import lru_cache
import langcodes
import msgpack
import re
import gzip
import itertools
import pathlib
import random
import logging
import unicodedata
logger = logging.getLogger(__name__)
@@ -16,71 +15,10 @@ logger = logging.getLogger(__name__)
CACHE_SIZE = 100000
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
def load_range(filename):
"""
Load a file from the data path.
"""
with (DATA_PATH / filename).open() as file:
return file.read()
EMOJI_RANGE = load_range('emoji.txt')
NON_PUNCT_RANGE = load_range('non_punct.txt')
COMBINING_MARK_RANGE = load_range('combining_mark.txt')
COMBINING_MARK_RE = re.compile(COMBINING_MARK_RANGE)
TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
def simple_tokenize(text):
"""
A simple tokenizer that can be applied to most languages.
It considers a word to be made of a sequence of 'token characters', an
overly inclusive range that includes letters, Han characters, emoji, and a
bunch of miscellaneous whatnot, but excludes most punctuation and
whitespace.
The single complication for the sake of English is that apostrophes are not
considered part of the token if they appear on the edge of the character
sequence, but they are if they appear internally. "cats'" is not a token,
but "cat's" is.
"""
return [token.casefold() for token in TOKEN_RE.findall(text)]
mecab_tokenize = None
def tokenize(text, lang):
"""
Tokenize this text in a way that's straightforward but appropriate for
the language.
So far, this means that Japanese is handled by mecab_tokenize, and
everything else is handled by simple_tokenize. Additionally, Arabic commas
and combining marks are removed.
Strings that are looked up in wordfreq will be run through this function
first, so that they can be expected to match the data.
"""
if lang == 'ja':
global mecab_tokenize
if mecab_tokenize is None:
from wordfreq.mecab import mecab_tokenize
return mecab_tokenize(text)
if lang == 'ar':
text = standardize_arabic(text)
return simple_tokenize(text)
def standardize_arabic(text):
"""
Standardizes Arabic text by removing combining marks and tatweels.
"""
return unicodedata.normalize(
'NFKC',
COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
)
# simple_tokenize is imported so that other things can import it from here.
# Suppress the pyflakes warning.
simple_tokenize = simple_tokenize
def read_cBpack(filename):

wordfreq/tokens.py (new file, 88 lines)

@@ -0,0 +1,88 @@
import regex
import unicodedata
# Here's what the following regular expression is looking for:
#
# At the start, it looks for a character in the set [\S--\p{punct}]. \S
# contains non-space characters, and then it subtracts the set of Unicode
# punctuation characters from that set. This is slightly different from \w,
# because it leaves symbols (such as emoji) as tokens.
#
# After it has found one such character, the rest of the token is (?:\B\S)*,
# which continues to consume characters as long as the next character does not
# cause a word break (\B) and is not a space (\S). The individual characters in
# this portion can be punctuation, allowing tokens such as "can't" or
# "google.com".
#
# As a complication, the rest of the token can match a glob of Han ideographs
# (\p{IsIdeo}) and hiragana (\p{Script=Hiragana}). Chinese words are made of
# Han ideographs (but we don't know how many). Japanese words are either made
# of Han ideographs and hiragana (which will be matched by this expression), or
# katakana (which will be matched by the standard Unicode rule).
#
# Without this special case for ideographs and hiragana, the standard Unicode
# rule would put each character in its own token. This actually would be the
# correct behavior for word-wrapping, but it's an ugly failure mode for NLP
# tokenization.
TOKEN_RE = regex.compile(r'[\S--\p{punct}](?:\B\S|[\p{IsIdeo}\p{Script=Hiragana}])*', regex.V1 | regex.WORD)
ARABIC_MARK_RE = regex.compile(r'[[\p{Mn}&&\p{Block=Arabic}]\N{ARABIC TATWEEL}]', regex.V1)
def simple_tokenize(text):
"""
Tokenize the given text using a straightforward, Unicode-aware token
expression. It returns non-whitespace tokens that are split at the
word boundaries defined by Unicode Tech Report #29, as implemented
by the regex package, except that it leaves Chinese and Japanese
relatively untokenized.
"""
text = unicodedata.normalize('NFKC', text)
return [token.casefold() for token in TOKEN_RE.findall(text)]
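As an illustrative aside (not part of the committed file): assuming this branch of wordfreq and a regex release from 2015 or later are installed, the tests in this commit imply behavior along these lines.
>>> from wordfreq.tokens import simple_tokenize
>>> simple_tokenize("can't")       # apostrophes inside a word are kept
["can't"]
>>> simple_tokenize('flip-flop')   # hyphens are splitting punctuation
['flip', 'flop']
>>> simple_tokenize('😂test')      # splits also occur after symbols such as emoji
['😂', 'test']
>>> simple_tokenize('中国文字')     # Han ideographs stay glued together
['中国文字']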
def remove_arabic_marks(text):
"""
Remove decorations from Arabic words:
- Combining marks of class Mn, which tend to represent non-essential
vowel markings.
- Tatweels, horizontal segments that are used to extend or justify a
word.
"""
return ARABIC_MARK_RE.sub('', text)
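As another hedged illustration (not part of the diff), grounded in the test_ar test shown earlier: removing the marks from a tatweel-stretched word should leave only the base letters.
>>> from wordfreq.tokens import remove_arabic_marks
>>> remove_arabic_marks('متــــــــعب')   # the ARABIC TATWEEL characters are stripped
'متعب'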
mecab_tokenize = None
def tokenize(text, lang):
"""
Tokenize this text in a way that's relatively simple but appropriate for
the language.
So far, this means:
- Chinese is presumed to already be tokenized. (Sorry. It's hard.)
- Japanese will be delegated to the external mecab-python module.
- Chinese or Japanese texts that aren't identified as the appropriate
language will only split on punctuation and script boundaries, giving
you untokenized globs of characters that probably represent many words.
- All other languages will be tokenized according to UTR #29.
Additionally, the text will be case-folded to lowercase, and text marked
as Arabic will have combining marks and tatweels removed.
Strings that are looked up in wordfreq will be run through this function
first, so that they can be expected to match the data.
"""
if lang == 'ja':
global mecab_tokenize
if mecab_tokenize is None:
from wordfreq.mecab import mecab_tokenize
return mecab_tokenize(text)
if lang == 'ar':
text = remove_arabic_marks(text)
return simple_tokenize(text)
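To make the dispatch above concrete, here is a sketch of what callers can expect, based on the tests added in this commit (the 'ja' branch additionally requires the mecab-python module).
>>> from wordfreq.tokens import tokenize
>>> tokenize('中国文字', 'zh')   # Chinese: one untokenized glob of ideographs
['中国文字']
>>> # Japanese tagged with the wrong language splits only at script boundaries:
>>> tokenize('ひらがなカタカナromaji', 'en')
['ひらがな', 'カタカナ', 'romaji']
>>> # The Arabic ligature is NFKC-expanded to lam + alef, as test_ar asserts:
>>> tokenize('\ufefb', 'ar') == ['\u0644\u0627']
True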