From 879552537226f1b420080db3cefcda38e426cabc Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Mon, 24 Aug 2015 16:24:49 -0400
Subject: [PATCH] Use the regex implementation of Unicode segmentation

Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
---
 setup.py             |  2 +-
 tests/test.py        | 35 +++++++++++++-----
 wordfreq/__init__.py | 70 ++---------------------------------
 wordfreq/tokens.py   | 88 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 119 insertions(+), 76 deletions(-)
 create mode 100644 wordfreq/tokens.py

diff --git a/setup.py b/setup.py
index 4761eb3..d2ef0fb 100755
--- a/setup.py
+++ b/setup.py
@@ -26,7 +26,7 @@ classifiers = [
 current_dir = os.path.dirname(__file__)
 README_contents = open(os.path.join(current_dir, 'README.md')).read()
 doclines = README_contents.split("\n")
-dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes']
+dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes', 'regex >= 2015']
 
 if sys.version_info < (3, 4):
     dependencies.append('pathlib')
diff --git a/tests/test.py b/tests/test.py
index 679811c..0a8e212 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -95,13 +95,17 @@ def test_failed_cB_conversion():
 def test_tokenization():
     # We preserve apostrophes within words, so "can't" is a single word in the
     # data
-    eq_(tokenize("can't", 'en'), ["can't"])
+    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
+        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
 
+    # Certain punctuation does not inherently split a word.
+    eq_(tokenize("Anything is possible at zombo.com", 'en'),
+        ['anything', 'is', 'possible', 'at', 'zombo.com'])
+
+    # Splits occur after symbols, and at splitting punctuation such as hyphens.
     eq_(tokenize('😂test', 'en'), ['😂', 'test'])
 
-    # We do split at other punctuation, causing the word-combining rule to
-    # apply.
-    eq_(tokenize("can.t", 'en'), ['can', 't'])
+    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])
 
 
 def test_casefolding():
@@ -110,11 +114,11 @@ def test_casefolding():
 
 
 def test_phrase_freq():
-    plant = word_frequency("plan.t", 'en')
-    assert_greater(plant, 0)
+    ff = word_frequency("flip-flop", 'en')
+    assert_greater(ff, 0)
     assert_almost_equal(
-        1.0 / plant,
-        1.0 / word_frequency('plan', 'en') + 1.0 / word_frequency('t', 'en')
+        1.0 / ff,
+        1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
     )
 
 
@@ -134,8 +138,8 @@ def test_not_really_random():
 def test_not_enough_ascii():
     random_ascii_words(lang='zh')
 
-def test_ar():
 
+def test_ar():
     # Remove tatweels
     eq_(
         tokenize('متــــــــعب', 'ar'),
@@ -152,3 +156,16 @@ def test_ar():
         tokenize('\ufefb', 'ar'),  # An Arabic ligature...
         ['\u0644\u0627']  # ...that is affected by NFKC normalization
     )
+
+
+def test_ideographic_fallback():
+    # Try tokenizing Chinese text -- it should remain stuck together.
+    eq_(tokenize('中国文字', 'zh'), ['中国文字'])
+
+    # When Japanese is tagged with the wrong language, it will be split
+    # at script boundaries.
+    ja_text = 'ひらがなカタカナromaji'
+    eq_(
+        tokenize(ja_text, 'en'),
+        ['ひらがな', 'カタカナ', 'romaji']
+    )
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index cb085f7..e939127 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -1,14 +1,13 @@
+from wordfreq.tokens import tokenize, simple_tokenize
 from pkg_resources import resource_filename
 from functools import lru_cache
 import langcodes
 import msgpack
-import re
 import gzip
 import itertools
 import pathlib
 import random
 import logging
-import unicodedata
 
 logger = logging.getLogger(__name__)
 
@@ -16,71 +15,10 @@ logger = logging.getLogger(__name__)
 CACHE_SIZE = 100000
 DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
 
-def load_range(filename):
-    """
-    Load a file from the data path.
-    """
-    with (DATA_PATH / filename).open() as file:
-        return file.read()
-
-EMOJI_RANGE = load_range('emoji.txt')
-NON_PUNCT_RANGE = load_range('non_punct.txt')
-COMBINING_MARK_RANGE = load_range('combining_mark.txt')
-
-COMBINING_MARK_RE = re.compile(COMBINING_MARK_RANGE)
-TOKEN_RE = re.compile("{0}|{1}+(?:'{1}+)*".format(EMOJI_RANGE, NON_PUNCT_RANGE))
-
-
-def simple_tokenize(text):
-    """
-    A simple tokenizer that can be applied to most languages.
-
-    It considers a word to be made of a sequence of 'token characters', an
-    overly inclusive range that includes letters, Han characters, emoji, and a
-    bunch of miscellaneous whatnot, but excludes most punctuation and
-    whitespace.
-
-    The single complication for the sake of English is that apostrophes are not
-    considered part of the token if they appear on the edge of the character
-    sequence, but they are if they appear internally. "cats'" is not a token,
-    but "cat's" is.
-    """
-    return [token.casefold() for token in TOKEN_RE.findall(text)]
-
-
-mecab_tokenize = None
-def tokenize(text, lang):
-    """
-    Tokenize this text in a way that's straightforward but appropriate for
-    the language.
-
-    So far, this means that Japanese is handled by mecab_tokenize, and
-    everything else is handled by simple_tokenize. Additionally, Arabic commas
-    and combining marks are removed.
-
-    Strings that are looked up in wordfreq will be run through this function
-    first, so that they can be expected to match the data.
-    """
-    if lang == 'ja':
-        global mecab_tokenize
-        if mecab_tokenize is None:
-            from wordfreq.mecab import mecab_tokenize
-        return mecab_tokenize(text)
-
-    if lang == 'ar':
-        text = standardize_arabic(text)
-
-    return simple_tokenize(text)
-
-
-def standardize_arabic(text):
-    """
-    Standardizes arabic text by removing combining marks and tatweels.
-    """
-    return unicodedata.normalize(
-        'NFKC',
-        COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
-    )
+# simple_tokenize is imported so that other things can import it from here.
+# Suppress the pyflakes warning.
+simple_tokenize = simple_tokenize
 
 
 def read_cBpack(filename):
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
new file mode 100644
index 0000000..5130d0f
--- /dev/null
+++ b/wordfreq/tokens.py
@@ -0,0 +1,88 @@
+import regex
+import unicodedata
+
+
+# Here's what the following regular expression is looking for:
+#
+# At the start, it looks for a character in the set [\S--\p{punct}]. \S
+# contains non-space characters, and then it subtracts the set of Unicode
+# punctuation characters from that set. This is slightly different from \w,
+# because it leaves symbols (such as emoji) as tokens.
+#
+# After it has found one such character, the rest of the token is (?:\B\S)*,
+# which continues to consume characters as long as the next character does not
+# cause a word break (\B) and is not a space (\S). The individual characters in
+# this portion can be punctuation, allowing tokens such as "can't" or
+# "google.com".
+#
+# As a complication, the rest of the token can match a glob of Han ideographs
+# (\p{IsIdeo}) and hiragana (\p{Script=Hiragana}). Chinese words are made of
+# Han ideographs (but we don't know how many). Japanese words are either made
+# of Han ideographs and hiragana (which will be matched by this expression), or
+# katakana (which will be matched by the standard Unicode rule).
+#
+# Without this special case for ideographs and hiragana, the standard Unicode
+# rule would put each character in its own token. This actually would be the
+# correct behavior for word-wrapping, but it's an ugly failure mode for NLP
+# tokenization.
+
+TOKEN_RE = regex.compile(r'[\S--\p{punct}](?:\B\S|[\p{IsIdeo}\p{Script=Hiragana}])*', regex.V1 | regex.WORD)
+ARABIC_MARK_RE = regex.compile(r'[[\p{Mn}&&\p{Block=Arabic}]\N{ARABIC TATWEEL}]', regex.V1)
+
+
+def simple_tokenize(text):
+    """
+    Tokenize the given text using a straightforward, Unicode-aware token
+    expression. It returns non-whitespace tokens that are split at the
+    word boundaries defined by Unicode Tech Report #29, as implemented
+    by the regex package, except that it leaves Chinese and Japanese
+    relatively untokenized.
+    """
+    text = unicodedata.normalize('NFKC', text)
+    return [token.casefold() for token in TOKEN_RE.findall(text)]
+
+
+def remove_arabic_marks(text):
+    """
+    Remove decorations from Arabic words:
+
+    - Combining marks of class Mn, which tend to represent non-essential
+      vowel markings.
+    - Tatweels, horizontal segments that are used to extend or justify a
+      word.
+    """
+    return ARABIC_MARK_RE.sub('', text)
+
+
+mecab_tokenize = None
+def tokenize(text, lang):
+    """
+    Tokenize this text in a way that's relatively simple but appropriate for
+    the language.
+
+    So far, this means:
+
+    - Chinese is presumed to already be tokenized. (Sorry. It's hard.)
+    - Japanese will be delegated to the external mecab-python module.
+    - Chinese or Japanese texts that aren't identified as the appropriate
+      language will only split on punctuation and script boundaries, giving
+      you untokenized globs of characters that probably represent many words.
+    - All other languages will be tokenized according to UTR #29.
+
+    Additionally, the text will be case-folded to lowercase, and text marked
+    as Arabic will have combining marks and tatweels removed.
+
+    Strings that are looked up in wordfreq will be run through this function
+    first, so that they can be expected to match the data.
+    """
+    if lang == 'ja':
+        global mecab_tokenize
+        if mecab_tokenize is None:
+            from wordfreq.mecab import mecab_tokenize
+        return mecab_tokenize(text)
+
+    if lang == 'ar':
+        text = remove_arabic_marks(text)
+
+    return simple_tokenize(text)
+
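
A minimal standalone sketch (not part of the patch) of the token expression added in wordfreq/tokens.py, exercised directly with the third-party regex module; the expected results below mirror the behavior asserted in tests/test.py above.

    # Sketch only: same pattern and flags as TOKEN_RE in wordfreq/tokens.py.
    import regex

    TOKEN_RE = regex.compile(
        r'[\S--\p{punct}](?:\B\S|[\p{IsIdeo}\p{Script=Hiragana}])*',
        regex.V1 | regex.WORD)

    # Interior punctuation is not a word break under the WORD rule, so a token
    # like "zombo.com" stays in one piece (simple_tokenize() also case-folds).
    print(TOKEN_RE.findall('Anything is possible at zombo.com'))
    # ['Anything', 'is', 'possible', 'at', 'zombo.com']

    # Han ideographs and hiragana are globbed together by the special case;
    # katakana and Latin text are segmented by the standard Unicode word rule.
    print(TOKEN_RE.findall('ひらがなカタカナromaji'))
    # ['ひらがな', 'カタカナ', 'romaji']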
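
And a short usage sketch (also not part of the patch) of the tokenize() entry point, with expected outputs taken from the tests added above; it assumes wordfreq with this change and the new regex dependency are installed.

    # Sketch only: expected results are the ones asserted in tests/test.py.
    from wordfreq.tokens import tokenize

    # Apostrophes and interior punctuation stay inside the token; output is
    # case-folded.
    print(tokenize("I don't split at apostrophes, you see.", 'en'))
    # ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']

    # Splits occur after symbols and at splitting punctuation such as hyphens.
    print(tokenize('😂test', 'en'))      # ['😂', 'test']
    print(tokenize('flip-flop', 'en'))   # ['flip', 'flop']

    # Arabic: combining marks and tatweels are stripped first, so the
    # tatweel-stretched word collapses back to its plain form.
    print(tokenize('متــــــــعب', 'ar'))  # ['متعب']

    # Ideographic fallback: Chinese stays as one glob, and Japanese tagged
    # with the wrong language splits only at script boundaries.
    print(tokenize('中国文字', 'zh'))                # ['中国文字']
    print(tokenize('ひらがなカタカナromaji', 'en'))  # ['ひらがな', 'カタカナ', 'romaji']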