From 3155cf27e6bac4297c8b7a3f163100129989e503 Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Fri, 1 Jul 2016 18:00:57 -0400
Subject: [PATCH] Fix tokenization of SE Asian and South Asian scripts (#37)

Former-commit-id: 270f6c7ca616165e89ccbfa270d78eabc49782c4
---
 setup.py           |  2 +-
 tests/test.py      |  5 ++++
 wordfreq/tokens.py | 57 ++++++++++++++++++++++++++++++++++------------
 3 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/setup.py b/setup.py
index 46abfe7..cbf7046 100755
--- a/setup.py
+++ b/setup.py
@@ -34,7 +34,7 @@ if sys.version_info < (3, 4):
 
 setup(
     name="wordfreq",
-    version='1.4',
+    version='1.4.1',
     maintainer='Luminoso Technologies, Inc.',
     maintainer_email='info@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
diff --git a/tests/test.py b/tests/test.py
index 6c20ccb..941bd90 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -189,3 +189,8 @@ def test_ideographic_fallback():
 
     eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
         ['การเล่นดนตรี', 'means', 'playing', 'music'])
+    # Test Khmer, a script similar to Thai
+    eq_(tokenize('សូមស្វាគមន៍', 'km'), ['សូមស្វាគមន៍'])
+
+    # Test Hindi -- tokens split where there are spaces, and not where there aren't
+    eq_(tokenize('हिन्दी विक्षनरी', 'hi'), ['हिन्दी', 'विक्षनरी'])
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index cc275f0..0332f05 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -2,25 +2,54 @@ import regex
 import unicodedata
 
 
+# See the documentation inside TOKEN_RE for why we have to handle these
+# scripts specially.
+SPACELESS_SCRIPTS = [
+    'Hiragana',
+    'Thai',  # Thai script
+    'Khmr',  # Khmer script
+    'Laoo',  # Lao script
+    'Mymr',  # Burmese script
+    'Tale',  # Tai Le script
+    'Talu',  # Tai Lü script
+    'Lana',  # Lanna script
+]
+
+
+def _make_spaceless_expr():
+    pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in SPACELESS_SCRIPTS]
+    return ''.join(pieces)
+
+
+SPACELESS_EXPR = _make_spaceless_expr()
+
+
 TOKEN_RE = regex.compile(r"""
     # Case 1: a special case for non-spaced languages
     # -----------------------------------------------
 
-    # When we see characters that are Han ideographs (\p{IsIdeo}), hiragana
-    # (\p{Script=Hiragana}), or Thai (\p{Script=Thai}), we allow a sequence
-    # of those characters to be glued together as a single token.
+    # Some scripts are written without spaces, and the Unicode algorithm
+    # seems to overreact and insert word breaks between all their letters.
+    # When we see sequences of characters in these scripts, we make sure not
+    # to break them up. Such scripts include Han ideographs (\p{IsIdeo}),
+    # hiragana (\p{Script=Hiragana}), and many Southeast Asian scripts such
+    # as Thai and Khmer.
     #
     # Without this case, the standard rule (case 2) would make each character
     # a separate token. This would be the correct behavior for word-wrapping,
     # but a messy failure mode for NLP tokenization.
     #
-    # It is, of course, better to use a tokenizer that is designed for Chinese,
-    # Japanese, or Thai text. This is effectively a fallback for when the wrong
-    # tokenizer is used.
+    # If you have Chinese or Japanese text, it's certainly better to use a
+    # tokenizer that's designed for it. Elsewhere in this file, we have
+    # specific tokenizers that can handle Chinese and Japanese. With this
+    # rule, though, at least this general tokenizer will fail less badly
+    # on those languages.
     #
-    # This rule is listed first so that it takes precedence.
+    # This rule is listed first so that it takes precedence. The placeholder
+    # <SPACELESS> will be replaced by the complex range expression made by
+    # _make_spaceless_expr().
 
-    [\p{IsIdeo}\p{Script=Hiragana}\p{Script=Thai}]+ |
+    [<SPACELESS>]+ |
 
     # Case 2: standard Unicode segmentation
     # -------------------------------------
@@ -34,16 +63,16 @@ TOKEN_RE = regex.compile(r"""
 
     # The rest of the token matches characters that are not any sort of space
     # (\S) and do not cause word breaks according to the Unicode word
-    # segmentation heuristic (\B).
+    # segmentation heuristic (\B), or are categorized as Marks (\p{M}).
 
-    (?:\B\S)*
-""", regex.V1 | regex.WORD | regex.VERBOSE)
+    (?:\B\S|\p{M})*
+""".replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
 
 
 TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
-    [\p{IsIdeo}\p{Script=Hiragana}]+ |
+    [<SPACELESS>]+ |
     [\p{punct}]+ |
-    \S(?:\B\S)*
-""", regex.V1 | regex.WORD | regex.VERBOSE)
+    \S(?:\B\S|\p{M})*
+""".replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
 
 ARABIC_MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
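
For reference, the following is a minimal, self-contained sketch (not part of the patch) of the mechanism it introduces: build the spaceless-script expression, splice it into the pattern in place of the <SPACELESS> placeholder, and the resulting regex keeps a Khmer sequence together as one token. It is illustrative only: the script list is abbreviated to three entries, make_spaceless_expr simply mirrors the patch's _make_spaceless_expr, and the second alternative uses the simplified form from TOKEN_RE_WITH_PUNCTUATION rather than the full Case 2 rule of TOKEN_RE.

    import regex

    # Abbreviated version of the patch's SPACELESS_SCRIPTS list
    SPACELESS_SCRIPTS = ['Hiragana', 'Thai', 'Khmr']


    def make_spaceless_expr():
        # Concatenate the script properties into one character-class body:
        # \p{IsIdeo}\p{Script=Hiragana}\p{Script=Thai}\p{Script=Khmr}
        pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % code for code in SPACELESS_SCRIPTS]
        return ''.join(pieces)


    TOKEN_RE = regex.compile(r"""
        [<SPACELESS>]+ |      # case 1: keep spaceless-script runs glued together
        \S(?:\B\S|\p{M})*     # case 2: simplified standard segmentation
    """.replace('<SPACELESS>', make_spaceless_expr()),
        regex.V1 | regex.WORD | regex.VERBOSE)

    print(TOKEN_RE.findall('សូមស្វាគមន៍ means welcome'))

Running this should print ['សូមស្វាគមន៍', 'means', 'welcome']: the Khmer run survives as a single token, which is the same behavior the new Khmer test asserts through tokenize().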