From 666f7e51fa52d417d5b97a80a79fb3153156a1fe Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Thu, 26 Apr 2018 15:53:07 -0400 Subject: [PATCH] Handle Japanese edge cases in simple_tokenize --- README.md | 6 +++--- setup.py | 2 +- tests/test.py | 11 +++------- tests/test_japanese.py | 42 ++++++++++++++++++++++++++++++++++++++- wordfreq/language_info.py | 32 ++++++++++++++++++++++++----- wordfreq/tokens.py | 4 ++-- 6 files changed, 77 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 995286f..5b7de47 100644 --- a/README.md +++ b/README.md @@ -416,9 +416,9 @@ sources: - Wikipedia, the free encyclopedia (http://www.wikipedia.org) -It contains data from OPUS OpenSubtitles 2016 -(http://opus.lingfil.uu.se/OpenSubtitles2016.php), whose data originates from -the OpenSubtitles project (http://www.opensubtitles.org/). +It contains data from OPUS OpenSubtitles 2018 +(http://opus.nlpl.eu/OpenSubtitles.php), whose data originates from the +OpenSubtitles project (http://www.opensubtitles.org/). It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK, SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al. diff --git a/setup.py b/setup.py index 4680980..620a67e 100755 --- a/setup.py +++ b/setup.py @@ -36,7 +36,7 @@ if sys.version_info < (3, 4): setup( name="wordfreq", - version='2.0', + version='2.0.1', maintainer='Luminoso Technologies, Inc.', maintainer_email='info@luminoso.com', url='http://github.com/LuminosoInsight/wordfreq/', diff --git a/tests/test.py b/tests/test.py index de82d9a..d7fb321 100644 --- a/tests/test.py +++ b/tests/test.py @@ -204,16 +204,11 @@ def test_arabic(): def test_ideographic_fallback(): # Try tokenizing Chinese text as English -- it should remain stuck together. + # + # More complex examples like this, involving the multiple scripts of Japanese, + # are in test_japanese.py. eq_(tokenize('中国文字', 'en'), ['中国文字']) - # When Japanese is tagged with the wrong language, it will be split - # at script boundaries. - ja_text = 'ひらがなカタカナromaji' - eq_( - tokenize(ja_text, 'en'), - ['ひらがな', 'カタカナ', 'romaji'] - ) - def test_other_languages(): # Test that we leave Thai letters stuck together. If we had better Thai support, diff --git a/tests/test_japanese.py b/tests/test_japanese.py index d5a73b3..1cd1efa 100644 --- a/tests/test_japanese.py +++ b/tests/test_japanese.py @@ -1,5 +1,5 @@ from nose.tools import eq_, assert_almost_equal -from wordfreq import tokenize, word_frequency +from wordfreq import tokenize, simple_tokenize, word_frequency def test_tokens(): @@ -7,6 +7,46 @@ def test_tokens(): ['おはよう', 'ござい', 'ます']) +def test_simple_tokenize(): + # When Japanese is run through simple_tokenize -- either because it's + # tagged with the wrong language, or because we want to pass through + # Japanese text without getting MeCab involved -- it will be split at + # boundaries between Japanese and non-Japanese scripts, but all Japanese + # scripts will be stuck together. Here the switch between hiragana + # (ひらがな) and katakana (カタカナ) is not a boundary, but the switch + # between katakana and romaji is. + # + # We used to try to infer word boundaries between hiragana and katakana, + # but this leads to edge cases that are unsolvable without a dictionary. 
+ ja_text = 'ひらがなカタカナromaji' + eq_( + simple_tokenize(ja_text), + ['ひらがなカタカナ', 'romaji'] + ) + + # An example that would be multiple tokens if tokenized as 'ja' via MeCab, + # but sticks together in simple_tokenize + eq_(simple_tokenize('おはようございます'), ['おはようございます']) + + # Names that use the weird possessive marker ヶ, which is technically a + # katakana even though it's being used like a kanji, stay together as one + # token + eq_(simple_tokenize("犬ヶ島"), ["犬ヶ島"]) + + # The word in ConceptNet that made me notice that simple_tokenize used + # to have a problem with the character 々 + eq_(simple_tokenize("晴々しい"), ["晴々しい"]) + + # Explicit word separators are still token boundaries, such as the dot + # between "toner" and "cartridge" in "toner cartridge" + eq_(simple_tokenize("トナー・カートリッジ"), ["トナー", "カートリッジ"]) + + # This word has multiple weird characters that aren't quite kanji in it, + # and is in the dictionary + eq_(simple_tokenize("見ヶ〆料"), ["見ヶ〆料"]) + + + def test_combination(): ohayou_freq = word_frequency('おはよう', 'ja') gozai_freq = word_frequency('ござい', 'ja') diff --git a/wordfreq/language_info.py b/wordfreq/language_info.py index 6c56b64..3b736be 100644 --- a/wordfreq/language_info.py +++ b/wordfreq/language_info.py @@ -8,11 +8,13 @@ from langcodes import Language, best_match # a specific tokenizer for the language or give up. SPACELESS_SCRIPTS = [ # Han ideographs are spaceless, but they don't need to appear in this list - # because they have their own cases in get_language_info and TOKEN_RE. - 'Hiragana', - # We omit katakana because Unicode regular expressions can already - # tokenize sequences of katakana, and omitting it here means we can also - # recognize a switch between hiragana and katakana as a token boundary. + # because _almost_ all of them, except for some exceptional Japanese + # characters, are covered by the \p{IsIdeo} check. Checking for + # Script=Hani and IsIdeo slows down our regexes with huge, redundant + # classes of characters. Instead, we'll list the exceptions below. + + 'Hira', # Hiragana + 'Kana', # Katakana 'Thai', # Thai script 'Khmr', # Khmer script 'Laoo', # Lao script @@ -23,6 +25,26 @@ SPACELESS_SCRIPTS = [ ] +EXTRA_JAPANESE_CHARACTERS = 'ー々〻〆' + +# ー is a lengthening mark that's both hiragana and katakana. Unicode +# segmentation handles it as a special case, but we're overriding standard +# Unicode segmentation, so we need to have the special case too. +# +# 々 and 〻 are "iteration marks" that stand for the previous kanji. So they +# act identically to kanji (ideograms) without technically _being_ kanji. That +# technicality doesn't matter to us. +# +# 〆 is a Japanese abbreviation for "total", and even this can be used in the +# middle of words. Why isn't it just considered an ideograph? I don't know, I +# didn't come up with this language, or Unicode for that matter. +# +# None of this even comes up when we're trying to tokenize Chinese and +# Japanese. It comes up when we're trying to _not_ tokenize a word because +# it's Chinese or Japanese and the tokenization doesn't really matter, which +# happens in ConceptNet. 
+ + def _language_in_list(language, targets, min_score=80): """ A helper function to determine whether this language matches one of the diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py index 8e1bb20..38f875b 100644 --- a/wordfreq/tokens.py +++ b/wordfreq/tokens.py @@ -3,7 +3,7 @@ import unicodedata import logging import langcodes -from .language_info import get_language_info, SPACELESS_SCRIPTS +from .language_info import get_language_info, SPACELESS_SCRIPTS, EXTRA_JAPANESE_CHARACTERS from .preprocess import preprocess_text, smash_numbers # Placeholders for CJK functions that we'll import on demand @@ -18,7 +18,7 @@ logger = logging.getLogger(__name__) def _make_spaceless_expr(): scripts = sorted(SPACELESS_SCRIPTS) pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in scripts] - return ''.join(pieces) + return ''.join(pieces) + EXTRA_JAPANESE_CHARACTERS SPACELESS_EXPR = _make_spaceless_expr()
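
As a quick illustration of what the language_info.py and tokens.py hunks add up
to (not part of the patch itself): the sketch below rebuilds the spaceless-script
expression the same way _make_spaceless_expr now does, then uses it in a
deliberately stripped-down stand-in for wordfreq's real TOKEN_RE, which handles
many more cases. It assumes the third-party `regex` module, which wordfreq
already depends on for \p{...} properties; the pattern and the names
make_spaceless_expr / sketch_simple_tokenize here are illustrative only, not
wordfreq's actual API.

import regex

# The characters this patch appends to the spaceless expression: the
# long-vowel mark, the two iteration marks, and the 〆 abbreviation sign.
EXTRA_JAPANESE_CHARACTERS = 'ー々〻〆'

# Abridged: the real SPACELESS_SCRIPTS in language_info.py also lists
# Thai, Khmer, Lao, and other scripts written without spaces.
SPACELESS_SCRIPTS = ['Hira', 'Kana']


def make_spaceless_expr():
    # Same construction as _make_spaceless_expr: ideographs, the listed
    # scripts, plus the extra Japanese characters as literal class members.
    pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % code
                                for code in sorted(SPACELESS_SCRIPTS)]
    return ''.join(pieces) + EXTRA_JAPANESE_CHARACTERS


SPACELESS_EXPR = make_spaceless_expr()

# One run of spaceless-script characters sticks together as a single token;
# otherwise, take a run of word characters that are *not* in a spaceless
# script. regex.V1 enables the nested-set difference [[\w]--[...]].
SPACELESS_TOKEN = '[' + SPACELESS_EXPR + ']+'
OTHER_TOKEN = '[[\\w]--[' + SPACELESS_EXPR + ']]+'
TOKEN_SKETCH_RE = regex.compile(SPACELESS_TOKEN + '|' + OTHER_TOKEN, regex.V1)


def sketch_simple_tokenize(text):
    return TOKEN_SKETCH_RE.findall(text)


print(sketch_simple_tokenize('ひらがなカタカナromaji'))
# ['ひらがなカタカナ', 'romaji'] -- hiragana and katakana stay together
print(sketch_simple_tokenize('晴々しい'))
# ['晴々しい'] -- 々 no longer breaks the token
print(sketch_simple_tokenize('トナー・カートリッジ'))
# ['トナー', 'カートリッジ'] -- the explicit separator is still a boundary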