Handle Japanese edge cases in simple_tokenize

Robyn Speer 2018-04-26 15:53:07 -04:00
parent 18f176dbf6
commit 666f7e51fa
6 changed files with 77 additions and 20 deletions

View File

@@ -416,9 +416,9 @@ sources:
- Wikipedia, the free encyclopedia (http://www.wikipedia.org)
It contains data from OPUS OpenSubtitles 2016
(http://opus.lingfil.uu.se/OpenSubtitles2016.php), whose data originates from
the OpenSubtitles project (http://www.opensubtitles.org/).
It contains data from OPUS OpenSubtitles 2018
(http://opus.nlpl.eu/OpenSubtitles.php), whose data originates from the
OpenSubtitles project (http://www.opensubtitles.org/).
It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK,
SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al.

View File

@@ -36,7 +36,7 @@ if sys.version_info < (3, 4):
setup(
name="wordfreq",
version='2.0',
version='2.0.1',
maintainer='Luminoso Technologies, Inc.',
maintainer_email='info@luminoso.com',
url='http://github.com/LuminosoInsight/wordfreq/',

View File

@@ -204,16 +204,11 @@ def test_arabic():
def test_ideographic_fallback():
# Try tokenizing Chinese text as English -- it should remain stuck together.
#
# More complex examples like this, involving the multiple scripts of Japanese,
# are in test_japanese.py.
eq_(tokenize('中国文字', 'en'), ['中国文字'])
# When Japanese is tagged with the wrong language, it will be split
# at script boundaries.
ja_text = 'ひらがなカタカナromaji'
eq_(
tokenize(ja_text, 'en'),
['ひらがな', 'カタカナ', 'romaji']
)
def test_other_languages():
# Test that we leave Thai letters stuck together. If we had better Thai support,

View File

@@ -1,5 +1,5 @@
from nose.tools import eq_, assert_almost_equal
from wordfreq import tokenize, word_frequency
from wordfreq import tokenize, simple_tokenize, word_frequency
def test_tokens():
@@ -7,6 +7,46 @@ def test_tokens():
['おはよう', 'ござい', 'ます'])
def test_simple_tokenize():
# When Japanese is run through simple_tokenize -- either because it's
# tagged with the wrong language, or because we want to pass through
# Japanese text without getting MeCab involved -- it will be split at
# boundaries between Japanese and non-Japanese scripts, but all Japanese
# scripts will be stuck together. Here the switch between hiragana
# (ひらがな) and katakana (カタカナ) is not a boundary, but the switch
# between katakana and romaji is.
#
# We used to try to infer word boundaries between hiragana and katakana,
# but this leads to edge cases that are unsolvable without a dictionary.
ja_text = 'ひらがなカタカナromaji'
eq_(
simple_tokenize(ja_text),
['ひらがなカタカナ', 'romaji']
)
# An example that would be multiple tokens if tokenized as 'ja' via MeCab,
# but sticks together in simple_tokenize
eq_(simple_tokenize('おはようございます'), ['おはようございます'])
# Names that use the weird possessive marker ヶ, which is technically a
# katakana even though it's being used like a kanji, stay together as one
# token
eq_(simple_tokenize("犬ヶ島"), ["犬ヶ島"])
# The word in ConceptNet that made me notice that simple_tokenize used
# to have a problem with the character 々
eq_(simple_tokenize("晴々しい"), ["晴々しい"])
# Explicit word separators are still token boundaries, such as the dot
# between "toner" and "cartridge" in "toner cartridge"
eq_(simple_tokenize("トナー・カートリッジ"), ["トナー", "カートリッジ"])
# This word has multiple weird characters that aren't quite kanji in it,
# and is in the dictionary
eq_(simple_tokenize("見ヶ〆料"), ["見ヶ〆料"])
def test_combination():
ohayou_freq = word_frequency('おはよう', 'ja')
gozai_freq = word_frequency('ござい', 'ja')

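A minimal usage sketch of the behavior these tests pin down, assuming this version of wordfreq is installed and MeCab is available for the 'ja' tokenizer; the expected outputs are the ones asserted in the tests above:

from wordfreq import tokenize, simple_tokenize

# MeCab-backed tokenization splits the greeting into dictionary words...
print(tokenize('おはようございます', 'ja'))       # ['おはよう', 'ござい', 'ます']

# ...while simple_tokenize keeps an all-Japanese run together as one token.
print(simple_tokenize('おはようございます'))      # ['おはようございます']

# Mixed text still splits at the boundary between Japanese and non-Japanese scripts.
print(simple_tokenize('ひらがなカタカナromaji'))  # ['ひらがなカタカナ', 'romaji']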
View File

@@ -8,11 +8,13 @@ from langcodes import Language, best_match
# a specific tokenizer for the language or give up.
SPACELESS_SCRIPTS = [
# Han ideographs are spaceless, but they don't need to appear in this list
# because they have their own cases in get_language_info and TOKEN_RE.
'Hiragana',
# We omit katakana because Unicode regular expressions can already
# tokenize sequences of katakana, and omitting it here means we can also
# recognize a switch between hiragana and katakana as a token boundary.
# because _almost_ all of them, except for some exceptional Japanese
# characters, are covered by the \p{IsIdeo} check. Checking for
# Script=Hani and IsIdeo slows down our regexes with huge, redundant
# classes of characters. Instead, we'll list the exceptions below.
'Hira', # Hiragana
'Kana', # Katakana
'Thai', # Thai script
'Khmr', # Khmer script
'Laoo', # Lao script
@@ -23,6 +25,26 @@ SPACELESS_SCRIPTS = [
]
EXTRA_JAPANESE_CHARACTERS = 'ー々〻〆'
# ー is a lengthening mark that's both hiragana and katakana. Unicode
# segmentation handles it as a special case, but we're overriding standard
# Unicode segmentation, so we need to have the special case too.
#
# 々 and 〻 are "iteration marks" that stand for the previous kanji. So they
# act identically to kanji (ideograms) without technically _being_ kanji. That
# technicality doesn't matter to us.
#
# 〆 is a Japanese abbreviation for "total", and even this can be used in the
# middle of words. Why isn't it just considered an ideograph? I don't know, I
# didn't come up with this language, or Unicode for that matter.
#
# None of this even comes up when we're trying to tokenize Chinese and
# Japanese. It comes up when we're trying to _not_ tokenize a word because
# it's Chinese or Japanese and the tokenization doesn't really matter, which
# happens in ConceptNet.
def _language_in_list(language, targets, min_score=80):
"""
A helper function to determine whether this language matches one of the

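A minimal sketch of why the explicit list matters, using the third-party regex module as wordfreq does; the character classes below are trimmed-down stand-ins, not wordfreq's real expression, and the example assumes, as the comment above states, that 々 is not matched by \p{IsIdeo} or the script classes:

import regex

# Simplified spaceless classes: ideographs plus the Hiragana and Katakana scripts.
without_extras = r'[\p{IsIdeo}\p{Script=Hira}\p{Script=Kana}]+'
with_extras    = r'[\p{IsIdeo}\p{Script=Hira}\p{Script=Kana}ー々〻〆]+'

# If 々 falls outside the property classes, it splits the word in two...
print(regex.findall(without_extras, '晴々しい'))  # ['晴', 'しい']
# ...while listing it explicitly keeps the word whole, as test_japanese.py expects.
print(regex.findall(with_extras, '晴々しい'))     # ['晴々しい']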
View File

@@ -3,7 +3,7 @@ import unicodedata
import logging
import langcodes
from .language_info import get_language_info, SPACELESS_SCRIPTS
from .language_info import get_language_info, SPACELESS_SCRIPTS, EXTRA_JAPANESE_CHARACTERS
from .preprocess import preprocess_text, smash_numbers
# Placeholders for CJK functions that we'll import on demand
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
def _make_spaceless_expr():
scripts = sorted(SPACELESS_SCRIPTS)
pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in scripts]
return ''.join(pieces)
return ''.join(pieces) + EXTRA_JAPANESE_CHARACTERS
SPACELESS_EXPR = _make_spaceless_expr()
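A toy illustration of how the spaceless expression gets used as a token class (the real token expression in this file is more involved, so this is only a sketch with a trimmed-down stand-in for SPACELESS_EXPR): any run of spaceless-script characters comes out as one token, while characters outside the class, such as the separator ・, act as boundaries.

import regex

# Trimmed-down stand-in for SPACELESS_EXPR, spliced into a toy token pattern.
spaceless_subset = r'\p{IsIdeo}\p{Script=Hira}\p{Script=Kana}ー々〻〆'
toy_token_re = regex.compile(r'[%s]+|\w+' % spaceless_subset)

# ヶ is Katakana by script, so the name stays together, matching the test above.
print(toy_token_re.findall('犬ヶ島'))                # ['犬ヶ島']
# The middle dot ・ is outside the class (and not a word character), so it separates tokens.
print(toy_token_re.findall('トナー・カートリッジ'))  # ['トナー', 'カートリッジ']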