Merge pull request #56 from LuminosoInsight/japanese-edge-cases

Handle Japanese edge cases in `simple_tokenize`
Lance Nathan 2018-05-01 14:57:45 -04:00 committed by GitHub
commit 316670a234
7 changed files with 101 additions and 20 deletions


@@ -1,3 +1,27 @@
## Version 2.0.1 (2018-05-01)
Fixed edge cases that inserted spurious token boundaries when Japanese text is
run through `simple_tokenize`, because of a few characters that don't match any
of our "spaceless scripts".
It is not a typical situation for Japanese text to be passed through
`simple_tokenize`, because Japanese text should instead use the
Japanese-specific tokenization in `wordfreq.mecab`.
However, some downstream uses of wordfreq have justifiable reasons to pass all
terms through `simple_tokenize`, even terms that may be in Japanese, and in
those cases we want to detect only the most obvious token boundaries.
In this situation, we no longer try to detect script changes, such as between
kanji and katakana, as token boundaries. This particularly allows us to keep
together Japanese words where ヶ appears between kanji, as well as words that
use the iteration mark 々.
This change does not affect any word frequencies. (The Japanese word list uses
`wordfreq.mecab` for tokenization, not `simple_tokenize`.)
## Version 2.0 (2018-03-14)
The big change in this version is that text preprocessing, tokenization, and
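
For reference, here is a short sketch of the behavior described in the 2.0.1
entry above, mirroring the tests added in this commit. It assumes wordfreq
2.0.1 is installed; it is illustrative, not part of the changelog itself.

from wordfreq import simple_tokenize

# The switch from hiragana to katakana is no longer a token boundary, but the
# switch from katakana to romaji still is.
assert simple_tokenize('ひらがなカタカナromaji') == ['ひらがなカタカナ', 'romaji']

# ヶ and the iteration mark 々 no longer introduce spurious boundaries.
assert simple_tokenize('犬ヶ島') == ['犬ヶ島']
assert simple_tokenize('晴々しい') == ['晴々しい']

# Explicit separators such as ・ are still token boundaries.
assert simple_tokenize('トナー・カートリッジ') == ['トナー', 'カートリッジ']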


@@ -416,9 +416,9 @@ sources:
- Wikipedia, the free encyclopedia (http://www.wikipedia.org)

-It contains data from OPUS OpenSubtitles 2016
-(http://opus.lingfil.uu.se/OpenSubtitles2016.php), whose data originates from
-the OpenSubtitles project (http://www.opensubtitles.org/).
+It contains data from OPUS OpenSubtitles 2018
+(http://opus.nlpl.eu/OpenSubtitles.php), whose data originates from the
+OpenSubtitles project (http://www.opensubtitles.org/).

It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK,
SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al.


@@ -36,7 +36,7 @@ if sys.version_info < (3, 4):
setup(
    name="wordfreq",
-    version='2.0',
+    version='2.0.1',
    maintainer='Luminoso Technologies, Inc.',
    maintainer_email='info@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq/',


@@ -204,16 +204,11 @@ def test_arabic():
def test_ideographic_fallback():
    # Try tokenizing Chinese text as English -- it should remain stuck together.
+    #
+    # More complex examples like this, involving the multiple scripts of Japanese,
+    # are in test_japanese.py.
    eq_(tokenize('中国文字', 'en'), ['中国文字'])
-    # When Japanese is tagged with the wrong language, it will be split
-    # at script boundaries.
-    ja_text = 'ひらがなカタカナromaji'
-    eq_(
-        tokenize(ja_text, 'en'),
-        ['ひらがな', 'カタカナ', 'romaji']
-    )

def test_other_languages():
    # Test that we leave Thai letters stuck together. If we had better Thai support,


@@ -1,5 +1,5 @@
from nose.tools import eq_, assert_almost_equal
-from wordfreq import tokenize, word_frequency
+from wordfreq import tokenize, simple_tokenize, word_frequency

def test_tokens():
@@ -7,6 +7,46 @@ def test_tokens():
        ['おはよう', 'ござい', 'ます'])

def test_simple_tokenize():
    # When Japanese is run through simple_tokenize -- either because it's
    # tagged with the wrong language, or because we want to pass through
    # Japanese text without getting MeCab involved -- it will be split at
    # boundaries between Japanese and non-Japanese scripts, but all Japanese
    # scripts will be stuck together. Here the switch between hiragana
    # (ひらがな) and katakana (カタカナ) is not a boundary, but the switch
    # between katakana and romaji is.
    #
    # We used to try to infer word boundaries between hiragana and katakana,
    # but this leads to edge cases that are unsolvable without a dictionary.
    ja_text = 'ひらがなカタカナromaji'
    eq_(
        simple_tokenize(ja_text),
        ['ひらがなカタカナ', 'romaji']
    )

    # An example that would be multiple tokens if tokenized as 'ja' via MeCab,
    # but sticks together in simple_tokenize
    eq_(simple_tokenize('おはようございます'), ['おはようございます'])

    # Names that use the weird possessive marker ヶ, which is technically a
    # katakana even though it's being used like a kanji, stay together as one
    # token
    eq_(simple_tokenize("犬ヶ島"), ["犬ヶ島"])

    # The word in ConceptNet that made me notice that simple_tokenize used
    # to have a problem with the character 々
    eq_(simple_tokenize("晴々しい"), ["晴々しい"])

    # Explicit word separators are still token boundaries, such as the dot
    # between "toner" and "cartridge" in "toner cartridge"
    eq_(simple_tokenize("トナー・カートリッジ"), ["トナー", "カートリッジ"])

    # This word has multiple weird characters that aren't quite kanji in it,
    # and is in the dictionary
    eq_(simple_tokenize("見ヶ〆料"), ["見ヶ〆料"])

def test_combination():
    ohayou_freq = word_frequency('おはよう', 'ja')
    gozai_freq = word_frequency('ござい', 'ja')


@@ -8,11 +8,13 @@ from langcodes import Language, best_match
# a specific tokenizer for the language or give up.
SPACELESS_SCRIPTS = [
    # Han ideographs are spaceless, but they don't need to appear in this list
-    # because they have their own cases in get_language_info and TOKEN_RE.
-    'Hiragana',
-    # We omit katakana because Unicode regular expressions can already
-    # tokenize sequences of katakana, and omitting it here means we can also
-    # recognize a switch between hiragana and katakana as a token boundary.
+    # because _almost_ all of them, except for some exceptional Japanese
+    # characters, are covered by the \p{IsIdeo} check. Checking for
+    # Script=Hani and IsIdeo slows down our regexes with huge, redundant
+    # classes of characters. Instead, we'll list the exceptions below.
+    'Hira',  # Hiragana
+    'Kana',  # Katakana
    'Thai',  # Thai script
    'Khmr',  # Khmer script
    'Laoo',  # Lao script
@@ -23,6 +25,26 @@ SPACELESS_SCRIPTS = [
]

EXTRA_JAPANESE_CHARACTERS = 'ー々〻〆'
# ー is a lengthening mark that's both hiragana and katakana. Unicode
# segmentation handles it as a special case, but we're overriding standard
# Unicode segmentation, so we need to have the special case too.
#
# 々 and 〻 are "iteration marks" that stand for the previous kanji. So they
# act identically to kanji (ideograms) without technically _being_ kanji. That
# technicality doesn't matter to us.
#
# 〆 is a Japanese abbreviation for "total", and even this can be used in the
# middle of words. Why isn't it just considered an ideograph? I don't know, I
# didn't come up with this language, or Unicode for that matter.
#
# None of this even comes up when we're trying to tokenize Chinese and
# Japanese. It comes up when we're trying to _not_ tokenize a word because
# it's Chinese or Japanese and the tokenization doesn't really matter, which
# happens in ConceptNet.
def _language_in_list(language, targets, min_score=80):
    """
    A helper function to determine whether this language matches one of the


@@ -3,7 +3,7 @@ import unicodedata
import logging
import langcodes
-from .language_info import get_language_info, SPACELESS_SCRIPTS
+from .language_info import get_language_info, SPACELESS_SCRIPTS, EXTRA_JAPANESE_CHARACTERS
from .preprocess import preprocess_text, smash_numbers

# Placeholders for CJK functions that we'll import on demand
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
def _make_spaceless_expr():
    scripts = sorted(SPACELESS_SCRIPTS)
    pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in scripts]
-    return ''.join(pieces)
+    return ''.join(pieces) + EXTRA_JAPANESE_CHARACTERS

SPACELESS_EXPR = _make_spaceless_expr()
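
To see how these pieces fit together, here is a minimal, self-contained sketch
of the kind of character class that _make_spaceless_expr builds. The regex
below is a simplified stand-in for wordfreq's real TOKEN_RE, and only a few of
the spaceless scripts are listed for brevity; it is only meant to show why a
run of mixed Japanese scripts now stays together while a switch to Latin text
still produces a boundary.

import regex  # the third-party 'regex' module, which understands \p{...} properties

SPACELESS_SCRIPTS = ['Hira', 'Kana', 'Thai', 'Khmr', 'Laoo']
EXTRA_JAPANESE_CHARACTERS = 'ー々〻〆'

pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % code for code in sorted(SPACELESS_SCRIPTS)]
SPACELESS_EXPR = ''.join(pieces) + EXTRA_JAPANESE_CHARACTERS

# One token per run of "spaceless" characters; otherwise fall back to \w runs.
SIMPLE_RE = regex.compile(r'[%s]+|\w+' % SPACELESS_EXPR)

print(SIMPLE_RE.findall('ひらがなカタカナromaji'))  # ['ひらがなカタカナ', 'romaji']
print(SIMPLE_RE.findall('晴々しい'))                # ['晴々しい'], since 々 is in the class

Because ー, 々, 〻, and 〆 sit inside the same character class as the kana and
the ideographs, they can no longer end a token in the middle of a Japanese word.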