From 666f7e51fa52d417d5b97a80a79fb3153156a1fe Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Thu, 26 Apr 2018 15:53:07 -0400 Subject: [PATCH] Handle Japanese edge cases in simple_tokenize --- README.md | 6 +++--- setup.py | 2 +- tests/test.py | 11 +++------- tests/test_japanese.py | 42 ++++++++++++++++++++++++++++++++++++++- wordfreq/language_info.py | 32 ++++++++++++++++++++++++----- wordfreq/tokens.py | 4 ++-- 6 files changed, 77 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 995286f..5b7de47 100644 --- a/README.md +++ b/README.md @@ -416,9 +416,9 @@ sources: - Wikipedia, the free encyclopedia (http://www.wikipedia.org) -It contains data from OPUS OpenSubtitles 2016 -(http://opus.lingfil.uu.se/OpenSubtitles2016.php), whose data originates from -the OpenSubtitles project (http://www.opensubtitles.org/). +It contains data from OPUS OpenSubtitles 2018 +(http://opus.nlpl.eu/OpenSubtitles.php), whose data originates from the +OpenSubtitles project (http://www.opensubtitles.org/). It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK, SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al. diff --git a/setup.py b/setup.py index 4680980..620a67e 100755 --- a/setup.py +++ b/setup.py @@ -36,7 +36,7 @@ if sys.version_info < (3, 4): setup( name="wordfreq", - version='2.0', + version='2.0.1', maintainer='Luminoso Technologies, Inc.', maintainer_email='info@luminoso.com', url='http://github.com/LuminosoInsight/wordfreq/', diff --git a/tests/test.py b/tests/test.py index de82d9a..d7fb321 100644 --- a/tests/test.py +++ b/tests/test.py @@ -204,16 +204,11 @@ def test_arabic(): def test_ideographic_fallback(): # Try tokenizing Chinese text as English -- it should remain stuck together. + # + # More complex examples like this, involving the multiple scripts of Japanese, + # are in test_japanese.py. eq_(tokenize('中国文字', 'en'), ['中国文字']) - # When Japanese is tagged with the wrong language, it will be split - # at script boundaries. - ja_text = 'ひらがなカタカナromaji' - eq_( - tokenize(ja_text, 'en'), - ['ひらがな', 'カタカナ', 'romaji'] - ) - def test_other_languages(): # Test that we leave Thai letters stuck together. If we had better Thai support, diff --git a/tests/test_japanese.py b/tests/test_japanese.py index d5a73b3..1cd1efa 100644 --- a/tests/test_japanese.py +++ b/tests/test_japanese.py @@ -1,5 +1,5 @@ from nose.tools import eq_, assert_almost_equal -from wordfreq import tokenize, word_frequency +from wordfreq import tokenize, simple_tokenize, word_frequency def test_tokens(): @@ -7,6 +7,46 @@ def test_tokens(): ['おはよう', 'ござい', 'ます']) +def test_simple_tokenize(): + # When Japanese is run through simple_tokenize -- either because it's + # tagged with the wrong language, or because we want to pass through + # Japanese text without getting MeCab involved -- it will be split at + # boundaries between Japanese and non-Japanese scripts, but all Japanese + # scripts will be stuck together. Here the switch between hiragana + # (ひらがな) and katakana (カタカナ) is not a boundary, but the switch + # between katakana and romaji is. + # + # We used to try to infer word boundaries between hiragana and katakana, + # but this leads to edge cases that are unsolvable without a dictionary. 
+ ja_text = 'ひらがなカタカナromaji' + eq_( + simple_tokenize(ja_text), + ['ひらがなカタカナ', 'romaji'] + ) + + # An example that would be multiple tokens if tokenized as 'ja' via MeCab, + # but sticks together in simple_tokenize + eq_(simple_tokenize('おはようございます'), ['おはようございます']) + + # Names that use the weird possessive marker ヶ, which is technically a + # katakana even though it's being used like a kanji, stay together as one + # token + eq_(simple_tokenize("犬ヶ島"), ["犬ヶ島"]) + + # The word in ConceptNet that made me notice that simple_tokenize used + # to have a problem with the character 々 + eq_(simple_tokenize("晴々しい"), ["晴々しい"]) + + # Explicit word separators are still token boundaries, such as the dot + # between "toner" and "cartridge" in "toner cartridge" + eq_(simple_tokenize("トナー・カートリッジ"), ["トナー", "カートリッジ"]) + + # This word has multiple weird characters that aren't quite kanji in it, + # and is in the dictionary + eq_(simple_tokenize("見ヶ〆料"), ["見ヶ〆料"]) + + + def test_combination(): ohayou_freq = word_frequency('おはよう', 'ja') gozai_freq = word_frequency('ござい', 'ja') diff --git a/wordfreq/language_info.py b/wordfreq/language_info.py index 6c56b64..3b736be 100644 --- a/wordfreq/language_info.py +++ b/wordfreq/language_info.py @@ -8,11 +8,13 @@ from langcodes import Language, best_match # a specific tokenizer for the language or give up. SPACELESS_SCRIPTS = [ # Han ideographs are spaceless, but they don't need to appear in this list - # because they have their own cases in get_language_info and TOKEN_RE. - 'Hiragana', - # We omit katakana because Unicode regular expressions can already - # tokenize sequences of katakana, and omitting it here means we can also - # recognize a switch between hiragana and katakana as a token boundary. + # because _almost_ all of them, except for some exceptional Japanese + # characters, are covered by the \p{IsIdeo} check. Checking for + # Script=Hani and IsIdeo slows down our regexes with huge, redundant + # classes of characters. Instead, we'll list the exceptions below. + + 'Hira', # Hiragana + 'Kana', # Katakana 'Thai', # Thai script 'Khmr', # Khmer script 'Laoo', # Lao script @@ -23,6 +25,26 @@ SPACELESS_SCRIPTS = [ ] +EXTRA_JAPANESE_CHARACTERS = 'ー々〻〆' + +# ー is a lengthening mark that's both hiragana and katakana. Unicode +# segmentation handles it as a special case, but we're overriding standard +# Unicode segmentation, so we need to have the special case too. +# +# 々 and 〻 are "iteration marks" that stand for the previous kanji. So they +# act identically to kanji (ideograms) without technically _being_ kanji. That +# technicality doesn't matter to us. +# +# 〆 is a Japanese abbreviation for "total", and even this can be used in the +# middle of words. Why isn't it just considered an ideograph? I don't know, I +# didn't come up with this language, or Unicode for that matter. +# +# None of this even comes up when we're trying to tokenize Chinese and +# Japanese. It comes up when we're trying to _not_ tokenize a word because +# it's Chinese or Japanese and the tokenization doesn't really matter, which +# happens in ConceptNet. 
+ + def _language_in_list(language, targets, min_score=80): """ A helper function to determine whether this language matches one of the diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py index 8e1bb20..38f875b 100644 --- a/wordfreq/tokens.py +++ b/wordfreq/tokens.py @@ -3,7 +3,7 @@ import unicodedata import logging import langcodes -from .language_info import get_language_info, SPACELESS_SCRIPTS +from .language_info import get_language_info, SPACELESS_SCRIPTS, EXTRA_JAPANESE_CHARACTERS from .preprocess import preprocess_text, smash_numbers # Placeholders for CJK functions that we'll import on demand @@ -18,7 +18,7 @@ logger = logging.getLogger(__name__) def _make_spaceless_expr(): scripts = sorted(SPACELESS_SCRIPTS) pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in scripts] - return ''.join(pieces) + return ''.join(pieces) + EXTRA_JAPANESE_CHARACTERS SPACELESS_EXPR = _make_spaceless_expr()
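
As a quick illustration of what the language_info.py and tokens.py hunks add up
to (not part of the patch itself): the sketch below rebuilds the spaceless-script
expression the same way _make_spaceless_expr now does, then uses it in a
deliberately stripped-down stand-in for wordfreq's real TOKEN_RE, which handles
many more cases. It assumes the third-party `regex` module, which wordfreq
already depends on for \p{...} properties; the pattern and the names
make_spaceless_expr / sketch_simple_tokenize here are illustrative only, not
wordfreq's actual API.

import regex

# The characters this patch appends to the spaceless expression: the
# long-vowel mark, the two iteration marks, and the 〆 abbreviation sign.
EXTRA_JAPANESE_CHARACTERS = 'ー々〻〆'

# Abridged: the real SPACELESS_SCRIPTS in language_info.py also lists
# Thai, Khmer, Lao, and other scripts written without spaces.
SPACELESS_SCRIPTS = ['Hira', 'Kana']


def make_spaceless_expr():
    # Same construction as _make_spaceless_expr: ideographs, the listed
    # scripts, plus the extra Japanese characters as literal class members.
    pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % code
                                for code in sorted(SPACELESS_SCRIPTS)]
    return ''.join(pieces) + EXTRA_JAPANESE_CHARACTERS


SPACELESS_EXPR = make_spaceless_expr()

# One run of spaceless-script characters sticks together as a single token;
# otherwise, take a run of word characters that are *not* in a spaceless
# script. regex.V1 enables the nested-set difference [[\w]--[...]].
SPACELESS_TOKEN = '[' + SPACELESS_EXPR + ']+'
OTHER_TOKEN = '[[\\w]--[' + SPACELESS_EXPR + ']]+'
TOKEN_SKETCH_RE = regex.compile(SPACELESS_TOKEN + '|' + OTHER_TOKEN, regex.V1)


def sketch_simple_tokenize(text):
    return TOKEN_SKETCH_RE.findall(text)


print(sketch_simple_tokenize('ひらがなカタカナromaji'))
# ['ひらがなカタカナ', 'romaji'] -- hiragana and katakana stay together
print(sketch_simple_tokenize('晴々しい'))
# ['晴々しい'] -- 々 no longer breaks the token
print(sketch_simple_tokenize('トナー・カートリッジ'))
# ['トナー', 'カートリッジ'] -- the explicit separator is still a boundary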