Handle Japanese edge cases in simple_tokenize

Robyn Speer 2018-04-26 15:53:07 -04:00
parent 18f176dbf6
commit 666f7e51fa
6 changed files with 77 additions and 20 deletions

View File

@@ -416,9 +416,9 @@ sources:
 - Wikipedia, the free encyclopedia (http://www.wikipedia.org)
-It contains data from OPUS OpenSubtitles 2016
-(http://opus.lingfil.uu.se/OpenSubtitles2016.php), whose data originates from
-the OpenSubtitles project (http://www.opensubtitles.org/).
+It contains data from OPUS OpenSubtitles 2018
+(http://opus.nlpl.eu/OpenSubtitles.php), whose data originates from the
+OpenSubtitles project (http://www.opensubtitles.org/).
 It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK,
 SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al.

View File

@@ -36,7 +36,7 @@ if sys.version_info < (3, 4):
 setup(
     name="wordfreq",
-    version='2.0',
+    version='2.0.1',
     maintainer='Luminoso Technologies, Inc.',
     maintainer_email='info@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',

View File

@@ -204,16 +204,11 @@ def test_arabic():
 def test_ideographic_fallback():
     # Try tokenizing Chinese text as English -- it should remain stuck together.
+    #
+    # More complex examples like this, involving the multiple scripts of Japanese,
+    # are in test_japanese.py.
     eq_(tokenize('中国文字', 'en'), ['中国文字'])
-    # When Japanese is tagged with the wrong language, it will be split
-    # at script boundaries.
-    ja_text = 'ひらがなカタカナromaji'
-    eq_(
-        tokenize(ja_text, 'en'),
-        ['ひらがな', 'カタカナ', 'romaji']
-    )
 
 
 def test_other_languages():
     # Test that we leave Thai letters stuck together. If we had better Thai support,

View File

@@ -1,5 +1,5 @@
 from nose.tools import eq_, assert_almost_equal
-from wordfreq import tokenize, word_frequency
+from wordfreq import tokenize, simple_tokenize, word_frequency
 
 
 def test_tokens():
@@ -7,6 +7,46 @@ def test_tokens():
         ['おはよう', 'ござい', 'ます'])
 
 
+def test_simple_tokenize():
+    # When Japanese is run through simple_tokenize -- either because it's
+    # tagged with the wrong language, or because we want to pass through
+    # Japanese text without getting MeCab involved -- it will be split at
+    # boundaries between Japanese and non-Japanese scripts, but all Japanese
+    # scripts will be stuck together. Here the switch between hiragana
+    # (ひらがな) and katakana (カタカナ) is not a boundary, but the switch
+    # between katakana and romaji is.
+    #
+    # We used to try to infer word boundaries between hiragana and katakana,
+    # but this leads to edge cases that are unsolvable without a dictionary.
+    ja_text = 'ひらがなカタカナromaji'
+    eq_(
+        simple_tokenize(ja_text),
+        ['ひらがなカタカナ', 'romaji']
+    )
+
+    # An example that would be multiple tokens if tokenized as 'ja' via MeCab,
+    # but sticks together in simple_tokenize
+    eq_(simple_tokenize('おはようございます'), ['おはようございます'])
+
+    # Names that use the weird possessive marker ヶ, which is technically a
+    # katakana even though it's being used like a kanji, stay together as one
+    # token
+    eq_(simple_tokenize("犬ヶ島"), ["犬ヶ島"])
+
+    # The word in ConceptNet that made me notice that simple_tokenize used
+    # to have a problem with the character 々
+    eq_(simple_tokenize("晴々しい"), ["晴々しい"])
+
+    # Explicit word separators are still token boundaries, such as the dot
+    # between "toner" and "cartridge" in "toner cartridge"
+    eq_(simple_tokenize("トナー・カートリッジ"), ["トナー", "カートリッジ"])
+
+    # This word has multiple weird characters that aren't quite kanji in it,
+    # and is in the dictionary
+    eq_(simple_tokenize("見ヶ〆料"), ["見ヶ〆料"])
+
+
 def test_combination():
     ohayou_freq = word_frequency('おはよう', 'ja')
     gozai_freq = word_frequency('ござい', 'ja')
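
A note on what the new test pins down: the contrast with MeCab-based tokenization can be checked interactively. A minimal sketch, assuming wordfreq 2.0.1 with MeCab available for the 'ja' tokenizer; the expected outputs are the ones asserted by the tests above:

from wordfreq import tokenize, simple_tokenize

print(tokenize('おはようございます', 'ja'))       # ['おはよう', 'ござい', 'ます'] -- MeCab splits the phrase
print(simple_tokenize('おはようございます'))      # ['おはようございます'] -- kept together as one Japanese run
print(simple_tokenize('ひらがなカタカナromaji'))  # ['ひらがなカタカナ', 'romaji'] -- split only at the script boundary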

View File

@@ -8,11 +8,13 @@ from langcodes import Language, best_match
 # a specific tokenizer for the language or give up.
 SPACELESS_SCRIPTS = [
     # Han ideographs are spaceless, but they don't need to appear in this list
-    # because they have their own cases in get_language_info and TOKEN_RE.
-    'Hiragana',
-    # We omit katakana because Unicode regular expressions can already
-    # tokenize sequences of katakana, and omitting it here means we can also
-    # recognize a switch between hiragana and katakana as a token boundary.
+    # because _almost_ all of them, except for some exceptional Japanese
+    # characters, are covered by the \p{IsIdeo} check. Checking for
+    # Script=Hani and IsIdeo slows down our regexes with huge, redundant
+    # classes of characters. Instead, we'll list the exceptions below.
+    'Hira',  # Hiragana
+    'Kana',  # Katakana
     'Thai',  # Thai script
     'Khmr',  # Khmer script
     'Laoo',  # Lao script
@@ -23,6 +25,26 @@ SPACELESS_SCRIPTS = [
 ]
 
+EXTRA_JAPANESE_CHARACTERS = 'ー々〻〆'
+
+# ー is a lengthening mark that's both hiragana and katakana. Unicode
+# segmentation handles it as a special case, but we're overriding standard
+# Unicode segmentation, so we need to have the special case too.
+#
+# 々 and 〻 are "iteration marks" that stand for the previous kanji. So they
+# act identically to kanji (ideograms) without technically _being_ kanji. That
+# technicality doesn't matter to us.
+#
+# 〆 is a Japanese abbreviation for "total", and even this can be used in the
+# middle of words. Why isn't it just considered an ideograph? I don't know, I
+# didn't come up with this language, or Unicode for that matter.
+#
+# None of this even comes up when we're trying to tokenize Chinese and
+# Japanese. It comes up when we're trying to _not_ tokenize a word because
+# it's Chinese or Japanese and the tokenization doesn't really matter, which
+# happens in ConceptNet.
+
+
 def _language_in_list(language, targets, min_score=80):
     """
     A helper function to determine whether this language matches one of the
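
To illustrate why these exceptions are needed, here is a minimal sketch (not part of the commit) using the third-party regex module. It assumes, as the comment above says, that 々 is matched neither by \p{IsIdeo} nor by the Hiragana or Katakana scripts:

import regex

# Character class without the exceptions: ideographs plus the two kana scripts
base = r'[\p{IsIdeo}\p{Script=Hiragana}\p{Script=Katakana}]+'
# The same class with the exceptional Japanese characters added
extended = r'[\p{IsIdeo}\p{Script=Hiragana}\p{Script=Katakana}ー々〻〆]+'

print(regex.findall(base, '晴々しい'))      # expected under that assumption: ['晴', 'しい']
print(regex.findall(extended, '晴々しい'))  # ['晴々しい'], matching the new test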

View File

@@ -3,7 +3,7 @@ import unicodedata
 import logging
 import langcodes
 
-from .language_info import get_language_info, SPACELESS_SCRIPTS
+from .language_info import get_language_info, SPACELESS_SCRIPTS, EXTRA_JAPANESE_CHARACTERS
 from .preprocess import preprocess_text, smash_numbers
 
 # Placeholders for CJK functions that we'll import on demand
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
 def _make_spaceless_expr():
     scripts = sorted(SPACELESS_SCRIPTS)
     pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in scripts]
-    return ''.join(pieces)
+    return ''.join(pieces) + EXTRA_JAPANESE_CHARACTERS
 
 
 SPACELESS_EXPR = _make_spaceless_expr()
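
A toy version of the resulting pattern (a sketch, not wordfreq's actual TOKEN_RE, which handles many more cases; the names TOY_SPACELESS and TOY_TOKEN_RE are illustrative) shows how the spaceless class produces the behavior asserted in test_simple_tokenize:

import regex

# Ideographs, the two kana scripts, and the extra Japanese characters form one
# run; anything else falls back to runs of word characters.
TOY_SPACELESS = r'\p{IsIdeo}\p{Script=Hiragana}\p{Script=Katakana}ー々〻〆'
TOY_TOKEN_RE = regex.compile(r'[%s]+|\w+' % TOY_SPACELESS)

print(TOY_TOKEN_RE.findall('ひらがなカタカナromaji'))  # ['ひらがなカタカナ', 'romaji']
print(TOY_TOKEN_RE.findall('トナー・カートリッジ'))    # ['トナー', 'カートリッジ'] -- ・ is a separator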