Merge pull request #56 from LuminosoInsight/japanese-edge-cases

Handle Japanese edge cases in `simple_tokenize`
Lance Nathan 2018-05-01 14:57:45 -04:00 committed by GitHub
commit 316670a234
7 changed files with 101 additions and 20 deletions


@@ -1,3 +1,27 @@
## Version 2.0.1 (2018-05-01)
Fixed edge cases that inserted spurious token boundaries when Japanese text is
run through `simple_tokenize`, because of a few characters that don't match any
of our "spaceless scripts".
It is not a typical situation for Japanese text to be passed through
`simple_tokenize`, because Japanese text should instead use the
Japanese-specific tokenization in `wordfreq.mecab`.
However, some downstream uses of wordfreq have justifiable reasons to pass all
terms through `simple_tokenize`, even terms that may be in Japanese, and in
those cases we want to detect only the most obvious token boundaries.
In this situation, we no longer try to detect script changes, such as between
kanji and katakana, as token boundaries. This particularly allows us to keep
together Japanese words where ヶ appears between kanji, as well as words that
use the iteration mark 々.
This change does not affect any word frequencies. (The Japanese word list uses
`wordfreq.mecab` for tokenization, not `simple_tokenize`.)
## Version 2.0 (2018-03-14)
The big change in this version is that text preprocessing, tokenization, and
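
For reference, here is a short sketch of the behavior described in the 2.0.1
entry above, mirroring the tests added in this commit. It assumes wordfreq
2.0.1 is installed; it is illustrative, not part of the changelog itself.

from wordfreq import simple_tokenize

# The switch from hiragana to katakana is no longer a token boundary, but the
# switch from katakana to romaji still is.
assert simple_tokenize('ひらがなカタカナromaji') == ['ひらがなカタカナ', 'romaji']

# ヶ and the iteration mark 々 no longer introduce spurious boundaries.
assert simple_tokenize('犬ヶ島') == ['犬ヶ島']
assert simple_tokenize('晴々しい') == ['晴々しい']

# Explicit separators such as ・ are still token boundaries.
assert simple_tokenize('トナー・カートリッジ') == ['トナー', 'カートリッジ']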


@@ -416,9 +416,9 @@ sources:
- Wikipedia, the free encyclopedia (http://www.wikipedia.org)

-It contains data from OPUS OpenSubtitles 2016
-(http://opus.lingfil.uu.se/OpenSubtitles2016.php), whose data originates from
-the OpenSubtitles project (http://www.opensubtitles.org/).
+It contains data from OPUS OpenSubtitles 2018
+(http://opus.nlpl.eu/OpenSubtitles.php), whose data originates from the
+OpenSubtitles project (http://www.opensubtitles.org/).

It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK,
SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al.


@@ -36,7 +36,7 @@ if sys.version_info < (3, 4):
setup(
    name="wordfreq",
-    version='2.0',
+    version='2.0.1',
    maintainer='Luminoso Technologies, Inc.',
    maintainer_email='info@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq/',


@@ -204,16 +204,11 @@ def test_arabic():
def test_ideographic_fallback():
    # Try tokenizing Chinese text as English -- it should remain stuck together.
+    #
+    # More complex examples like this, involving the multiple scripts of Japanese,
+    # are in test_japanese.py.
    eq_(tokenize('中国文字', 'en'), ['中国文字'])
-    # When Japanese is tagged with the wrong language, it will be split
-    # at script boundaries.
-    ja_text = 'ひらがなカタカナromaji'
-    eq_(
-        tokenize(ja_text, 'en'),
-        ['ひらがな', 'カタカナ', 'romaji']
-    )

def test_other_languages():
    # Test that we leave Thai letters stuck together. If we had better Thai support,


@@ -1,5 +1,5 @@
from nose.tools import eq_, assert_almost_equal
-from wordfreq import tokenize, word_frequency
+from wordfreq import tokenize, simple_tokenize, word_frequency

def test_tokens():
@@ -7,6 +7,46 @@ def test_tokens():
        ['おはよう', 'ござい', 'ます'])

def test_simple_tokenize():
    # When Japanese is run through simple_tokenize -- either because it's
    # tagged with the wrong language, or because we want to pass through
    # Japanese text without getting MeCab involved -- it will be split at
    # boundaries between Japanese and non-Japanese scripts, but all Japanese
    # scripts will be stuck together. Here the switch between hiragana
    # (ひらがな) and katakana (カタカナ) is not a boundary, but the switch
    # between katakana and romaji is.
    #
    # We used to try to infer word boundaries between hiragana and katakana,
    # but this leads to edge cases that are unsolvable without a dictionary.
    ja_text = 'ひらがなカタカナromaji'
    eq_(
        simple_tokenize(ja_text),
        ['ひらがなカタカナ', 'romaji']
    )

    # An example that would be multiple tokens if tokenized as 'ja' via MeCab,
    # but sticks together in simple_tokenize
    eq_(simple_tokenize('おはようございます'), ['おはようございます'])

    # Names that use the weird possessive marker ヶ, which is technically a
    # katakana even though it's being used like a kanji, stay together as one
    # token
    eq_(simple_tokenize("犬ヶ島"), ["犬ヶ島"])

    # The word in ConceptNet that made me notice that simple_tokenize used
    # to have a problem with the character 々
    eq_(simple_tokenize("晴々しい"), ["晴々しい"])

    # Explicit word separators are still token boundaries, such as the dot
    # between "toner" and "cartridge" in "toner cartridge"
    eq_(simple_tokenize("トナー・カートリッジ"), ["トナー", "カートリッジ"])

    # This word has multiple weird characters that aren't quite kanji in it,
    # and is in the dictionary
    eq_(simple_tokenize("見ヶ〆料"), ["見ヶ〆料"])

def test_combination():
    ohayou_freq = word_frequency('おはよう', 'ja')
    gozai_freq = word_frequency('ござい', 'ja')


@@ -8,11 +8,13 @@ from langcodes import Language, best_match
# a specific tokenizer for the language or give up.
SPACELESS_SCRIPTS = [
    # Han ideographs are spaceless, but they don't need to appear in this list
-    # because they have their own cases in get_language_info and TOKEN_RE.
-    'Hiragana',
-    # We omit katakana because Unicode regular expressions can already
-    # tokenize sequences of katakana, and omitting it here means we can also
-    # recognize a switch between hiragana and katakana as a token boundary.
+    # because _almost_ all of them, except for some exceptional Japanese
+    # characters, are covered by the \p{IsIdeo} check. Checking for
+    # Script=Hani and IsIdeo slows down our regexes with huge, redundant
+    # classes of characters. Instead, we'll list the exceptions below.
+    'Hira',  # Hiragana
+    'Kana',  # Katakana
    'Thai',  # Thai script
    'Khmr',  # Khmer script
    'Laoo',  # Lao script
@@ -23,6 +25,26 @@ SPACELESS_SCRIPTS = [
]

EXTRA_JAPANESE_CHARACTERS = 'ー々〻〆'
# ー is a lengthening mark that's both hiragana and katakana. Unicode
# segmentation handles it as a special case, but we're overriding standard
# Unicode segmentation, so we need to have the special case too.
#
# 々 and 〻 are "iteration marks" that stand for the previous kanji. So they
# act identically to kanji (ideograms) without technically _being_ kanji. That
# technicality doesn't matter to us.
#
# 〆 is a Japanese abbreviation for "total", and even this can be used in the
# middle of words. Why isn't it just considered an ideograph? I don't know, I
# didn't come up with this language, or Unicode for that matter.
#
# None of this even comes up when we're trying to tokenize Chinese and
# Japanese. It comes up when we're trying to _not_ tokenize a word because
# it's Chinese or Japanese and the tokenization doesn't really matter, which
# happens in ConceptNet.
def _language_in_list(language, targets, min_score=80):
    """
    A helper function to determine whether this language matches one of the


@@ -3,7 +3,7 @@ import unicodedata
import logging
import langcodes
-from .language_info import get_language_info, SPACELESS_SCRIPTS
+from .language_info import get_language_info, SPACELESS_SCRIPTS, EXTRA_JAPANESE_CHARACTERS
from .preprocess import preprocess_text, smash_numbers

# Placeholders for CJK functions that we'll import on demand
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
def _make_spaceless_expr():
    scripts = sorted(SPACELESS_SCRIPTS)
    pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in scripts]
-    return ''.join(pieces)
+    return ''.join(pieces) + EXTRA_JAPANESE_CHARACTERS

SPACELESS_EXPR = _make_spaceless_expr()
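
To see how these pieces fit together, here is a minimal, self-contained sketch
of the kind of character class that _make_spaceless_expr builds. The regex
below is a simplified stand-in for wordfreq's real TOKEN_RE, and only a few of
the spaceless scripts are listed for brevity; it is only meant to show why a
run of mixed Japanese scripts now stays together while a switch to Latin text
still produces a boundary.

import regex  # the third-party 'regex' module, which understands \p{...} properties

SPACELESS_SCRIPTS = ['Hira', 'Kana', 'Thai', 'Khmr', 'Laoo']
EXTRA_JAPANESE_CHARACTERS = 'ー々〻〆'

pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % code for code in sorted(SPACELESS_SCRIPTS)]
SPACELESS_EXPR = ''.join(pieces) + EXTRA_JAPANESE_CHARACTERS

# One token per run of "spaceless" characters; otherwise fall back to \w runs.
SIMPLE_RE = regex.compile(r'[%s]+|\w+' % SPACELESS_EXPR)

print(SIMPLE_RE.findall('ひらがなカタカナromaji'))  # ['ひらがなカタカナ', 'romaji']
print(SIMPLE_RE.findall('晴々しい'))                # ['晴々しい'], since 々 is in the class

Because ー, 々, 〻, and 〆 sit inside the same character class as the kana and
the ideographs, they can no longer end a token in the middle of a Japanese word.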