Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 09:21:37 +00:00
Handle Japanese edge cases in simple_tokenize
commit 666f7e51fa (parent 18f176dbf6)
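In short, simple_tokenize now keeps runs of Japanese scripts together and only splits where Japanese text meets a non-Japanese script or an explicit separator. A minimal sketch of the resulting behavior (not part of the commit; it assumes wordfreq is installed with its MeCab backend, and the expected outputs are copied from the tests added in this diff):

    from wordfreq import tokenize, simple_tokenize

    # simple_tokenize splits at the boundary between Japanese and Latin text,
    # but no longer at the switch from hiragana to katakana.
    print(simple_tokenize('ひらがなカタカナromaji'))   # ['ひらがなカタカナ', 'romaji']

    # Tokenizing as 'ja' still goes through MeCab and splits into words...
    print(tokenize('おはようございます', 'ja'))        # ['おはよう', 'ござい', 'ます']

    # ...while simple_tokenize passes the same text through as one token.
    print(simple_tokenize('おはようございます'))        # ['おはようございます']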
@@ -416,9 +416,9 @@ sources:
 
 Wikipedia, the free encyclopedia (http://www.wikipedia.org)
 
-It contains data from OPUS OpenSubtitles 2016
-(http://opus.lingfil.uu.se/OpenSubtitles2016.php), whose data originates from
-the OpenSubtitles project (http://www.opensubtitles.org/).
+It contains data from OPUS OpenSubtitles 2018
+(http://opus.nlpl.eu/OpenSubtitles.php), whose data originates from the
+OpenSubtitles project (http://www.opensubtitles.org/).
 
 It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK,
 SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al.
setup.py
@@ -36,7 +36,7 @@ if sys.version_info < (3, 4):
 
 setup(
     name="wordfreq",
-    version='2.0',
+    version='2.0.1',
     maintainer='Luminoso Technologies, Inc.',
     maintainer_email='info@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
@@ -204,16 +204,11 @@ def test_arabic():
 
 def test_ideographic_fallback():
     # Try tokenizing Chinese text as English -- it should remain stuck together.
+    #
+    # More complex examples like this, involving the multiple scripts of Japanese,
+    # are in test_japanese.py.
     eq_(tokenize('中国文字', 'en'), ['中国文字'])
-
-    # When Japanese is tagged with the wrong language, it will be split
-    # at script boundaries.
-    ja_text = 'ひらがなカタカナromaji'
-    eq_(
-        tokenize(ja_text, 'en'),
-        ['ひらがな', 'カタカナ', 'romaji']
-    )
 
 
 def test_other_languages():
     # Test that we leave Thai letters stuck together. If we had better Thai support,
@@ -1,5 +1,5 @@
 from nose.tools import eq_, assert_almost_equal
-from wordfreq import tokenize, word_frequency
+from wordfreq import tokenize, simple_tokenize, word_frequency
 
 
 def test_tokens():
@@ -7,6 +7,46 @@ def test_tokens():
         ['おはよう', 'ござい', 'ます'])
 
 
+def test_simple_tokenize():
+    # When Japanese is run through simple_tokenize -- either because it's
+    # tagged with the wrong language, or because we want to pass through
+    # Japanese text without getting MeCab involved -- it will be split at
+    # boundaries between Japanese and non-Japanese scripts, but all Japanese
+    # scripts will be stuck together. Here the switch between hiragana
+    # (ひらがな) and katakana (カタカナ) is not a boundary, but the switch
+    # between katakana and romaji is.
+    #
+    # We used to try to infer word boundaries between hiragana and katakana,
+    # but this leads to edge cases that are unsolvable without a dictionary.
+    ja_text = 'ひらがなカタカナromaji'
+    eq_(
+        simple_tokenize(ja_text),
+        ['ひらがなカタカナ', 'romaji']
+    )
+
+    # An example that would be multiple tokens if tokenized as 'ja' via MeCab,
+    # but sticks together in simple_tokenize
+    eq_(simple_tokenize('おはようございます'), ['おはようございます'])
+
+    # Names that use the weird possessive marker ヶ, which is technically a
+    # katakana even though it's being used like a kanji, stay together as one
+    # token
+    eq_(simple_tokenize("犬ヶ島"), ["犬ヶ島"])
+
+    # The word in ConceptNet that made me notice that simple_tokenize used
+    # to have a problem with the character 々
+    eq_(simple_tokenize("晴々しい"), ["晴々しい"])
+
+    # Explicit word separators are still token boundaries, such as the dot
+    # between "toner" and "cartridge" in "toner cartridge"
+    eq_(simple_tokenize("トナー・カートリッジ"), ["トナー", "カートリッジ"])
+
+    # This word has multiple weird characters that aren't quite kanji in it,
+    # and is in the dictionary
+    eq_(simple_tokenize("見ヶ〆料"), ["見ヶ〆料"])
+
+
+
 def test_combination():
     ohayou_freq = word_frequency('おはよう', 'ja')
     gozai_freq = word_frequency('ござい', 'ja')
@@ -8,11 +8,13 @@ from langcodes import Language, best_match
 # a specific tokenizer for the language or give up.
 SPACELESS_SCRIPTS = [
     # Han ideographs are spaceless, but they don't need to appear in this list
-    # because they have their own cases in get_language_info and TOKEN_RE.
-    'Hiragana',
-    # We omit katakana because Unicode regular expressions can already
-    # tokenize sequences of katakana, and omitting it here means we can also
-    # recognize a switch between hiragana and katakana as a token boundary.
+    # because _almost_ all of them, except for some exceptional Japanese
+    # characters, are covered by the \p{IsIdeo} check. Checking for
+    # Script=Hani and IsIdeo slows down our regexes with huge, redundant
+    # classes of characters. Instead, we'll list the exceptions below.
+
+    'Hira',  # Hiragana
+    'Kana',  # Katakana
     'Thai',  # Thai script
     'Khmr',  # Khmer script
     'Laoo',  # Lao script
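As a quick aside (not part of the commit), the \p{IsIdeo} check mentioned in the new comment can be seen directly with the third-party regex module, which wordfreq uses for its \p{...} classes; ordinary kanji are already covered by it, which is why Script=Hani is not listed:

    import regex

    # The Unicode Ideographic property covers ordinary kanji on its own.
    is_ideo = regex.compile(r'\p{IsIdeo}')
    print(bool(is_ideo.match('中')))   # True  -- a CJK unified ideograph
    print(bool(is_ideo.match('か')))   # False -- hiragana is not ideographic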
@@ -23,6 +25,26 @@ SPACELESS_SCRIPTS = [
 ]
 
 
+EXTRA_JAPANESE_CHARACTERS = 'ー々〻〆'
+
+# ー is a lengthening mark that's both hiragana and katakana. Unicode
+# segmentation handles it as a special case, but we're overriding standard
+# Unicode segmentation, so we need to have the special case too.
+#
+# 々 and 〻 are "iteration marks" that stand for the previous kanji. So they
+# act identically to kanji (ideograms) without technically _being_ kanji. That
+# technicality doesn't matter to us.
+#
+# 〆 is a Japanese abbreviation for "total", and even this can be used in the
+# middle of words. Why isn't it just considered an ideograph? I don't know, I
+# didn't come up with this language, or Unicode for that matter.
+#
+# None of this even comes up when we're trying to tokenize Chinese and
+# Japanese. It comes up when we're trying to _not_ tokenize a word because
+# it's Chinese or Japanese and the tokenization doesn't really matter, which
+# happens in ConceptNet.
+
+
 def _language_in_list(language, targets, min_score=80):
     """
     A helper function to determine whether this language matches one of the
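For reference (an aside, not part of the diff), the four characters in EXTRA_JAPANESE_CHARACTERS have the following formal Unicode names, which can be confirmed with the standard library alone:

    import unicodedata

    for ch in 'ー々〻〆':
        print('U+%04X  %s  %s' % (ord(ch), ch, unicodedata.name(ch)))

    # U+30FC  ー  KATAKANA-HIRAGANA PROLONGED SOUND MARK
    # U+3005  々  IDEOGRAPHIC ITERATION MARK
    # U+303B  〻  VERTICAL IDEOGRAPHIC ITERATION MARK
    # U+3006  〆  IDEOGRAPHIC CLOSING MARK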
@@ -3,7 +3,7 @@ import unicodedata
 import logging
 import langcodes
 
-from .language_info import get_language_info, SPACELESS_SCRIPTS
+from .language_info import get_language_info, SPACELESS_SCRIPTS, EXTRA_JAPANESE_CHARACTERS
 from .preprocess import preprocess_text, smash_numbers
 
 # Placeholders for CJK functions that we'll import on demand
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
 def _make_spaceless_expr():
     scripts = sorted(SPACELESS_SCRIPTS)
     pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in scripts]
-    return ''.join(pieces)
+    return ''.join(pieces) + EXTRA_JAPANESE_CHARACTERS
 
 
 SPACELESS_EXPR = _make_spaceless_expr()
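A minimal sketch (an aside, not wordfreq's actual TOKEN_RE) of how an expression like the one _make_spaceless_expr() returns gets used: dropped into a regex character class so that a whole "spaceless" run stays in one token. The reduced class below is an assumption that keeps only the Japanese-relevant pieces, and it relies on the third-party regex module for \p{...} syntax.

    import regex

    # Hypothetical reduced spaceless class: ideographs, the two kana scripts,
    # and the EXTRA_JAPANESE_CHARACTERS added in this commit.
    spaceless = r'\p{IsIdeo}\p{Script=Hiragana}\p{Script=Katakana}' + 'ー々〻〆'
    spaceless_run = regex.compile('[%s]+' % spaceless)

    print(spaceless_run.findall('ひらがなカタカナromaji'))   # ['ひらがなカタカナ']
    print(spaceless_run.findall('晴々しい'))                  # ['晴々しい'] -- 々 stays attached
    print(spaceless_run.findall('トナー・カートリッジ'))      # ['トナー', 'カートリッジ'] -- ・ is a separator

Without the literal extras, the prolonged sound mark ー (Script=Common rather than Katakana) would fall outside the class and break up katakana words like トナー; appending EXTRA_JAPANESE_CHARACTERS to the class is what keeps them whole.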