Merge pull request #56 from LuminosoInsight/japanese-edge-cases

Handle Japanese edge cases in `simple_tokenize`

Commit: 316670a234
CHANGELOG.md (24 lines changed)

```diff
@@ -1,3 +1,27 @@
+## Version 2.0.1 (2018-05-01)
+
+Fixed edge cases that inserted spurious token boundaries when Japanese text is
+run through `simple_tokenize`, because of a few characters that don't match any
+of our "spaceless scripts".
+
+It is not a typical situation for Japanese text to be passed through
+`simple_tokenize`, because Japanese text should instead use the
+Japanese-specific tokenization in `wordfreq.mecab`.
+
+However, some downstream uses of wordfreq have justifiable reasons to pass all
+terms through `simple_tokenize`, even terms that may be in Japanese, and in
+those cases we want to detect only the most obvious token boundaries.
+
+In this situation, we no longer try to detect script changes, such as between
+kanji and katakana, as token boundaries. This particularly allows us to keep
+together Japanese words where ヶ appears between kanji, as well as words that
+use the iteration mark 々.
+
+This change does not affect any word frequencies. (The Japanese word list uses
+`wordfreq.mecab` for tokenization, not `simple_tokenize`.)
+
+
+
 ## Version 2.0 (2018-03-14)
 
 The big change in this version is that text preprocessing, tokenization, and
```
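To make the new behavior concrete, here is a minimal usage sketch (a sketch only, assuming wordfreq 2.0.1 or later; the expected outputs shown in comments come from the tests added in this pull request):

```python
from wordfreq import simple_tokenize, tokenize

# Hiragana and katakana now stay together in simple_tokenize; only the
# switch to Latin script counts as a token boundary.
print(simple_tokenize('ひらがなカタカナromaji'))   # ['ひらがなカタカナ', 'romaji']

# Words containing ヶ or the iteration mark 々 are no longer split.
print(simple_tokenize('犬ヶ島'))     # ['犬ヶ島']
print(simple_tokenize('晴々しい'))   # ['晴々しい']

# Tokenizing explicitly as Japanese still goes through wordfreq.mecab
# (this call needs the optional MeCab dependency installed).
print(tokenize('おはようございます', 'ja'))   # ['おはよう', 'ござい', 'ます']
```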
```diff
@@ -416,9 +416,9 @@ sources:
 
 - Wikipedia, the free encyclopedia (http://www.wikipedia.org)
 
-It contains data from OPUS OpenSubtitles 2016
-(http://opus.lingfil.uu.se/OpenSubtitles2016.php), whose data originates from
-the OpenSubtitles project (http://www.opensubtitles.org/).
+It contains data from OPUS OpenSubtitles 2018
+(http://opus.nlpl.eu/OpenSubtitles.php), whose data originates from the
+OpenSubtitles project (http://www.opensubtitles.org/).
 
 It contains data from various SUBTLEX word lists: SUBTLEX-US, SUBTLEX-UK,
 SUBTLEX-CH, SUBTLEX-DE, and SUBTLEX-NL, created by Marc Brysbaert et al.
```
setup.py (2 lines changed)

```diff
@@ -36,7 +36,7 @@ if sys.version_info < (3, 4):
 
 setup(
     name="wordfreq",
-    version='2.0',
+    version='2.0.1',
     maintainer='Luminoso Technologies, Inc.',
     maintainer_email='info@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
```
```diff
@@ -204,16 +204,11 @@ def test_arabic():
 
 def test_ideographic_fallback():
     # Try tokenizing Chinese text as English -- it should remain stuck together.
+    #
+    # More complex examples like this, involving the multiple scripts of Japanese,
+    # are in test_japanese.py.
     eq_(tokenize('中国文字', 'en'), ['中国文字'])
 
-    # When Japanese is tagged with the wrong language, it will be split
-    # at script boundaries.
-    ja_text = 'ひらがなカタカナromaji'
-    eq_(
-        tokenize(ja_text, 'en'),
-        ['ひらがな', 'カタカナ', 'romaji']
-    )
-
 
 def test_other_languages():
     # Test that we leave Thai letters stuck together. If we had better Thai support,
```
```diff
@@ -1,5 +1,5 @@
 from nose.tools import eq_, assert_almost_equal
-from wordfreq import tokenize, word_frequency
+from wordfreq import tokenize, simple_tokenize, word_frequency
 
 
 def test_tokens():
```
```diff
@@ -7,6 +7,46 @@ def test_tokens():
         ['おはよう', 'ござい', 'ます'])
 
 
+def test_simple_tokenize():
+    # When Japanese is run through simple_tokenize -- either because it's
+    # tagged with the wrong language, or because we want to pass through
+    # Japanese text without getting MeCab involved -- it will be split at
+    # boundaries between Japanese and non-Japanese scripts, but all Japanese
+    # scripts will be stuck together. Here the switch between hiragana
+    # (ひらがな) and katakana (カタカナ) is not a boundary, but the switch
+    # between katakana and romaji is.
+    #
+    # We used to try to infer word boundaries between hiragana and katakana,
+    # but this leads to edge cases that are unsolvable without a dictionary.
+    ja_text = 'ひらがなカタカナromaji'
+    eq_(
+        simple_tokenize(ja_text),
+        ['ひらがなカタカナ', 'romaji']
+    )
+
+    # An example that would be multiple tokens if tokenized as 'ja' via MeCab,
+    # but sticks together in simple_tokenize
+    eq_(simple_tokenize('おはようございます'), ['おはようございます'])
+
+    # Names that use the weird possessive marker ヶ, which is technically a
+    # katakana even though it's being used like a kanji, stay together as one
+    # token
+    eq_(simple_tokenize("犬ヶ島"), ["犬ヶ島"])
+
+    # The word in ConceptNet that made me notice that simple_tokenize used
+    # to have a problem with the character 々
+    eq_(simple_tokenize("晴々しい"), ["晴々しい"])
+
+    # Explicit word separators are still token boundaries, such as the dot
+    # between "toner" and "cartridge" in "toner cartridge"
+    eq_(simple_tokenize("トナー・カートリッジ"), ["トナー", "カートリッジ"])
+
+    # This word has multiple weird characters that aren't quite kanji in it,
+    # and is in the dictionary
+    eq_(simple_tokenize("見ヶ〆料"), ["見ヶ〆料"])
+
+
 def test_combination():
     ohayou_freq = word_frequency('おはよう', 'ja')
     gozai_freq = word_frequency('ござい', 'ja')
```
```diff
@@ -8,11 +8,13 @@ from langcodes import Language, best_match
 # a specific tokenizer for the language or give up.
 SPACELESS_SCRIPTS = [
     # Han ideographs are spaceless, but they don't need to appear in this list
-    # because they have their own cases in get_language_info and TOKEN_RE.
-    'Hiragana',
-    # We omit katakana because Unicode regular expressions can already
-    # tokenize sequences of katakana, and omitting it here means we can also
-    # recognize a switch between hiragana and katakana as a token boundary.
+    # because _almost_ all of them, except for some exceptional Japanese
+    # characters, are covered by the \p{IsIdeo} check. Checking for
+    # Script=Hani and IsIdeo slows down our regexes with huge, redundant
+    # classes of characters. Instead, we'll list the exceptions below.
+
+    'Hira', # Hiragana
+    'Kana', # Katakana
     'Thai', # Thai script
     'Khmr', # Khmer script
     'Laoo', # Lao script
```
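The rewritten comment above explains why Han doesn't need its own entry: ideographs are already caught by the `\p{IsIdeo}` check, while hiragana and katakana are not. Here is a small sketch of that distinction, assuming the third-party `regex` module that wordfreq builds its token expression with:

```python
import regex  # third-party 'regex' module, which supports \p{...} properties

# Han ideographs already match the Ideographic property, so adding
# Script=Hani to the expression would only duplicate a huge character class.
assert regex.match(r'\p{IsIdeo}', '中')
assert regex.match(r'\p{IsIdeo}', '島')

# Hiragana and katakana are not ideographic, which is why 'Hira' and 'Kana'
# now have to be listed in SPACELESS_SCRIPTS explicitly.
assert not regex.match(r'\p{IsIdeo}', 'ひ')
assert regex.match(r'\p{Script=Hira}', 'ひ')
assert regex.match(r'\p{Script=Kana}', 'カ')
```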
```diff
@@ -23,6 +25,26 @@ SPACELESS_SCRIPTS = [
 ]
 
 
+EXTRA_JAPANESE_CHARACTERS = 'ー々〻〆'
+
+# ー is a lengthening mark that's both hiragana and katakana. Unicode
+# segmentation handles it as a special case, but we're overriding standard
+# Unicode segmentation, so we need to have the special case too.
+#
+# 々 and 〻 are "iteration marks" that stand for the previous kanji. So they
+# act identically to kanji (ideograms) without technically _being_ kanji. That
+# technicality doesn't matter to us.
+#
+# 〆 is a Japanese abbreviation for "total", and even this can be used in the
+# middle of words. Why isn't it just considered an ideograph? I don't know, I
+# didn't come up with this language, or Unicode for that matter.
+#
+# None of this even comes up when we're trying to tokenize Chinese and
+# Japanese. It comes up when we're trying to _not_ tokenize a word because
+# it's Chinese or Japanese and the tokenization doesn't really matter, which
+# happens in ConceptNet.
+
+
 def _language_in_list(language, targets, min_score=80):
     """
     A helper function to determine whether this language matches one of the
```
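For reference, a quick inspection of the four special-cased characters, using only the standard library. The names and general categories in the comments are what Python's Unicode database reports; none of these characters belongs to the Hiragana, Katakana, or Han scripts, which is why the script-based checks miss them:

```python
import unicodedata

# Print the code point, name, and general category of each extra character.
for ch in 'ー々〻〆':
    print('U+%04X' % ord(ch), unicodedata.name(ch), unicodedata.category(ch))

# U+30FC KATAKANA-HIRAGANA PROLONGED SOUND MARK   Lm
# U+3005 IDEOGRAPHIC ITERATION MARK               Lm
# U+303B VERTICAL IDEOGRAPHIC ITERATION MARK      Lm
# U+3006 IDEOGRAPHIC CLOSING MARK                 Lo
```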
```diff
@@ -3,7 +3,7 @@ import unicodedata
 import logging
 import langcodes
 
-from .language_info import get_language_info, SPACELESS_SCRIPTS
+from .language_info import get_language_info, SPACELESS_SCRIPTS, EXTRA_JAPANESE_CHARACTERS
 from .preprocess import preprocess_text, smash_numbers
 
 # Placeholders for CJK functions that we'll import on demand
```
```diff
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
 def _make_spaceless_expr():
     scripts = sorted(SPACELESS_SCRIPTS)
     pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in scripts]
-    return ''.join(pieces)
+    return ''.join(pieces) + EXTRA_JAPANESE_CHARACTERS
 
 
 SPACELESS_EXPR = _make_spaceless_expr()
```
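Putting the pieces together, here is a rough sketch of what the amended expression matches. This is not the full `SPACELESS_EXPR` (the other spaceless scripts are omitted for brevity), just the Japanese-relevant part, again assuming the third-party `regex` module:

```python
import regex

# A trimmed-down version of the spaceless expression: ideographs, the
# Hiragana and Katakana scripts, and the extra Japanese characters that
# _make_spaceless_expr() now appends.
spaceless = r'\p{IsIdeo}\p{Script=Hira}\p{Script=Kana}' + 'ー々〻〆'
token_re = regex.compile('[%s]+' % spaceless)

# Runs of Japanese text come out as single spans, even across the
# characters that used to break them up.
print(token_re.findall('晴々しい'))               # ['晴々しい']
print(token_re.findall('犬ヶ島'))                 # ['犬ヶ島']
print(token_re.findall('ひらがなカタカナromaji'))   # ['ひらがなカタカナ']
```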