From 3155cf27e6bac4297c8b7a3f163100129989e503 Mon Sep 17 00:00:00 2001
From: Robyn Speer
Date: Fri, 1 Jul 2016 18:00:57 -0400
Subject: [PATCH] Fix tokenization of SE Asian and South Asian scripts (#37)

Former-commit-id: 270f6c7ca616165e89ccbfa270d78eabc49782c4
---
 setup.py           |  2 +-
 tests/test.py      |  5 ++++
 wordfreq/tokens.py | 57 ++++++++++++++++++++++++++++++++++------------
 3 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/setup.py b/setup.py
index 46abfe7..cbf7046 100755
--- a/setup.py
+++ b/setup.py
@@ -34,7 +34,7 @@ if sys.version_info < (3, 4):
 
 setup(
     name="wordfreq",
-    version='1.4',
+    version='1.4.1',
     maintainer='Luminoso Technologies, Inc.',
     maintainer_email='info@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
diff --git a/tests/test.py b/tests/test.py
index 6c20ccb..941bd90 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -189,3 +189,8 @@ def test_ideographic_fallback():
 
     eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
         ['การเล่นดนตรี', 'means', 'playing', 'music'])
+    # Test Khmer, a script similar to Thai
+    eq_(tokenize('សូមស្វាគមន៍', 'km'), ['សូមស្វាគមន៍'])
+
+    # Test Hindi -- tokens split where there are spaces, and not where there aren't
+    eq_(tokenize('हिन्दी विक्षनरी', 'hi'), ['हिन्दी', 'विक्षनरी'])
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index cc275f0..0332f05 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -2,25 +2,54 @@ import regex
 import unicodedata
 
 
+# See the documentation inside TOKEN_RE for why we have to handle these
+# scripts specially.
+SPACELESS_SCRIPTS = [
+    'Hiragana',
+    'Thai',  # Thai script
+    'Khmr',  # Khmer script
+    'Laoo',  # Lao script
+    'Mymr',  # Burmese script
+    'Tale',  # Tai Le script
+    'Talu',  # Tai Lü script
+    'Lana',  # Lanna script
+]
+
+
+def _make_spaceless_expr():
+    pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in SPACELESS_SCRIPTS]
+    return ''.join(pieces)
+
+
+SPACELESS_EXPR = _make_spaceless_expr()
+
+
 TOKEN_RE = regex.compile(r"""
     # Case 1: a special case for non-spaced languages
     # -----------------------------------------------
 
-    # When we see characters that are Han ideographs (\p{IsIdeo}), hiragana
-    # (\p{Script=Hiragana}), or Thai (\p{Script=Thai}), we allow a sequence
-    # of those characters to be glued together as a single token.
+    # Some scripts are written without spaces, and the Unicode algorithm
+    # seems to overreact and insert word breaks between all their letters.
+    # When we see sequences of characters in these scripts, we make sure not
+    # to break them up. Such scripts include Han ideographs (\p{IsIdeo}),
+    # hiragana (\p{Script=Hiragana}), and many Southeast Asian scripts such
+    # as Thai and Khmer.
     #
     # Without this case, the standard rule (case 2) would make each character
     # a separate token. This would be the correct behavior for word-wrapping,
     # but a messy failure mode for NLP tokenization.
     #
-    # It is, of course, better to use a tokenizer that is designed for Chinese,
-    # Japanese, or Thai text. This is effectively a fallback for when the wrong
-    # tokenizer is used.
+    # If you have Chinese or Japanese text, it's certainly better to use a
+    # tokenizer that's designed for it. Elsewhere in this file, we have
+    # specific tokenizers that can handle Chinese and Japanese. With this
+    # rule, though, at least this general tokenizer will fail less badly
+    # on those languages.
     #
-    # This rule is listed first so that it takes precedence.
+    # This rule is listed first so that it takes precedence. The placeholder
+    # <SPACELESS> will be replaced by the complex range expression made by
+    # _make_spaceless_expr().
 
-    [\p{IsIdeo}\p{Script=Hiragana}\p{Script=Thai}]+ |
+    [<SPACELESS>]+ |
 
     # Case 2: standard Unicode segmentation
     # -------------------------------------
@@ -34,16 +63,16 @@ TOKEN_RE = regex.compile(r"""
 
     # The rest of the token matches characters that are not any sort of space
     # (\S) and do not cause word breaks according to the Unicode word
-    # segmentation heuristic (\B).
+    # segmentation heuristic (\B), or are categorized as Marks (\p{M}).
 
-    (?:\B\S)*
-""", regex.V1 | regex.WORD | regex.VERBOSE)
+    (?:\B\S|\p{M})*
+""".replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
 
 
 TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
-    [\p{IsIdeo}\p{Script=Hiragana}]+ |
+    [<SPACELESS>]+ |
     [\p{punct}]+ |
-    \S(?:\B\S)*
-""", regex.V1 | regex.WORD | regex.VERBOSE)
+    \S(?:\B\S|\p{M})*
+""".replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
 
 ARABIC_MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
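
For reference, the following is a minimal, self-contained sketch (not part of the patch) of the mechanism it introduces: build the spaceless-script expression, splice it into the pattern in place of the <SPACELESS> placeholder, and the resulting regex keeps a Khmer sequence together as one token. It is illustrative only: the script list is abbreviated to three entries, make_spaceless_expr simply mirrors the patch's _make_spaceless_expr, and the second alternative uses the simplified form from TOKEN_RE_WITH_PUNCTUATION rather than the full Case 2 rule of TOKEN_RE.

    import regex

    # Abbreviated version of the patch's SPACELESS_SCRIPTS list
    SPACELESS_SCRIPTS = ['Hiragana', 'Thai', 'Khmr']


    def make_spaceless_expr():
        # Concatenate the script properties into one character-class body:
        # \p{IsIdeo}\p{Script=Hiragana}\p{Script=Thai}\p{Script=Khmr}
        pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % code for code in SPACELESS_SCRIPTS]
        return ''.join(pieces)


    TOKEN_RE = regex.compile(r"""
        [<SPACELESS>]+ |      # case 1: keep spaceless-script runs glued together
        \S(?:\B\S|\p{M})*     # case 2: simplified standard segmentation
    """.replace('<SPACELESS>', make_spaceless_expr()),
        regex.V1 | regex.WORD | regex.VERBOSE)

    print(TOKEN_RE.findall('សូមស្វាគមន៍ means welcome'))

Running this should print ['សូមស្វាគមន៍', 'means', 'welcome']: the Khmer run survives as a single token, which is the same behavior the new Khmer test asserts through tokenize().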