diff --git a/tests/test.py b/tests/test.py
index 0013dcb..07f8bef 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -100,7 +100,7 @@ def test_tokenization():
     # data
     eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
         ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
-    
+
     eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
         ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
 
@@ -180,3 +180,10 @@ def test_ideographic_fallback():
         tokenize(ja_text, 'en'),
         ['ひらがな', 'カタカナ', 'romaji']
     )
+
+    # Test that we leave Thai letters stuck together. If we had better Thai support,
+    # we would actually split this into a three-word phrase.
+    eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
+    eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
+        ['การเล่นดนตรี', 'means', 'playing', 'music'])
+
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index f4d1339..cc275f0 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -3,23 +3,24 @@ import unicodedata
 
 
 TOKEN_RE = regex.compile(r"""
-    # Case 1: a special case for Chinese and Japanese
+    # Case 1: a special case for non-spaced languages
     # -----------------------------------------------
-    # When we see characters that are Han ideographs (\p{IsIdeo}) or hiragana
-    # (\p{Script=Hiragana}), we allow a sequence of those characters to be
-    # glued together as a single token. Without this case, the standard rule
-    # (case 2) would make each character a separate token. This would be the
-    # correct behavior for word-wrapping, but a messy failure mode for NLP
-    # tokenization.
+    # When we see characters that are Han ideographs (\p{IsIdeo}), hiragana
+    # (\p{Script=Hiragana}), or Thai (\p{Script=Thai}), we allow a sequence
+    # of those characters to be glued together as a single token.
     #
-    # It is, of course, better to use a tokenizer that is designed for Chinese
-    # or Japanese text. This is effectively a fallback for when the wrong
+    # Without this case, the standard rule (case 2) would make each character
+    # a separate token. This would be the correct behavior for word-wrapping,
+    # but a messy failure mode for NLP tokenization.
+    #
+    # It is, of course, better to use a tokenizer that is designed for Chinese,
+    # Japanese, or Thai text. This is effectively a fallback for when the wrong
     # tokenizer is used.
     #
     # This rule is listed first so that it takes precedence.
 
-    [\p{IsIdeo}\p{Script=Hiragana}]+ |
+    [\p{IsIdeo}\p{Script=Hiragana}\p{Script=Thai}]+ |
 
     # Case 2: standard Unicode segmentation
     # -------------------------------------
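
Note (not part of the patch): a minimal, runnable sketch of what Case 1 buys us. Case 2 below is only a rough
approximation of wordfreq's standard Unicode segmentation rule, which lies past the end of this excerpt, and
SKETCH_TOKEN_RE is an illustrative name rather than anything defined in the codebase.

# Sketch only: Case 1 is copied from the patch; Case 2 approximates the
# standard-segmentation rule that the excerpt cuts off.
import regex  # the third-party 'regex' module, which understands \p{...} properties

SKETCH_TOKEN_RE = regex.compile(r"""
    # Case 1: glue runs of Han ideographs, hiragana, or Thai into one token.
    [\p{IsIdeo}\p{Script=Hiragana}\p{Script=Thai}]+ |

    # Case 2 (approximation): a word-like character, then keep going as long
    # as the next character is not a space and not a Unicode word break.
    [\w\p{So}](?:\B\S)*
""", regex.V1 | regex.WORD | regex.VERBOSE)

print(SKETCH_TOKEN_RE.findall('"การเล่นดนตรี" means "playing music"'))
# expected: ['การเล่นดนตรี', 'means', 'playing', 'music']

The real tokenize() does more than this (lowercasing, per-language handling, as the tests above show); the sketch
only exercises the regex fallback that the patch extends to Thai.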