From 51e260b7136565c325781d622ec6eda88a53a72f Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Mon, 22 Feb 2016 14:26:50 -0500 Subject: [PATCH] Leave Thai segments alone in the default regex Our regex already has a special case to leave Chinese and Japanese alone when an appropriate tokenizer for the language isn't being used, as Unicode's default segmentation would make every character into its own token. The same thing happens in Thai, and we don't even *have* an appropriate tokenizer for Thai, so I've added a similar fallback. Former-commit-id: 07f16e6f03cc42436a467eaab935996f22d37d46 --- tests/test.py | 8 +++++++- wordfreq/tokens.py | 21 +++++++++++---------- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/tests/test.py b/tests/test.py index 0013dcb..177ebf4 100644 --- a/tests/test.py +++ b/tests/test.py @@ -100,7 +100,7 @@ def test_tokenization(): # data eq_(tokenize("I don't split at apostrophes, you see.", 'en'), ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']) - + eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True), ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.']) @@ -116,6 +116,12 @@ def test_tokenization(): eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True), ['this', 'text', 'has', '...', 'punctuation', ':)']) + # Test that we leave Thai letters stuck together. If we had better Thai support, + # we would actually split this into a three-word phrase. + eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี']) + eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'), + ['การเล่นดนตรี', 'means', 'playing', 'music']) + def test_casefolding(): eq_(tokenize('WEISS', 'de'), ['weiss']) diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py index f4d1339..cc275f0 100644 --- a/wordfreq/tokens.py +++ b/wordfreq/tokens.py @@ -3,23 +3,24 @@ import unicodedata TOKEN_RE = regex.compile(r""" - # Case 1: a special case for Chinese and Japanese + # Case 1: a special case for non-spaced languages # ----------------------------------------------- - # When we see characters that are Han ideographs (\p{IsIdeo}) or hiragana - # (\p{Script=Hiragana}), we allow a sequence of those characters to be - # glued together as a single token. Without this case, the standard rule - # (case 2) would make each character a separate token. This would be the - # correct behavior for word-wrapping, but a messy failure mode for NLP - # tokenization. + # When we see characters that are Han ideographs (\p{IsIdeo}), hiragana + # (\p{Script=Hiragana}), or Thai (\p{Script=Thai}), we allow a sequence + # of those characters to be glued together as a single token. # - # It is, of course, better to use a tokenizer that is designed for Chinese - # or Japanese text. This is effectively a fallback for when the wrong + # Without this case, the standard rule (case 2) would make each character + # a separate token. This would be the correct behavior for word-wrapping, + # but a messy failure mode for NLP tokenization. + # + # It is, of course, better to use a tokenizer that is designed for Chinese, + # Japanese, or Thai text. This is effectively a fallback for when the wrong # tokenizer is used. # # This rule is listed first so that it takes precedence. - [\p{IsIdeo}\p{Script=Hiragana}]+ | + [\p{IsIdeo}\p{Script=Hiragana}\p{Script=Thai}]+ | # Case 2: standard Unicode segmentation # -------------------------------------