Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 09:21:37 +00:00)

Merge pull request #32 from LuminosoInsight/thai-fix

Leave Thai segments alone in the default regex

Commit 84497429e1
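
In practice, the change keeps a run of Thai letters together as a single token instead of letting the default regex break it apart. A minimal sketch of the resulting behavior, based on the tests added below (only wordfreq's tokenize function is assumed):

from wordfreq import tokenize

# A run of Thai letters stays glued together as one token; mixed Thai/English
# text splits only around the Thai run.
print(tokenize('การเล่นดนตรี', 'th'))
# ['การเล่นดนตรี']
print(tokenize('"การเล่นดนตรี" means "playing music"', 'en'))
# ['การเล่นดนตรี', 'means', 'playing', 'music']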
@@ -100,7 +100,7 @@ def test_tokenization():
    # data
    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])

    eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])

@@ -180,3 +180,10 @@ def test_ideographic_fallback():
        tokenize(ja_text, 'en'),
        ['ひらがな', 'カタカナ', 'romaji']
    )
+
+    # Test that we leave Thai letters stuck together. If we had better Thai support,
+    # we would actually split this into a three-word phrase.
+    eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
+    eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
+        ['การเล่นดนตรี', 'means', 'playing', 'music'])

@@ -3,23 +3,24 @@ import unicodedata

TOKEN_RE = regex.compile(r"""
-    # Case 1: a special case for Chinese and Japanese
+    # Case 1: a special case for non-spaced languages
    # -----------------------------------------------

-    # When we see characters that are Han ideographs (\p{IsIdeo}) or hiragana
-    # (\p{Script=Hiragana}), we allow a sequence of those characters to be
-    # glued together as a single token. Without this case, the standard rule
-    # (case 2) would make each character a separate token. This would be the
-    # correct behavior for word-wrapping, but a messy failure mode for NLP
-    # tokenization.
+    # When we see characters that are Han ideographs (\p{IsIdeo}), hiragana
+    # (\p{Script=Hiragana}), or Thai (\p{Script=Thai}), we allow a sequence
+    # of those characters to be glued together as a single token.
    #
-    # It is, of course, better to use a tokenizer that is designed for Chinese
-    # or Japanese text. This is effectively a fallback for when the wrong
+    # Without this case, the standard rule (case 2) would make each character
+    # a separate token. This would be the correct behavior for word-wrapping,
+    # but a messy failure mode for NLP tokenization.
+    #
+    # It is, of course, better to use a tokenizer that is designed for Chinese,
+    # Japanese, or Thai text. This is effectively a fallback for when the wrong
    # tokenizer is used.
    #
    # This rule is listed first so that it takes precedence.

-    [\p{IsIdeo}\p{Script=Hiragana}]+ |
+    [\p{IsIdeo}\p{Script=Hiragana}\p{Script=Thai}]+ |

    # Case 2: standard Unicode segmentation
    # -------------------------------------
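
To illustrate how Case 1 takes precedence, here is a small self-contained sketch using the regex module. It reuses the Case 1 character class from above but substitutes a plain \w+ for the real Case 2 rule, so it demonstrates the precedence idea rather than reproducing the actual TOKEN_RE:

import regex

# Case 1 (a run of Han ideographs, hiragana, or Thai) is listed first, so a
# Thai run is consumed as a single token before the fallback rule can split it.
# \w+ is only a stand-in for wordfreq's real Case 2.
SKETCH_RE = regex.compile(r"""
    [\p{IsIdeo}\p{Script=Hiragana}\p{Script=Thai}]+ |
    \w+
""", regex.V1 | regex.VERBOSE)

print(SKETCH_RE.findall('"การเล่นดนตรี" means "playing music"'))
# ['การเล่นดนตรี', 'means', 'playing', 'music']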