Mirror of https://github.com/rspeer/wordfreq.git
Leave Thai segments alone in the default regex
Our regex already has a special case to leave Chinese and Japanese alone
when an appropriate tokenizer for the language isn't being used, as
Unicode's default segmentation would make every character into its own
token.
The same thing happens in Thai, and we don't even *have* an appropriate
tokenizer for Thai, so I've added a similar fallback.
Former-commit-id: 07f16e6f03
parent 927d4f45a4
commit 51e260b713
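As a rough illustration of the fallback the commit message describes, here is a minimal sketch (a toy under stated assumptions, not the project's actual TOKEN_RE shown in the diff below): putting a "glue non-spaced scripts together" alternative ahead of a simplified \w+ fallback keeps a Thai run intact as one token.

import regex

# Toy pattern, not wordfreq's real TOKEN_RE: case 1 glues runs of Han
# ideographs, hiragana, and Thai together; \w+ stands in for case 2.
SKETCH_RE = regex.compile(r"""
    [\p{IsIdeo}\p{Script=Hiragana}\p{Script=Thai}]+ |  # case 1: non-spaced scripts
    \w+                                                # case 2 (simplified)
""", regex.V1 | regex.VERBOSE)

print(SKETCH_RE.findall('การเล่นดนตรี means playing music'))
# -> ['การเล่นดนตรี', 'means', 'playing', 'music']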
@@ -116,6 +116,12 @@ def test_tokenization():
     eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
         ['this', 'text', 'has', '...', 'punctuation', ':)'])

+    # Test that we leave Thai letters stuck together. If we had better Thai support,
+    # we would actually split this into a three-word phrase.
+    eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
+    eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
+        ['การเล่นดนตรี', 'means', 'playing', 'music'])
+

 def test_casefolding():
     eq_(tokenize('WEISS', 'de'), ['weiss'])
|
@@ -3,23 +3,24 @@ import unicodedata


 TOKEN_RE = regex.compile(r"""
-    # Case 1: a special case for Chinese and Japanese
+    # Case 1: a special case for non-spaced languages
     # -----------------------------------------------

-    # When we see characters that are Han ideographs (\p{IsIdeo}) or hiragana
-    # (\p{Script=Hiragana}), we allow a sequence of those characters to be
-    # glued together as a single token. Without this case, the standard rule
-    # (case 2) would make each character a separate token. This would be the
-    # correct behavior for word-wrapping, but a messy failure mode for NLP
-    # tokenization.
+    # When we see characters that are Han ideographs (\p{IsIdeo}), hiragana
+    # (\p{Script=Hiragana}), or Thai (\p{Script=Thai}), we allow a sequence
+    # of those characters to be glued together as a single token.
     #
-    # It is, of course, better to use a tokenizer that is designed for Chinese
-    # or Japanese text. This is effectively a fallback for when the wrong
+    # Without this case, the standard rule (case 2) would make each character
+    # a separate token. This would be the correct behavior for word-wrapping,
+    # but a messy failure mode for NLP tokenization.
+    #
+    # It is, of course, better to use a tokenizer that is designed for Chinese,
+    # Japanese, or Thai text. This is effectively a fallback for when the wrong
     # tokenizer is used.
     #
     # This rule is listed first so that it takes precedence.

-    [\p{IsIdeo}\p{Script=Hiragana}]+ |
+    [\p{IsIdeo}\p{Script=Hiragana}\p{Script=Thai}]+ |

     # Case 2: standard Unicode segmentation
     # -------------------------------------