mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
move Thai test to where it makes more sense
This commit is contained in:
parent
07f16e6f03
commit
4ec6b56faa
@ -116,12 +116,6 @@ def test_tokenization():
|
|||||||
eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
|
eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
|
||||||
['this', 'text', 'has', '...', 'punctuation', ':)'])
|
['this', 'text', 'has', '...', 'punctuation', ':)'])
|
||||||
|
|
||||||
# Test that we leave Thai letters stuck together. If we had better Thai support,
|
|
||||||
# we would actually split this into a three-word phrase.
|
|
||||||
eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
|
|
||||||
eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
|
|
||||||
['การเล่นดนตรี', 'means', 'playing', 'music'])
|
|
||||||
|
|
||||||
|
|
||||||
def test_casefolding():
|
def test_casefolding():
|
||||||
eq_(tokenize('WEISS', 'de'), ['weiss'])
|
eq_(tokenize('WEISS', 'de'), ['weiss'])
|
||||||
@ -186,3 +180,10 @@ def test_ideographic_fallback():
|
|||||||
tokenize(ja_text, 'en'),
|
tokenize(ja_text, 'en'),
|
||||||
['ひらがな', 'カタカナ', 'romaji']
|
['ひらがな', 'カタカナ', 'romaji']
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Test that we leave Thai letters stuck together. If we had better Thai support,
|
||||||
|
# we would actually split this into a three-word phrase.
|
||||||
|
eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
|
||||||
|
eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
|
||||||
|
['การเล่นดนตรี', 'means', 'playing', 'music'])
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user