wordfreq/tests/test.py

from wordfreq import (
    word_frequency, available_languages, cB_to_freq,
    top_n_list, random_words, random_ascii_words, tokenize
)
from nose.tools import (
    eq_, assert_almost_equal, assert_greater, raises
)


def test_freq_examples():
    """Check that a stopword outranks a foreign stopword in its own wordlist."""
    # In the English list, 'the' should be more frequent than 'de'...
    the_in_en = word_frequency('the', 'en')
    de_in_en = word_frequency('de', 'en')
    assert_greater(the_in_en, de_in_en)

    # ...and in the Spanish list the ordering should be reversed.
    de_in_es = word_frequency('de', 'es')
    the_in_es = word_frequency('the', 'es')
    assert_greater(de_in_es, the_in_es)


def test_languages():
    """
    Check that enough languages are available, and that a common word can be
    looked up in each of them -- both by the plain language code and by a
    deliberately verbose variant of it.
    """
    # Make sure the number of available languages doesn't decrease
    avail = available_languages()
    assert_greater(len(avail), 15)

    # Look up a word representing laughter in each language, and make sure
    # it has a non-zero frequency.
    for lang in avail:
        if lang in {'zh', 'ja'}:
            text = '笑'
        elif lang == 'ar':
            text = 'ههههه'
        else:
            text = 'lol'
        # Pass (text, lang) as the failure message: this runs once per
        # language, so a bare failure would not say which language broke.
        assert_greater(word_frequency(text, lang), 0, (text, lang))

        # Make up a weirdly verbose language code and make sure
        # we still get it
        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
        assert_greater(word_frequency(text, new_lang_code), 0, (text, new_lang_code))


def test_twitter():
    """
    Check that the 'twitter' wordlist covers enough languages, and that 'rt'
    is more frequent there than in the 'combined' wordlist for each of them.
    """
    avail = available_languages('twitter')
    assert_greater(len(avail), 14)

    for lang in avail:
        twitter_freq = word_frequency('rt', lang, 'twitter')
        combined_freq = word_frequency('rt', lang, 'combined')
        # Pass the language as the failure message: this runs once per
        # language, so a bare failure would not say which language broke.
        assert_greater(twitter_freq, combined_freq, lang)


def test_minimums():
    """Check how the `minimum` argument of word_frequency affects the result."""
    rare_word = 'esquivalience'

    # With no minimum given, this word is expected to have a frequency of 0.
    eq_(word_frequency(rare_word, 'en'), 0)

    # With a minimum, the result is expected to be raised to that value...
    eq_(word_frequency(rare_word, 'en', minimum=1e-6), 1e-6)

    # ...and the same is expected for a very common word.
    eq_(word_frequency('the', 'en', minimum=1), 1)

def test_most_common_words():
    """
    Pin the single most common word of each well-supported language, so that
    any change to it gets noticed.
    """
    # (language code, expected top word), checked in this order.
    expected_top_words = [
        ('ar', 'في'),
        ('de', 'die'),
        ('en', 'the'),
        ('es', 'de'),
        ('fr', 'de'),
        ('it', 'di'),
        ('ja', 'の'),
        ('nl', 'de'),
        ('pt', 'de'),
        ('ru', 'в'),
        ('tr', 'bir'),
        ('zh', '的'),
    ]

    for lang, top_word in expected_top_words:
        # The first entry of a one-item top-N list is the most common word.
        eq_(top_n_list(lang, 1)[0], top_word)


def test_language_matching():
    """
    Check that several Chinese-related language codes give the same
    frequency for '的' as plain 'zh' does.
    """
    freq = word_frequency('的', 'zh')

    related_codes = ('zh-TW', 'zh-CN', 'zh-Hant', 'zh-Hans', 'yue-HK', 'cmn')
    for code in related_codes:
        eq_(word_frequency('的', code), freq)


def test_cB_conversion():
    """Check cB_to_freq against a few known centibel values."""
    # 0 cB is expected to convert to exactly 1.
    eq_(cB_to_freq(0), 1.)

    # Negative values are compared approximately, as the results are floats.
    approximate_cases = [(-100, 0.1), (-600, 1e-6)]
    for centibels, expected_freq in approximate_cases:
        assert_almost_equal(cB_to_freq(centibels), expected_freq)


@raises(ValueError)
def test_failed_cB_conversion():
    """A positive centibel value is expected to make cB_to_freq raise ValueError."""
    positive_centibels = 1
    cB_to_freq(positive_centibels)


def test_tokenization():
    """Check English tokenization of apostrophes, punctuation, symbols and hyphens."""
    sentence = "I don't split at apostrophes, you see."

    # Apostrophes within a word are preserved, so "don't" stays a single
    # token, matching how such words appear in the data.
    words_only = tokenize(sentence, 'en')
    eq_(words_only,
        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])

    # With include_punctuation=True, the comma and the period become tokens.
    with_punctuation = tokenize(sentence, 'en', include_punctuation=True)
    eq_(with_punctuation,
        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])

    # Certain punctuation, such as the period in a domain name, does not
    # inherently split a word.
    domain_tokens = tokenize("Anything is possible at zombo.com", 'en')
    eq_(domain_tokens, ['anything', 'is', 'possible', 'at', 'zombo.com'])

    # A split occurs after a symbol...
    eq_(tokenize('😂test', 'en'), ['😂', 'test'])

    # ...and at splitting punctuation such as a hyphen.
    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])

    # Runs of punctuation are kept together as single tokens.
    punctuated = tokenize('this text has... punctuation :)', 'en',
                          include_punctuation=True)
    eq_(punctuated, ['this', 'text', 'has', '...', 'punctuation', ':)'])


def test_casefolding():
    """Check the case-folded tokens expected for German and Turkish input."""
    # (input text, language code, expected tokens), checked in this order.
    cases = [
        ('WEISS', 'de', ['weiss']),
        ('weiß', 'de', ['weiss']),
        ('İstanbul', 'tr', ['istanbul']),
        ('SIKISINCA', 'tr', ['sıkısınca']),
    ]

    for text, lang, expected_tokens in cases:
        eq_(tokenize(text, lang), expected_tokens)


def test_phrase_freq():
    """
    Check that the frequency of "flip-flop" is non-zero, and that its
    reciprocal is the sum of the reciprocals of the frequencies of 'flip'
    and 'flop'.
    """
    ff = word_frequency("flip-flop", 'en')
    assert_greater(ff, 0)

    flip_freq = word_frequency('flip', 'en')
    flop_freq = word_frequency('flop', 'en')
    expected_reciprocal = 1.0 / flip_freq + 1.0 / flop_freq
    assert_almost_equal(1.0 / ff, expected_reciprocal)


def test_not_really_random():
    """Check the fixed output expected when bits_per_word is 0."""
    # If your xkcd-style password comes out like this, maybe you shouldn't
    # use it
    english_phrase = random_words(nwords=4, lang='en', bits_per_word=0)
    eq_(english_phrase, 'the the the the')

    # This not only tests random_ascii_words, it makes sure we didn't end
    # up with 'eos' as a very common Japanese word
    japanese_ascii_phrase = random_ascii_words(nwords=4, lang='ja',
                                               bits_per_word=0)
    eq_(japanese_ascii_phrase, 'rt rt rt rt')


@raises(ValueError)
def test_not_enough_ascii():
    """Asking for random ASCII words in Chinese is expected to raise ValueError."""
    language = 'zh'
    random_ascii_words(lang=language)


def test_ar():
    """Check the normalization expected when tokenizing Arabic text."""
    # Tatweels are expected to be removed.
    with_tatweels = 'متــــــــعب'
    eq_(tokenize(with_tatweels, 'ar'), ['متعب'])

    # Combining marks are expected to be removed.
    with_combining_marks = 'حَرَكَات'
    eq_(tokenize(with_combining_marks, 'ar'), ['حركات'])

    # An Arabic ligature that is affected by NFKC normalization: it is
    # expected to come out as its two component letters.
    ligature = '\ufefb'
    eq_(tokenize(ligature, 'ar'), ['\u0644\u0627'])


def test_ideographic_fallback():
    """
    Check how Chinese, Japanese and Thai text is tokenized when no
    language-specific tokenizer applies to it.
    """
    # Chinese text tokenized as English should remain stuck together.
    chinese_as_english = tokenize('中国文字', 'en')
    eq_(chinese_as_english, ['中国文字'])

    # When Japanese is tagged with the wrong language, it will be split
    # at script boundaries.
    ja_text = 'ひらがなカタカナromaji'
    japanese_as_english = tokenize(ja_text, 'en')
    eq_(japanese_as_english, ['ひらがな', 'カタカナ', 'romaji'])

    # Thai letters are left stuck together. If we had better Thai support,
    # we would actually split this into a three-word phrase.
    thai_as_thai = tokenize('การเล่นดนตรี', 'th')
    eq_(thai_as_thai, ['การเล่นดนตรี'])

    # The same holds for Thai text embedded in an English sentence.
    thai_in_english = tokenize('"การเล่นดนตรี" means "playing music"', 'en')
    eq_(thai_in_english, ['การเล่นดนตรี', 'means', 'playing', 'music'])
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								from wordfreq import (
-												removed unused imports


Former-commit-id: b9578ae21e58ff40cd63506e4f31e4ddae11f179
											
										
										
											2015-07-07 20:21:22 +00:00
+								    word_frequency, available_languages, cB_to_freq,
-												Express the combining of word frequencies in an explicitly associative and commutative way.


Former-commit-id: 32b4033d6399f10e10dd3f1c9194847a7f01f302
											
										
										
											2015-07-09 19:26:54 +00:00
+								    top_n_list, random_words, random_ascii_words, tokenize
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								)
 								from nose.tools import (
-												removed unused imports


Former-commit-id: b9578ae21e58ff40cd63506e4f31e4ddae11f179
											
										
										
											2015-07-07 20:21:22 +00:00
+								    eq_, assert_almost_equal, assert_greater, raises
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								)
 								def test_freq_examples():
 								    # Stopwords are most common in the correct language
 								    assert_greater(word_frequency('the', 'en'),
 								                   word_frequency('de', 'en'))
 								    assert_greater(word_frequency('de', 'es'),
 								                   word_frequency('the', 'es'))
 								def test_languages():
 								    # Make sure the number of available languages doesn't decrease
 								    avail = available_languages()
-												add tests for Turkish


Former-commit-id: fc93c8dc9c66a786914137729c42209be0c4acd0
											
										
										
											2015-09-04 20:40:11 +00:00
+								    assert_greater(len(avail), 15)
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
-												fix Arabic test, where 'lol' is no longer common


Former-commit-id: da79dfb24786c4295c18357c018f31c204d1e9bc
											
										
										
											2016-05-11 21:01:47 +00:00
+								    # Look up a word representing laughter in each language, and make sure
 								    # it has a non-zero frequency.
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								    for lang in avail:
-												refactor the tokenizer, add `include_punctuation` option


Former-commit-id: e8e6e0a23196abf0ecc0cf3bc72ba9943226d119
											
										
										
											2015-09-15 17:26:09 +00:00
+								        if lang in {'zh', 'ja'}:
 								            text = '笑'
-												fix Arabic test, where 'lol' is no longer common


Former-commit-id: da79dfb24786c4295c18357c018f31c204d1e9bc
											
										
										
											2016-05-11 21:01:47 +00:00
+								        elif lang == 'ar':
 								            text = 'ههههه'
-												refactor the tokenizer, add `include_punctuation` option


Former-commit-id: e8e6e0a23196abf0ecc0cf3bc72ba9943226d119
											
										
										
											2015-09-15 17:26:09 +00:00
+								        else:
 								            text = 'lol'
 								        assert_greater(word_frequency(text, lang), 0)
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
-												refactor the tokenizer, add `include_punctuation` option


Former-commit-id: e8e6e0a23196abf0ecc0cf3bc72ba9943226d119
											
										
										
											2015-09-15 17:26:09 +00:00
+								        # Make up a weirdly verbose language code and make sure
 								        # we still get it
 								        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
-												fix Arabic test, where 'lol' is no longer common


Former-commit-id: da79dfb24786c4295c18357c018f31c204d1e9bc
											
										
										
											2016-05-11 21:01:47 +00:00
+								        assert_greater(word_frequency(text, new_lang_code), 0, (text, new_lang_code))
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
-												test and document new twitter wordlists


Former-commit-id: 14cb40810019eb8ca5d1350be46c41c645bf12b6
											
										
										
											2015-07-01 21:53:38 +00:00
+								def test_twitter():
 								    avail = available_languages('twitter')
-												add tests for Turkish


Former-commit-id: fc93c8dc9c66a786914137729c42209be0c4acd0
											
										
										
											2015-09-04 20:40:11 +00:00
+								    assert_greater(len(avail), 14)
-												test and document new twitter wordlists


Former-commit-id: 14cb40810019eb8ca5d1350be46c41c645bf12b6
											
										
										
											2015-07-01 21:53:38 +00:00
 								    for lang in avail:
 								        assert_greater(word_frequency('rt', lang, 'twitter'),
 								                       word_frequency('rt', lang, 'combined'))
-												updated minimum


Former-commit-id: 59c03e24118ffbd4159e1162a6a64ebf38bf4edb
											
										
										
											2015-07-07 19:46:33 +00:00
+								def test_minimums():
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								    eq_(word_frequency('esquivalience', 'en'), 0)
-												changed default to minimum for word_frequency


Former-commit-id: 9aa773aa2bba694c691d1ea7b18e16a64fe7695e
											
										
										
											2015-07-07 19:03:26 +00:00
+								    eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6)
-												updated minimum


Former-commit-id: 59c03e24118ffbd4159e1162a6a64ebf38bf4edb
											
										
										
											2015-07-07 19:46:33 +00:00
+								    eq_(word_frequency('the', 'en', minimum=1), 1)
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
 								def test_most_common_words():
 								    # If something causes the most common words in well-supported languages to
 								    # change, we should know.
 								    def get_most_common(lang):
 								        """
 								        Return the single most common word in the language.
 								        """
 								        return top_n_list(lang, 1)[0]
 								    eq_(get_most_common('ar'), 'في')
-												Switch to a more precise centibel scale.


Former-commit-id: 7862a4d2b6b2e756f52b405e28e5049b7ef93bc2
											
										
										
											2015-06-22 21:36:30 +00:00
+								    eq_(get_most_common('de'), 'die')
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								    eq_(get_most_common('en'), 'the')
 								    eq_(get_most_common('es'), 'de')
 								    eq_(get_most_common('fr'), 'de')
 								    eq_(get_most_common('it'), 'di')
 								    eq_(get_most_common('ja'), 'の')
 								    eq_(get_most_common('nl'), 'de')
 								    eq_(get_most_common('pt'), 'de')
 								    eq_(get_most_common('ru'), 'в')
-												add tests for Turkish


Former-commit-id: fc93c8dc9c66a786914137729c42209be0c4acd0
											
										
										
											2015-09-04 20:40:11 +00:00
+								    eq_(get_most_common('tr'), 'bir')
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								    eq_(get_most_common('zh'), '的')
 								def test_language_matching():
 								    freq = word_frequency('的', 'zh')
 								    eq_(word_frequency('的', 'zh-TW'), freq)
 								    eq_(word_frequency('的', 'zh-CN'), freq)
 								    eq_(word_frequency('的', 'zh-Hant'), freq)
 								    eq_(word_frequency('的', 'zh-Hans'), freq)
 								    eq_(word_frequency('的', 'yue-HK'), freq)
 								    eq_(word_frequency('的', 'cmn'), freq)
-												Switch to a more precise centibel scale.


Former-commit-id: 7862a4d2b6b2e756f52b405e28e5049b7ef93bc2
											
										
										
											2015-06-22 21:36:30 +00:00
+								def test_cB_conversion():
 								    eq_(cB_to_freq(0), 1.)
 								    assert_almost_equal(cB_to_freq(-100), 0.1)
 								    assert_almost_equal(cB_to_freq(-600), 1e-6)
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
 								@raises(ValueError)
-												Switch to a more precise centibel scale.


Former-commit-id: 7862a4d2b6b2e756f52b405e28e5049b7ef93bc2
											
										
										
											2015-06-22 21:36:30 +00:00
+								def test_failed_cB_conversion():
 								    cB_to_freq(1)
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
 								def test_tokenization():
 								    # We preserve apostrophes within words, so "can't" is a single word in the
-												updated comments


Former-commit-id: 131b916c579b8c45db0444a6eaffe51ef419039b
											
										
										
											2015-07-17 18:50:12 +00:00
+								    # data
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
+								    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
 								        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
-												Leave Thai segments alone in the default regex

Our regex already has a special case to leave Chinese and Japanese alone
when an appropriate tokenizer for the language isn't being used, as
Unicode's default segmentation would make every character into its own
token.

The same thing happens in Thai, and we don't even *have* an appropriate
tokenizer for Thai, so I've added a similar fallback.


Former-commit-id: 07f16e6f03cc42436a467eaab935996f22d37d46
											
										
										
											2016-02-22 19:26:50 +00:00
-												refactor the tokenizer, add `include_punctuation` option


Former-commit-id: e8e6e0a23196abf0ecc0cf3bc72ba9943226d119
											
										
										
											2015-09-15 17:26:09 +00:00
+								    eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
 								        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
-												updated tests for emoji splitting


Former-commit-id: 3bcb3e84a111ecba5b663ce18697109641b6a185
											
										
										
											2015-06-25 15:25:51 +00:00
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
+								    # Certain punctuation does not inherently split a word.
 								    eq_(tokenize("Anything is possible at zombo.com", 'en'),
 								        ['anything', 'is', 'possible', 'at', 'zombo.com'])
 								    # Splits occur after symbols, and at splitting punctuation such as hyphens.
-												updated tests for emoji splitting


Former-commit-id: 3bcb3e84a111ecba5b663ce18697109641b6a185
											
										
										
											2015-06-25 15:25:51 +00:00
+								    eq_(tokenize('😂test', 'en'), ['😂', 'test'])
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
+								    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])
-												updated tests for emoji splitting


Former-commit-id: 3bcb3e84a111ecba5b663ce18697109641b6a185
											
										
										
											2015-06-25 15:25:51 +00:00
-												refactor the tokenizer, add `include_punctuation` option


Former-commit-id: e8e6e0a23196abf0ecc0cf3bc72ba9943226d119
											
										
										
											2015-09-15 17:26:09 +00:00
+								    eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
 								        ['this', 'text', 'has', '...', 'punctuation', ':)'])
-												case-fold instead of just lowercasing tokens


Former-commit-id: 638467f60022c6933a9a2fb8ff1280d39e9a3d70
											
										
										
											2015-06-30 19:14:02 +00:00
 								def test_casefolding():
 								    eq_(tokenize('WEISS', 'de'), ['weiss'])
 								    eq_(tokenize('weiß', 'de'), ['weiss'])
-												add tests for Turkish


Former-commit-id: fc93c8dc9c66a786914137729c42209be0c4acd0
											
										
										
											2015-09-04 20:40:11 +00:00
+								    eq_(tokenize('İstanbul', 'tr'), ['istanbul'])
 								    eq_(tokenize('SIKISINCA', 'tr'), ['sıkısınca'])
-												case-fold instead of just lowercasing tokens


Former-commit-id: 638467f60022c6933a9a2fb8ff1280d39e9a3d70
											
										
										
											2015-06-30 19:14:02 +00:00
-												updated tests for emoji splitting


Former-commit-id: 3bcb3e84a111ecba5b663ce18697109641b6a185
											
										
										
											2015-06-25 15:25:51 +00:00
+								def test_phrase_freq():
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
+								    ff = word_frequency("flip-flop", 'en')
 								    assert_greater(ff, 0)
-												updated tests


Former-commit-id: ca66a5f883d4a19c2b9fa81e1f6c3c8309924f69
											
										
										
											2015-07-07 18:13:28 +00:00
+								    assert_almost_equal(
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
+								        1.0 / ff,
 								        1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
-												Express the combining of word frequencies in an explicitly associative and commutative way.


Former-commit-id: 32b4033d6399f10e10dd3f1c9194847a7f01f302
											
										
										
											2015-07-09 19:26:54 +00:00
+								    )
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
 								def test_not_really_random():
 								    # If your xkcd-style password comes out like this, maybe you shouldn't
 								    # use it
-												revert changes to test_not_really_random


Former-commit-id: bbf7b9de34f4b1f7ff3ac4a3b3789f5f45fa1a86
											
										
										
											2015-06-30 15:29:14 +00:00
+								    eq_(random_words(nwords=4, lang='en', bits_per_word=0),
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								        'the the the the')
 								    # This not only tests random_ascii_words, it makes sure we didn't end
 								    # up with 'eos' as a very common Japanese word
-												revert changes to test_not_really_random


Former-commit-id: bbf7b9de34f4b1f7ff3ac4a3b3789f5f45fa1a86
											
										
										
											2015-06-30 15:29:14 +00:00
+								    eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),
-												update data using new build


Former-commit-id: f9a9ee7a82fb122124aec58a4fbf14cccaf27c35
											
										
										
											2015-07-01 15:18:39 +00:00
+								        'rt rt rt rt')
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
 								@raises(ValueError)
 								def test_not_enough_ascii():
 								    random_ascii_words(lang='zh')
-												added arabic tests


Former-commit-id: f83d31a35774b08d40ab5c6a9fb8c09616e71819
											
										
										
											2015-07-07 19:10:59 +00:00
-												ensure removal of tatweels (hopefully)


Former-commit-id: 173278fdd3e5554063228abd1f2dac8f771995f6
											
										
										
											2015-07-20 20:48:36 +00:00
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
+								def test_ar():
-												ensure removal of tatweels (hopefully)


Former-commit-id: 173278fdd3e5554063228abd1f2dac8f771995f6
											
										
										
											2015-07-20 20:48:36 +00:00
+								    # Remove tatweels
-												added arabic tests


Former-commit-id: f83d31a35774b08d40ab5c6a9fb8c09616e71819
											
										
										
											2015-07-07 19:10:59 +00:00
+								    eq_(
 								        tokenize('متــــــــعب', 'ar'),
 								        ['متعب']
 								    )
-												ensure removal of tatweels (hopefully)


Former-commit-id: 173278fdd3e5554063228abd1f2dac8f771995f6
											
										
										
											2015-07-20 20:48:36 +00:00
+								    # Remove combining marks
-												added arabic tests


Former-commit-id: f83d31a35774b08d40ab5c6a9fb8c09616e71819
											
										
										
											2015-07-07 19:10:59 +00:00
+								    eq_(
 								        tokenize('حَرَكَات', 'ar'),
 								        ['حركات']
 								    )
-												ensure removal of tatweels (hopefully)


Former-commit-id: 173278fdd3e5554063228abd1f2dac8f771995f6
											
										
										
											2015-07-20 20:48:36 +00:00
 								    eq_(
-												Document the NFKC-normalized ligature in the Arabic test.


Former-commit-id: 41e1dd41d82358fd44f972e501c8586d0bbd64a2
											
										
										
											2015-08-03 15:09:44 +00:00
+								        tokenize('\ufefb', 'ar'),  # An Arabic ligature...
 								        ['\u0644\u0627']  # ...that is affected by NFKC normalization
-												ensure removal of tatweels (hopefully)


Former-commit-id: 173278fdd3e5554063228abd1f2dac8f771995f6
											
										
										
											2015-07-20 20:48:36 +00:00
+								    )
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
 								def test_ideographic_fallback():
-												tokenize Chinese using jieba and our own frequencies


Former-commit-id: 2327f2e4d61c25b29a00f8cbb4387cf59f520628
											
										
										
											2015-09-05 07:16:56 +00:00
+								    # Try tokenizing Chinese text as English -- it should remain stuck together.
 								    eq_(tokenize('中国文字', 'en'), ['中国文字'])
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
 								    # When Japanese is tagged with the wrong language, it will be split
 								    # at script boundaries.
 								    ja_text = 'ひらがなカタカナromaji'
 								    eq_(
 								        tokenize(ja_text, 'en'),
 								        ['ひらがな', 'カタカナ', 'romaji']
 								    )
-												move Thai test to where it makes more sense


Former-commit-id: 4ec6b56faab4bc5a698e48cca1493ed45c9de6ea
											
										
										
											2016-03-10 16:56:04 +00:00
 								    # Test that we leave Thai letters stuck together. If we had better Thai support,
 								    # we would actually split this into a three-word phrase.
 								    eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
 								    eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
 								        ['การเล่นดนตรี', 'means', 'playing', 'music'])