# wordfreq/tests/test.py

from wordfreq import (
    word_frequency, available_languages, cB_to_freq,
    top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize
)
from nose.tools import (
    eq_, assert_almost_equal, assert_greater, raises
)


def test_freq_examples():
    # A stopword should score higher in the word list of its own language
    # than a stopword borrowed from the other language does.
    the_en = word_frequency("the", "en")
    de_en = word_frequency("de", "en")
    assert_greater(the_en, de_en)

    # The ranking is reversed when both words are looked up under 'es'.
    de_es = word_frequency("de", "es")
    the_es = word_frequency("the", "es")
    assert_greater(de_es, the_es)


# Sanity-checking the Twitter word list requires a word that is certain to
# show up in tweets: a common way of writing laughter. This table maps a
# language code to the laughter word used for that language; any language
# that has no entry here falls back to 'haha'.
LAUGHTER_WORDS = {
    "en": "lol",
    "hi": "lol",
    "cs": "lol",
    "ru": "лол",
    "zh": "笑",
    "ja": "笑",
    "ar": "ﻪﻬﻬﻬﻫ",
    "fa": "خخخخ",
    "ca": "jaja",
    "es": "jaja",
    "fr": "ptdr",
    "pt": "kkkk",
    "he": "חחח",
    "bg": "ахаха",
    "uk": "хаха",
    "bn": "হা হা",
    "mk": "хаха",
}


def test_languages():
    # Guard against the set of supported languages shrinking.
    languages = available_languages()
    assert_greater(len(languages), 26)

    for code in languages:
        # The digit '2' must have a nonzero frequency in the default word
        # list of every language.
        assert_greater(word_frequency("2", code), 0, code)

        # The same lookup must still succeed when the language is spelled as
        # an upper-cased code padded with extra subtags.
        verbose_code = "{}-001-x-fake-extension".format(code.upper())
        assert_greater(word_frequency("2", verbose_code), 0, verbose_code)


def test_twitter():
    # The 'twitter' word list has to exist for more than 15 languages.
    twitter_langs = available_languages('twitter')
    assert_greater(len(twitter_langs), 15)

    for code in twitter_langs:
        # 'rt' is expected to score higher in the Twitter list than in the
        # combined list.
        rt_on_twitter = word_frequency("rt", code, "twitter")
        rt_combined = word_frequency("rt", code, "combined")
        assert_greater(rt_on_twitter, rt_combined)

        # The laughter word chosen for this language (or the 'haha'
        # fallback) must have a nonzero frequency in the Twitter list.
        laugh = LAUGHTER_WORDS.get(code, "haha")
        laugh_freq = word_frequency(laugh, code, wordlist="twitter")
        assert_greater(laugh_freq, 0, (laugh, code))


def test_minimums():
    # A word expected to be absent from the English list yields 0, or the
    # requested minimum when one is given.
    absent_word = "esquivalience"
    eq_(word_frequency(absent_word, "en"), 0)
    eq_(word_frequency(absent_word, "en", minimum=1e-6), 1e-6)

    # The minimum also acts as a floor for a word that is in the list.
    eq_(word_frequency("the", "en", minimum=1), 1)


def test_most_common_words():
    # The top-ranked word of each well-supported language is pinned here, so
    # that anything which changes it gets noticed.
    expected_top_words = [
        ("ar", "في"),
        ("de", "die"),
        ("en", "the"),
        ("es", "de"),
        ("fr", "de"),
        ("it", "di"),
        ("ja", "の"),
        ("nl", "de"),
        ("pl", "w"),
        ("pt", "de"),
        ("ru", "в"),
        ("tr", "bir"),
        ("zh", "的"),
    ]

    for lang, top_word in expected_top_words:
        # top_n_list(lang, 1) is a one-element list holding the most common
        # word of the language.
        eq_(top_n_list(lang, 1)[0], top_word)


def test_language_matching():
    # Each of these language tags must give exactly the same frequency for
    # the word as the plain 'zh' tag does.
    baseline = word_frequency("的", "zh")
    equivalent_tags = ("zh-TW", "zh-CN", "zh-Hant", "zh-Hans", "yue-HK", "cmn")

    for tag in equivalent_tags:
        eq_(word_frequency("的", tag), baseline)


def test_cB_conversion():
    # Zero centibels must convert to a frequency of exactly 1.
    eq_(cB_to_freq(0), 1.0)

    # Negative values are compared approximately: -100 cB should be about
    # 0.1, and -600 cB about 1e-6.
    for centibels, expected_freq in ((-100, 0.1), (-600, 1e-6)):
        assert_almost_equal(cB_to_freq(centibels), expected_freq)


@raises(ValueError)
def test_failed_cB_conversion():
    # Converting a positive centibel value is expected to raise ValueError;
    # the decorator turns that exception into a pass.
    positive_centibels = 1
    cB_to_freq(positive_centibels)


def test_tokenization():
    # An apostrophe inside a word does not split it, so "don't" comes out as
    # a single token.
    sentence = "I don't split at apostrophes, you see."
    eq_(
        tokenize(sentence, "en"),
        ["i", "don't", "split", "at", "apostrophes", "you", "see"],
    )

    # With include_punctuation=True, the comma and the final period become
    # tokens of their own.
    eq_(
        tokenize(sentence, "en", include_punctuation=True),
        ["i", "don't", "split", "at", "apostrophes", ",", "you", "see", "."],
    )

    # Some punctuation, such as the period in a domain name, does not break
    # a word apart.
    eq_(
        tokenize("Anything is possible at zombo.com", "en"),
        ["anything", "is", "possible", "at", "zombo.com"],
    )

    # A split happens after a symbol, and at splitting punctuation such as
    # a hyphen.
    eq_(tokenize("😂test", "en"), ["😂", "test"])

    eq_(tokenize("flip-flop", "en"), ["flip", "flop"])

    eq_(
        tokenize("this text has... punctuation :)", "en", include_punctuation=True),
        ["this", "text", "has", "...", "punctuation", ":)"],
    )

    # Emoji built from several codepoints (a skin-tone modifier sequence, a
    # zero-width-joiner sequence) must each remain one token; this relies on
    # the Unicode segmentation rules in use being current.
    eq_(tokenize("emoji test 🧕🏽", "en"), ["emoji", "test", "🧕🏽"])

    eq_(
        tokenize("👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", "en"),
        ["👨‍🎤", "planet", "earth", "is", "blue", "and", "there's",
         "nothing", "i", "can", "do", "🌎", "🚀"],
    )

    # Water wave, surfer, and the flag of California: getting the flag out
    # as one token indicates very thorough support for Unicode 10 and
    # Emoji 5.0.
    eq_(
        tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'", "en"),
        ["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"],
    )


def test_casefolding():
    # Each row is (text, language code, expected tokens). The expected
    # tokens show the case-folded form for German and Turkish input.
    cases = [
        ("WEISS", "de", ["weiss"]),
        ("weiß", "de", ["weiss"]),
        ("İstanbul", "tr", ["istanbul"]),
        ("SIKISINCA", "tr", ["sıkısınca"]),
    ]

    for text, lang, expected_tokens in cases:
        eq_(tokenize(text, lang), expected_tokens)


def test_number_smashing():
    title = '"715 - CRΣΣKS" by Bon Iver'

    # Ordinary tokenization leaves the digits as they were written.
    eq_(tokenize(title, "en"), ["715", "crσσks", "by", "bon", "iver"])

    # Lossy tokenization is expected to replace the digits of the
    # multi-digit number with zeros, with or without punctuation tokens.
    eq_(lossy_tokenize(title, "en"), ["000", "crσσks", "by", "bon", "iver"])
    eq_(
        lossy_tokenize(title, "en", include_punctuation=True),
        ['"', "000", "-", "crσσks", '"', "by", "bon", "iver"],
    )

    # A lone digit is expected to stay as it is, while longer numbers keep
    # their shape but lose their digits.
    smashed_forms = (
        ("1", ["1"]),
        ("3.14", ["0.00"]),
        ("24601", ["00000"]),
    )
    for text, expected_tokens in smashed_forms:
        eq_(lossy_tokenize(text, "en"), expected_tokens)

    # Frequency lookups must give two different five-digit numbers the same
    # frequency.
    eq_(word_frequency("24601", "en"), word_frequency("90210", "en"))


def test_phrase_freq():
    # A hyphenated phrase must have a nonzero frequency...
    phrase_freq = word_frequency("flip-flop", "en")
    assert_greater(phrase_freq, 0)

    # ...and the reciprocal of that frequency should be approximately the
    # sum of the reciprocals of the frequencies of its two parts.
    inverse_of_phrase = 1.0 / phrase_freq
    inverse_of_parts = (
        1.0 / word_frequency("flip", "en") + 1.0 / word_frequency("flop", "en")
    )
    assert_almost_equal(inverse_of_phrase, inverse_of_parts)


def test_not_really_random():
    # With bits_per_word=0 the expected result is one word repeated four
    # times -- not an xkcd-style password anyone should actually use.
    english_words = random_words(nwords=4, lang="en", bits_per_word=0)
    eq_(english_words, "the the the the")

    # Besides exercising random_ascii_words, this checks that 'eos' has not
    # ended up as a very common Japanese word.
    japanese_ascii_words = random_ascii_words(nwords=4, lang="ja", bits_per_word=0)
    eq_(japanese_ascii_words, "1 1 1 1")


@raises(ValueError)
def test_not_enough_ascii():
    # Requesting ASCII-only words for 'zh' at 14 bits per word is expected
    # to raise ValueError; the decorator turns that exception into a pass.
    random_ascii_words(bits_per_word=14, lang="zh")


def test_arabic():
    # Tatweels must be stripped out of the token.
    with_tatweels = "متــــــــعب"
    eq_(tokenize(with_tatweels, "ar"), ["متعب"])

    # Combining marks must be stripped out of the token.
    with_marks = "حَرَكَات"
    eq_(tokenize(with_marks, "ar"), ["حركات"])

    # An Arabic ligature that NFKC normalization affects: the expected token
    # is the two-letter sequence instead of the single ligature codepoint.
    ligature = "\ufefb"
    eq_(tokenize(ligature, "ar"), ["\u0644\u0627"])


def test_ideographic_fallback():
    # Chinese text that is tokenized as English should come back as one
    # unbroken token.
    chinese = "中国文字"
    eq_(tokenize(chinese, "en"), [chinese])

    # Japanese text tagged with the wrong language is split where the script
    # changes, so the pieces joined together must tokenize back into the
    # same pieces.
    script_runs = ["ひらがな", "カタカナ", "romaji"]
    eq_(tokenize("".join(script_runs), "en"), script_runs)


def test_other_languages():
    # Thai letters are left stuck together. With better Thai support this
    # would be split into a three-word phrase instead.
    thai = "การเล่นดนตรี"
    eq_(tokenize(thai, "th"), [thai])
    eq_(
        tokenize('"' + thai + '" means "playing music"', "en"),
        [thai, "means", "playing", "music"],
    )

    # Khmer, a script similar to Thai, is likewise kept as one token.
    khmer = "សូមស្វាគមន៍"
    eq_(tokenize(khmer, "km"), [khmer])

    # Hindi: tokens are split where there are spaces, and nowhere else.
    eq_(tokenize("हिन्दी विक्षनरी", "hi"), ["हिन्दी", "विक्षनरी"])

    # Hebrew: vowel points are removed.
    eq_(tokenize("דֻּגְמָה", "he"), ["דגמה"])

    # Turkish: handle commas, cedillas, and the different forms of I, in
    # both lower and upper case.
    for turkish_text in ("kișinin", "KİȘİNİN"):
        eq_(tokenize(turkish_text, "tr"), ["kişinin"])

    # Romanian: cedillas that should be commas-below are replaced, in both
    # lower and upper case.
    for romanian_text in ("acelaşi", "ACELAŞI"):
        eq_(tokenize(romanian_text, "ro"), ["același"])
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								from wordfreq import (
-												removed unused imports


Former-commit-id: b9578ae21e58ff40cd63506e4f31e4ddae11f179
											
										
										
											2015-07-07 20:21:22 +00:00
+								    word_frequency, available_languages, cB_to_freq,
-												Separate preprocessing from tokenization

											
										
										
											2018-03-08 21:25:45 +00:00
+								    top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								)
 								from nose.tools import (
-												removed unused imports


Former-commit-id: b9578ae21e58ff40cd63506e4f31e4ddae11f179
											
										
										
											2015-07-07 20:21:22 +00:00
+								    eq_, assert_almost_equal, assert_greater, raises
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								)
 								def test_freq_examples():
 								    # Stopwords are most common in the correct language
 								    assert_greater(word_frequency('the', 'en'),
 								                   word_frequency('de', 'en'))
 								    assert_greater(word_frequency('de', 'es'),
 								                   word_frequency('the', 'es'))
-												Revise multilingual tests


Former-commit-id: 21246f881f0afe90408203f64e41059f02809da7
											
										
										
											2016-07-29 16:19:12 +00:00
+								# To test the reasonableness of the Twitter list, we want to look up a
 								# common word representing laughter in each language. The default for
 								# languages not listed here is 'haha'.
 								LAUGHTER_WORDS = {
 								    'en': 'lol',
 								    'hi': 'lol',
-												import new wordlists from Exquisite Corpus

											
										
										
											2017-01-05 22:59:26 +00:00
+								    'cs': 'lol',
-												Revise multilingual tests


Former-commit-id: 21246f881f0afe90408203f64e41059f02809da7
											
										
										
											2016-07-29 16:19:12 +00:00
+								    'ru': 'лол',
 								    'zh': '笑',
 								    'ja': '笑',
 								    'ar': 'ﻪﻬﻬﻬﻫ',
-												import new wordlists from Exquisite Corpus

											
										
										
											2017-01-05 22:59:26 +00:00
+								    'fa': 'خخخخ',
-												Revise multilingual tests


Former-commit-id: 21246f881f0afe90408203f64e41059f02809da7
											
										
										
											2016-07-29 16:19:12 +00:00
+								    'ca': 'jaja',
 								    'es': 'jaja',
 								    'fr': 'ptdr',
 								    'pt': 'kkkk',
 								    'he': 'חחח',
-												import new wordlists from Exquisite Corpus

											
										
										
											2017-01-05 22:59:26 +00:00
+								    'bg': 'ахаха',
 								    'uk': 'хаха',
-												v1.7: update tokenization, update data, add `bn` and `mk`

											
										
										
											2017-08-25 21:37:48 +00:00
+								    'bn': 'হা হা',
 								    'mk': 'хаха'
-												Revise multilingual tests


Former-commit-id: 21246f881f0afe90408203f64e41059f02809da7
											
										
										
											2016-07-29 16:19:12 +00:00
+								}
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								def test_languages():
 								    # Make sure the number of available languages doesn't decrease
 								    avail = available_languages()
-												Add Common Crawl data and more languages (#39)

This changes the version from 1.4.2 to 1.5.  Things done in this update include:

* include Common Crawl; support 11 more languages

* new frequency-merging strategy

* New sources: Chinese from Wikipedia (mostly Trad.), Dutch big list

* Remove kinda bad sources, i.e. Greek Twitter (too often kaomoji are detected as Greek) and Ukrainian Common Crawl. This results in dropping Ukrainian as an available language, and causing Greek to not be a 'large' language after all.

* Add Korean tokenization, and include MeCab files in data

* Remove marks from more languages

* Deal with commas and cedillas in Turkish and Romanian



Former-commit-id: e6a8f028e3ff73b5d27b8f6a6ad8b26f439c00ec
											
										
										
											2016-07-28 23:23:17 +00:00
+								    assert_greater(len(avail), 26)
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
-												Revise multilingual tests


Former-commit-id: 21246f881f0afe90408203f64e41059f02809da7
											
										
										
											2016-07-29 16:19:12 +00:00
+								    # Look up the digit '2' in the main word list for each language
 								    for lang in avail:
 								        assert_greater(word_frequency('2', lang), 0, lang)
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
-												refactor the tokenizer, add `include_punctuation` option


Former-commit-id: e8e6e0a23196abf0ecc0cf3bc72ba9943226d119
											
										
										
											2015-09-15 17:26:09 +00:00
+								        # Make up a weirdly verbose language code and make sure
 								        # we still get it
 								        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
-												Revise multilingual tests


Former-commit-id: 21246f881f0afe90408203f64e41059f02809da7
											
										
										
											2016-07-29 16:19:12 +00:00
+								        assert_greater(word_frequency('2', new_lang_code), 0, new_lang_code)
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
-												test and document new twitter wordlists


Former-commit-id: 14cb40810019eb8ca5d1350be46c41c645bf12b6
											
										
										
											2015-07-01 21:53:38 +00:00
+								def test_twitter():
 								    avail = available_languages('twitter')
-												Revise multilingual tests


Former-commit-id: 21246f881f0afe90408203f64e41059f02809da7
											
										
										
											2016-07-29 16:19:12 +00:00
+								    assert_greater(len(avail), 15)
-												test and document new twitter wordlists


Former-commit-id: 14cb40810019eb8ca5d1350be46c41c645bf12b6
											
										
										
											2015-07-01 21:53:38 +00:00
 								    for lang in avail:
 								        assert_greater(word_frequency('rt', lang, 'twitter'),
 								                       word_frequency('rt', lang, 'combined'))
-												Revise multilingual tests


Former-commit-id: 21246f881f0afe90408203f64e41059f02809da7
											
										
										
											2016-07-29 16:19:12 +00:00
+								        text = LAUGHTER_WORDS.get(lang, 'haha')
 								        assert_greater(word_frequency(text, lang, wordlist='twitter'), 0, (text, lang))
-												test and document new twitter wordlists


Former-commit-id: 14cb40810019eb8ca5d1350be46c41c645bf12b6
											
										
										
											2015-07-01 21:53:38 +00:00
-												updated minimum


Former-commit-id: 59c03e24118ffbd4159e1162a6a64ebf38bf4edb
											
										
										
											2015-07-07 19:46:33 +00:00
+								def test_minimums():
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								    eq_(word_frequency('esquivalience', 'en'), 0)
-												changed default to minimum for word_frequency


Former-commit-id: 9aa773aa2bba694c691d1ea7b18e16a64fe7695e
											
										
										
											2015-07-07 19:03:26 +00:00
+								    eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6)
-												updated minimum


Former-commit-id: 59c03e24118ffbd4159e1162a6a64ebf38bf4edb
											
										
										
											2015-07-07 19:46:33 +00:00
+								    eq_(word_frequency('the', 'en', minimum=1), 1)
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
-												Revise multilingual tests


Former-commit-id: 21246f881f0afe90408203f64e41059f02809da7
											
										
										
											2016-07-29 16:19:12 +00:00
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								def test_most_common_words():
 								    # If something causes the most common words in well-supported languages to
 								    # change, we should know.
 								    def get_most_common(lang):
 								        """
 								        Return the single most common word in the language.
 								        """
 								        return top_n_list(lang, 1)[0]
-												import new wordlists from Exquisite Corpus

											
										
										
											2017-01-05 22:59:26 +00:00
+								    eq_(get_most_common('ar'), 'في')
-												Switch to a more precise centibel scale.


Former-commit-id: 7862a4d2b6b2e756f52b405e28e5049b7ef93bc2
											
										
										
											2015-06-22 21:36:30 +00:00
+								    eq_(get_most_common('de'), 'die')
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								    eq_(get_most_common('en'), 'the')
 								    eq_(get_most_common('es'), 'de')
 								    eq_(get_most_common('fr'), 'de')
 								    eq_(get_most_common('it'), 'di')
 								    eq_(get_most_common('ja'), 'の')
 								    eq_(get_most_common('nl'), 'de')
-												import new wordlists from Exquisite Corpus

											
										
										
											2017-01-05 22:59:26 +00:00
+								    eq_(get_most_common('pl'), 'w')
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								    eq_(get_most_common('pt'), 'de')
 								    eq_(get_most_common('ru'), 'в')
-												add tests for Turkish


Former-commit-id: fc93c8dc9c66a786914137729c42209be0c4acd0
											
										
										
											2015-09-04 20:40:11 +00:00
+								    eq_(get_most_common('tr'), 'bir')
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								    eq_(get_most_common('zh'), '的')
 								def test_language_matching():
 								    freq = word_frequency('的', 'zh')
 								    eq_(word_frequency('的', 'zh-TW'), freq)
 								    eq_(word_frequency('的', 'zh-CN'), freq)
 								    eq_(word_frequency('的', 'zh-Hant'), freq)
 								    eq_(word_frequency('的', 'zh-Hans'), freq)
 								    eq_(word_frequency('的', 'yue-HK'), freq)
 								    eq_(word_frequency('的', 'cmn'), freq)
-												Switch to a more precise centibel scale.


Former-commit-id: 7862a4d2b6b2e756f52b405e28e5049b7ef93bc2
											
										
										
											2015-06-22 21:36:30 +00:00
+								def test_cB_conversion():
 								    eq_(cB_to_freq(0), 1.)
 								    assert_almost_equal(cB_to_freq(-100), 0.1)
 								    assert_almost_equal(cB_to_freq(-600), 1e-6)
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
 								@raises(ValueError)
-												Switch to a more precise centibel scale.


Former-commit-id: 7862a4d2b6b2e756f52b405e28e5049b7ef93bc2
											
										
										
											2015-06-22 21:36:30 +00:00
+								def test_failed_cB_conversion():
 								    cB_to_freq(1)
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
 								def test_tokenization():
 								    # We preserve apostrophes within words, so "can't" is a single word in the
-												updated comments


Former-commit-id: 131b916c579b8c45db0444a6eaffe51ef419039b
											
										
										
											2015-07-17 18:50:12 +00:00
+								    # data
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
+								    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
 								        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
-												Leave Thai segments alone in the default regex

Our regex already has a special case to leave Chinese and Japanese alone
when an appropriate tokenizer for the language isn't being used, as
Unicode's default segmentation would make every character into its own
token.

The same thing happens in Thai, and we don't even *have* an appropriate
tokenizer for Thai, so I've added a similar fallback.


Former-commit-id: 07f16e6f03cc42436a467eaab935996f22d37d46
											
										
										
											2016-02-22 19:26:50 +00:00
-												refactor the tokenizer, add `include_punctuation` option


Former-commit-id: e8e6e0a23196abf0ecc0cf3bc72ba9943226d119
											
										
										
											2015-09-15 17:26:09 +00:00
+								    eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
 								        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
-												updated tests for emoji splitting


Former-commit-id: 3bcb3e84a111ecba5b663ce18697109641b6a185
											
										
										
											2015-06-25 15:25:51 +00:00
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
+								    # Certain punctuation does not inherently split a word.
 								    eq_(tokenize("Anything is possible at zombo.com", 'en'),
 								        ['anything', 'is', 'possible', 'at', 'zombo.com'])
 								    # Splits occur after symbols, and at splitting punctuation such as hyphens.
-												updated tests for emoji splitting


Former-commit-id: 3bcb3e84a111ecba5b663ce18697109641b6a185
											
										
										
											2015-06-25 15:25:51 +00:00
+								    eq_(tokenize('😂test', 'en'), ['😂', 'test'])
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
+								    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])
-												updated tests for emoji splitting


Former-commit-id: 3bcb3e84a111ecba5b663ce18697109641b6a185
											
										
										
											2015-06-25 15:25:51 +00:00
-												refactor the tokenizer, add `include_punctuation` option


Former-commit-id: e8e6e0a23196abf0ecc0cf3bc72ba9943226d119
											
										
										
											2015-09-15 17:26:09 +00:00
+								    eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
 								        ['this', 'text', 'has', '...', 'punctuation', ':)'])
-												Tokenize by graphemes, not codepoints (#50)

* Tokenize by graphemes, not codepoints

* Add more documentation to TOKEN_RE

* Remove extra line break

* Update docstring - Brahmic scripts are no longer an exception

* approve using version 2017.07.28 of regex

											
										
										
											2017-08-08 15:35:28 +00:00
+								    # Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
 								    # and 'David Bowie' stay together, because our Unicode segmentation algorithm
 								    # is up to date
 								    eq_(tokenize('emoji test 🧕🏽', 'en'), ['emoji', 'test', '🧕🏽'])
 								    eq_(tokenize("👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en'),
 								        ['👨‍🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
 								         'nothing', 'i', 'can', 'do', '🌎', '🚀'])
 								    # Water wave, surfer, flag of California (indicates ridiculously complete support
 								    # for Unicode 10 and Emoji 5.0)
 								    eq_(tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'",'en'),
 								        ["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"])
-												case-fold instead of just lowercasing tokens


Former-commit-id: 638467f60022c6933a9a2fb8ff1280d39e9a3d70
											
										
										
											2015-06-30 19:14:02 +00:00
 								def test_casefolding():
 								    eq_(tokenize('WEISS', 'de'), ['weiss'])
 								    eq_(tokenize('weiß', 'de'), ['weiss'])
-												add tests for Turkish


Former-commit-id: fc93c8dc9c66a786914137729c42209be0c4acd0
											
										
										
											2015-09-04 20:40:11 +00:00
+								    eq_(tokenize('İstanbul', 'tr'), ['istanbul'])
 								    eq_(tokenize('SIKISINCA', 'tr'), ['sıkısınca'])
-												case-fold instead of just lowercasing tokens


Former-commit-id: 638467f60022c6933a9a2fb8ff1280d39e9a3d70
											
										
										
											2015-06-30 19:14:02 +00:00
-												import new wordlists from Exquisite Corpus

											
										
										
											2017-01-05 22:59:26 +00:00
+								def test_number_smashing():
-												Don't smash numbers in *all* tokenization, just when looking up freqs

I forgot momentarily that the output of the tokenizer is used by other
code.

											
										
										
											2017-01-07 00:18:52 +00:00
+								    eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'),
 								        ['715', 'crσσks', 'by', 'bon', 'iver'])
-												Separate preprocessing from tokenization

											
										
										
											2018-03-08 21:25:45 +00:00
+								    eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'),
-												Don't smash numbers in *all* tokenization, just when looking up freqs

I forgot momentarily that the output of the tokenizer is used by other
code.

											
										
										
											2017-01-07 00:18:52 +00:00
+								        ['000', 'crσσks', 'by', 'bon', 'iver'])
-												Separate preprocessing from tokenization

											
										
										
											2018-03-08 21:25:45 +00:00
+								    eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True),
-												Don't smash numbers in *all* tokenization, just when looking up freqs

I forgot momentarily that the output of the tokenizer is used by other
code.

											
										
										
											2017-01-07 00:18:52 +00:00
+								        ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver'])
-												Separate preprocessing from tokenization

											
										
										
											2018-03-08 21:25:45 +00:00
+								    eq_(lossy_tokenize('1', 'en'), ['1'])
 								    eq_(lossy_tokenize('3.14', 'en'), ['0.00'])
 								    eq_(lossy_tokenize('24601', 'en'), ['00000'])
-												test that number-smashing still happens in freq lookups

											
										
										
											2017-01-07 00:20:41 +00:00
+								    eq_(word_frequency('24601', 'en'), word_frequency('90210', 'en'))
-												import new wordlists from Exquisite Corpus

											
										
										
											2017-01-05 22:59:26 +00:00
-												updated tests for emoji splitting


Former-commit-id: 3bcb3e84a111ecba5b663ce18697109641b6a185
											
										
										
											2015-06-25 15:25:51 +00:00
+								def test_phrase_freq():
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
+								    ff = word_frequency("flip-flop", 'en')
 								    assert_greater(ff, 0)
-												updated tests


Former-commit-id: ca66a5f883d4a19c2b9fa81e1f6c3c8309924f69
											
										
										
											2015-07-07 18:13:28 +00:00
+								    assert_almost_equal(
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
+.0 / ff,
 .0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
-												Express the combining of word frequencies in an explicitly associative and commutative way.


Former-commit-id: 32b4033d6399f10e10dd3f1c9194847a7f01f302
											
										
										
											2015-07-09 19:26:54 +00:00
+								    )
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
 								def test_not_really_random():
 								    # If your xkcd-style password comes out like this, maybe you shouldn't
 								    # use it
-												revert changes to test_not_really_random


Former-commit-id: bbf7b9de34f4b1f7ff3ac4a3b3789f5f45fa1a86
											
										
										
											2015-06-30 15:29:14 +00:00
+								    eq_(random_words(nwords=4, lang='en', bits_per_word=0),
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
+								        'the the the the')
 								    # This not only tests random_ascii_words, it makes sure we didn't end
 								    # up with 'eos' as a very common Japanese word
-												revert changes to test_not_really_random


Former-commit-id: bbf7b9de34f4b1f7ff3ac4a3b3789f5f45fa1a86
											
										
										
											2015-06-30 15:29:14 +00:00
+								    eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),
-												v1.7: update tokenization, update data, add `bn` and `mk`

											
										
										
											2017-08-25 21:37:48 +00:00
+								        '1 1 1 1')
-												tests for new wordfreq with full coverage


Former-commit-id: df863a5169719a154a95c788f237088704b5e619
											
										
										
											2015-05-22 00:34:17 +00:00
 								@raises(ValueError)
 								def test_not_enough_ascii():
-												Add Common Crawl data and more languages (#39)

This changes the version from 1.4.2 to 1.5.  Things done in this update include:

* include Common Crawl; support 11 more languages

* new frequency-merging strategy

* New sources: Chinese from Wikipedia (mostly Trad.), Dutch big list

* Remove kinda bad sources, i.e. Greek Twitter (too often kaomoji are detected as Greek) and Ukrainian Common Crawl. This results in dropping Ukrainian as an available language, and causing Greek to not be a 'large' language after all.

* Add Korean tokenization, and include MeCab files in data

* Remove marks from more languages

* Deal with commas and cedillas in Turkish and Romanian



Former-commit-id: e6a8f028e3ff73b5d27b8f6a6ad8b26f439c00ec
											
										
										
											2016-07-28 23:23:17 +00:00
+								    random_ascii_words(lang='zh', bits_per_word=14)
-												added arabic tests


Former-commit-id: f83d31a35774b08d40ab5c6a9fb8c09616e71819
											
										
										
											2015-07-07 19:10:59 +00:00
-												ensure removal of tatweels (hopefully)


Former-commit-id: 173278fdd3e5554063228abd1f2dac8f771995f6
											
										
										
											2015-07-20 20:48:36 +00:00
-												Tokenization in Korean, plus abjad languages (#38)

* Remove marks from more languages

* Add Korean tokenization, and include MeCab files in data

* add a Hebrew tokenization test

* fix terminology in docstrings about abjad scripts

* combine Japanese and Korean tokenization into the same function


Former-commit-id: fec6eddcc3475f49a7541d8d3202ec87e581ed53
											
										
										
											2016-07-15 19:10:25 +00:00
+								def test_arabic():
-												ensure removal of tatweels (hopefully)


Former-commit-id: 173278fdd3e5554063228abd1f2dac8f771995f6
											
										
										
											2015-07-20 20:48:36 +00:00
+								    # Remove tatweels
-												added arabic tests


Former-commit-id: f83d31a35774b08d40ab5c6a9fb8c09616e71819
											
										
										
											2015-07-07 19:10:59 +00:00
+								    eq_(
 								        tokenize('متــــــــعب', 'ar'),
 								        ['متعب']
 								    )
-												ensure removal of tatweels (hopefully)


Former-commit-id: 173278fdd3e5554063228abd1f2dac8f771995f6
											
										
										
											2015-07-20 20:48:36 +00:00
+								    # Remove combining marks
-												added arabic tests


Former-commit-id: f83d31a35774b08d40ab5c6a9fb8c09616e71819
											
										
										
											2015-07-07 19:10:59 +00:00
+								    eq_(
 								        tokenize('حَرَكَات', 'ar'),
 								        ['حركات']
 								    )
-												ensure removal of tatweels (hopefully)


Former-commit-id: 173278fdd3e5554063228abd1f2dac8f771995f6
											
										
										
											2015-07-20 20:48:36 +00:00
 								    eq_(
-												Document the NFKC-normalized ligature in the Arabic test.


Former-commit-id: 41e1dd41d82358fd44f972e501c8586d0bbd64a2
											
										
										
											2015-08-03 15:09:44 +00:00
+								        tokenize('\ufefb', 'ar'),  # An Arabic ligature...
 								        ['\u0644\u0627']  # ...that is affected by NFKC normalization
-												ensure removal of tatweels (hopefully)


Former-commit-id: 173278fdd3e5554063228abd1f2dac8f771995f6
											
										
										
											2015-07-20 20:48:36 +00:00
+								    )
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
 								def test_ideographic_fallback():
-												tokenize Chinese using jieba and our own frequencies


Former-commit-id: 2327f2e4d61c25b29a00f8cbb4387cf59f520628
											
										
										
											2015-09-05 07:16:56 +00:00
+								    # Try tokenizing Chinese text as English -- it should remain stuck together.
 								    eq_(tokenize('中国文字', 'en'), ['中国文字'])
-												Use the regex implementation of Unicode segmentation


Former-commit-id: 95998205ad4309e4b91458d1de5650b8a725d317
											
										
										
											2015-08-24 20:24:49 +00:00
 								    # When Japanese is tagged with the wrong language, it will be split
 								    # at script boundaries.
 								    ja_text = 'ひらがなカタカナromaji'
 								    eq_(
 								        tokenize(ja_text, 'en'),
 								        ['ひらがな', 'カタカナ', 'romaji']
 								    )
-												move Thai test to where it makes more sense


Former-commit-id: 4ec6b56faab4bc5a698e48cca1493ed45c9de6ea
											
										
										
											2016-03-10 16:56:04 +00:00
-												Separate preprocessing from tokenization

											
										
										
											2018-03-08 21:25:45 +00:00
-												Tokenization in Korean, plus abjad languages (#38)

* Remove marks from more languages

* Add Korean tokenization, and include MeCab files in data

* add a Hebrew tokenization test

* fix terminology in docstrings about abjad scripts

* combine Japanese and Korean tokenization into the same function


Former-commit-id: fec6eddcc3475f49a7541d8d3202ec87e581ed53
											
										
										
											2016-07-15 19:10:25 +00:00
+								def test_other_languages():
-												move Thai test to where it makes more sense


Former-commit-id: 4ec6b56faab4bc5a698e48cca1493ed45c9de6ea
											
										
										
											2016-03-10 16:56:04 +00:00
+								    # Test that we leave Thai letters stuck together. If we had better Thai support,
 								    # we would actually split this into a three-word phrase.
 								    eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
 								    eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
 								        ['การเล่นดนตรี', 'means', 'playing', 'music'])
-												Fix tokenization of SE Asian and South Asian scripts (#37)



Former-commit-id: 270f6c7ca616165e89ccbfa270d78eabc49782c4
											
										
										
											2016-07-01 22:00:57 +00:00
+								    # Test Khmer, a script similar to Thai
 								    eq_(tokenize('សូមស្វាគមន៍', 'km'), ['សូមស្វាគមន៍'])
 								    # Test Hindi -- tokens split where there are spaces, and not where there aren't
 								    eq_(tokenize('हिन्दी विक्षनरी', 'hi'), ['हिन्दी', 'विक्षनरी'])
-												Tokenization in Korean, plus abjad languages (#38)

* Remove marks from more languages

* Add Korean tokenization, and include MeCab files in data

* add a Hebrew tokenization test

* fix terminology in docstrings about abjad scripts

* combine Japanese and Korean tokenization into the same function


Former-commit-id: fec6eddcc3475f49a7541d8d3202ec87e581ed53
											
										
										
											2016-07-15 19:10:25 +00:00
 								    # Remove vowel points in Hebrew
 								    eq_(tokenize('דֻּגְמָה', 'he'), ['דגמה'])
-												Add Common Crawl data and more languages (#39)

This changes the version from 1.4.2 to 1.5.  Things done in this update include:

* include Common Crawl; support 11 more languages

* new frequency-merging strategy

* New sources: Chinese from Wikipedia (mostly Trad.), Dutch big list

* Remove low-quality sources, namely Greek Twitter (kaomoji are too often detected as Greek) and Ukrainian Common Crawl. As a result, Ukrainian is dropped as an available language, and Greek is no longer a 'large' language after all.

* Add Korean tokenization, and include MeCab files in data

* Remove marks from more languages

* Deal with commas and cedillas in Turkish and Romanian



Former-commit-id: e6a8f028e3ff73b5d27b8f6a6ad8b26f439c00ec
											
										
										
											2016-07-28 23:23:17 +00:00
+								    # Deal with commas, cedillas, and I's in Turkish
 								    eq_(tokenize('kișinin', 'tr'), ['kişinin'])
 								    eq_(tokenize('KİȘİNİN', 'tr'), ['kişinin'])
 								    # Deal with cedillas that should be commas-below in Romanian
 								    eq_(tokenize('acelaşi', 'ro'), ['același'])
 								    eq_(tokenize('ACELAŞI', 'ro'), ['același'])