diff --git a/setup.cfg b/setup.cfg index aadbdf3..b7e4789 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,2 @@ -[nosetests] -verbosity=2 -with-doctest=1 -with-coverage=0 -cover-package=wordfreq +[aliases] +test=pytest diff --git a/tests/test.py b/tests/test.py index d7fb321..164ea83 100644 --- a/tests/test.py +++ b/tests/test.py @@ -2,57 +2,51 @@ from wordfreq import ( word_frequency, available_languages, cB_to_freq, top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize ) -from nose.tools import ( - eq_, assert_almost_equal, assert_greater, raises, assert_not_equal -) +import pytest def test_freq_examples(): # Stopwords are most common in the correct language - assert_greater(word_frequency('the', 'en'), - word_frequency('de', 'en')) - - assert_greater(word_frequency('de', 'es'), - word_frequency('the', 'es')) - + assert word_frequency('the', 'en') > word_frequency('de', 'en') + assert word_frequency('de', 'es') > word_frequency('the', 'es') # We get word frequencies from the 'large' list when available - assert_greater(word_frequency('infrequency', 'en'), 0.) + assert word_frequency('infrequency', 'en') > 0. def test_languages(): # Make sure we get all the languages when looking for the default # 'best' wordlist avail = available_languages() - assert_greater(len(avail), 32) + assert len(avail) > 32 # 'small' covers the same languages, but with some different lists avail_small = available_languages('small') - eq_(len(avail_small), len(avail)) - assert_not_equal(avail_small, avail) + assert len(avail_small) == len(avail) + assert avail_small != avail # 'combined' is the same as 'small' avail_old_name = available_languages('combined') - eq_(avail_old_name, avail_small) + assert avail_old_name == avail_small # 'large' covers fewer languages avail_large = available_languages('large') - assert_greater(len(avail_large), 12) - assert_greater(len(avail), len(avail_large)) + assert len(avail_large) > 12 + assert len(avail) > len(avail_large) # Look up the digit '2' in the main word list for each language for lang in avail: - assert_greater(word_frequency('2', lang), 0, lang) + assert word_frequency('2', lang) > 0 # Make up a weirdly verbose language code and make sure # we still get it new_lang_code = '%s-001-x-fake-extension' % lang.upper() - assert_greater(word_frequency('2', new_lang_code), 0, new_lang_code) + assert word_frequency('2', new_lang_code) > 0 def test_minimums(): - eq_(word_frequency('esquivalience', 'en'), 0) - eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6) - eq_(word_frequency('the', 'en', minimum=1), 1) + assert word_frequency('esquivalience', 'en') == 0 + assert word_frequency('esquivalience', 'en', minimum=1e-6) == 1e-6 + assert word_frequency('the', 'en', minimum=1) == 1 def test_most_common_words(): @@ -65,141 +59,135 @@ def test_most_common_words(): """ return top_n_list(lang, 1)[0] - eq_(get_most_common('ar'), 'في') - eq_(get_most_common('de'), 'die') - eq_(get_most_common('en'), 'the') - eq_(get_most_common('es'), 'de') - eq_(get_most_common('fr'), 'de') - eq_(get_most_common('it'), 'di') - eq_(get_most_common('ja'), 'の') - eq_(get_most_common('nl'), 'de') - eq_(get_most_common('pl'), 'w') - eq_(get_most_common('pt'), 'de') - eq_(get_most_common('ru'), 'в') - eq_(get_most_common('tr'), 'bir') - eq_(get_most_common('zh'), '的') + assert get_most_common('ar') == 'في' + assert get_most_common('de') == 'die' + assert get_most_common('en') == 'the' + assert get_most_common('es') == 'de' + assert get_most_common('fr') == 'de' + assert get_most_common('it') == 'di' + assert get_most_common('ja') == 'の' + assert get_most_common('nl') == 'de' + assert get_most_common('pl') == 'w' + assert get_most_common('pt') == 'de' + assert get_most_common('ru') == 'в' + assert get_most_common('tr') == 'bir' + assert get_most_common('zh') == '的' def test_language_matching(): freq = word_frequency('的', 'zh') - eq_(word_frequency('的', 'zh-TW'), freq) - eq_(word_frequency('的', 'zh-CN'), freq) - eq_(word_frequency('的', 'zh-Hant'), freq) - eq_(word_frequency('的', 'zh-Hans'), freq) - eq_(word_frequency('的', 'yue-HK'), freq) - eq_(word_frequency('的', 'cmn'), freq) + assert word_frequency('的', 'zh-TW') == freq + assert word_frequency('的', 'zh-CN') == freq + assert word_frequency('的', 'zh-Hant') == freq + assert word_frequency('的', 'zh-Hans') == freq + assert word_frequency('的', 'yue-HK') == freq + assert word_frequency('的', 'cmn') == freq def test_cB_conversion(): - eq_(cB_to_freq(0), 1.) - assert_almost_equal(cB_to_freq(-100), 0.1) - assert_almost_equal(cB_to_freq(-600), 1e-6) + assert cB_to_freq(0) == 1. + assert cB_to_freq(-100) == pytest.approx(0.1) + assert cB_to_freq(-600) == pytest.approx(1e-6) -@raises(ValueError) def test_failed_cB_conversion(): - cB_to_freq(1) + with pytest.raises(ValueError): + cB_to_freq(1) def test_tokenization(): # We preserve apostrophes within words, so "can't" is a single word in the # data - eq_(tokenize("I don't split at apostrophes, you see.", 'en'), - ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']) + assert ( + tokenize("I don't split at apostrophes, you see.", 'en') + == ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'] + ) - eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True), - ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.']) + assert ( + tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True) + == ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'] + ) # Certain punctuation does not inherently split a word. - eq_(tokenize("Anything is possible at zombo.com", 'en'), - ['anything', 'is', 'possible', 'at', 'zombo.com']) + assert ( + tokenize("Anything is possible at zombo.com", 'en') + == ['anything', 'is', 'possible', 'at', 'zombo.com'] + ) # Splits occur after symbols, and at splitting punctuation such as hyphens. - eq_(tokenize('😂test', 'en'), ['😂', 'test']) - - eq_(tokenize("flip-flop", 'en'), ['flip', 'flop']) - - eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True), - ['this', 'text', 'has', '...', 'punctuation', ':)']) + assert tokenize('😂test', 'en') == ['😂', 'test'] + assert tokenize("flip-flop", 'en') == ['flip', 'flop'] + assert ( + tokenize('this text has... punctuation :)', 'en', include_punctuation=True) + == ['this', 'text', 'has', '...', 'punctuation', ':)'] + ) # Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf' # and 'David Bowie' stay together, because our Unicode segmentation algorithm # is up to date - eq_(tokenize('emoji test 🧕🏽', 'en'), ['emoji', 'test', '🧕🏽']) - - eq_(tokenize("👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en'), - ['👨‍🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's", - 'nothing', 'i', 'can', 'do', '🌎', '🚀']) + assert tokenize('emoji test 🧕🏽', 'en') == ['emoji', 'test', '🧕🏽'] + assert ( + tokenize("👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en') + == ['👨‍🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's", + 'nothing', 'i', 'can', 'do', '🌎', '🚀'] + ) # Water wave, surfer, flag of California (indicates ridiculously complete support # for Unicode 10 and Emoji 5.0) - eq_(tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'",'en'), - ["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"]) + assert tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'",'en') == ["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"] def test_casefolding(): - eq_(tokenize('WEISS', 'de'), ['weiss']) - eq_(tokenize('weiß', 'de'), ['weiss']) - eq_(tokenize('İstanbul', 'tr'), ['istanbul']) - eq_(tokenize('SIKISINCA', 'tr'), ['sıkısınca']) + assert tokenize('WEISS', 'de') == ['weiss'] + assert tokenize('weiß', 'de') == ['weiss'] + assert tokenize('İstanbul', 'tr') == ['istanbul'] + assert tokenize('SIKISINCA', 'tr') == ['sıkısınca'] def test_number_smashing(): - eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'), - ['715', 'crσσks', 'by', 'bon', 'iver']) - eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'), - ['000', 'crσσks', 'by', 'bon', 'iver']) - eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True), - ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver']) - eq_(lossy_tokenize('1', 'en'), ['1']) - eq_(lossy_tokenize('3.14', 'en'), ['0.00']) - eq_(lossy_tokenize('24601', 'en'), ['00000']) - eq_(word_frequency('24601', 'en'), word_frequency('90210', 'en')) + assert tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['715', 'crσσks', 'by', 'bon', 'iver'] + assert lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['000', 'crσσks', 'by', 'bon', 'iver'] + assert ( + lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True) + == ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver'] + ) + assert lossy_tokenize('1', 'en') == ['1'] + assert lossy_tokenize('3.14', 'en') == ['0.00'] + assert lossy_tokenize('24601', 'en') == ['00000'] + assert word_frequency('24601', 'en') == word_frequency('90210', 'en') def test_phrase_freq(): ff = word_frequency("flip-flop", 'en') - assert_greater(ff, 0) - assert_almost_equal( - 1.0 / ff, - 1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en') - ) + assert ff > 0 + phrase_freq = 1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en') + assert 1.0 / ff == pytest.approx(phrase_freq) def test_not_really_random(): # If your xkcd-style password comes out like this, maybe you shouldn't # use it - eq_(random_words(nwords=4, lang='en', bits_per_word=0), - 'the the the the') + assert random_words(nwords=4, lang='en', bits_per_word=0) == 'the the the the' # This not only tests random_ascii_words, it makes sure we didn't end # up with 'eos' as a very common Japanese word - eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0), - '1 1 1 1') + assert random_ascii_words(nwords=4, lang='ja', bits_per_word=0) == '1 1 1 1' -@raises(ValueError) def test_not_enough_ascii(): - random_ascii_words(lang='zh', bits_per_word=14) + with pytest.raises(ValueError): + random_ascii_words(lang='zh', bits_per_word=14) def test_arabic(): # Remove tatweels - eq_( - tokenize('متــــــــعب', 'ar'), - ['متعب'] - ) + assert tokenize('متــــــــعب', 'ar') == ['متعب'] # Remove combining marks - eq_( - tokenize('حَرَكَات', 'ar'), - ['حركات'] - ) + assert tokenize('حَرَكَات', 'ar') == ['حركات'] - eq_( - tokenize('\ufefb', 'ar'), # An Arabic ligature... - ['\u0644\u0627'] # ...that is affected by NFKC normalization - ) + # An Arabic ligature that is affected by NFKC normalization + assert tokenize('\ufefb', 'ar') == ['\u0644\u0627'] def test_ideographic_fallback(): @@ -207,29 +195,28 @@ def test_ideographic_fallback(): # # More complex examples like this, involving the multiple scripts of Japanese, # are in test_japanese.py. - eq_(tokenize('中国文字', 'en'), ['中国文字']) + assert tokenize('中国文字', 'en') == ['中国文字'] def test_other_languages(): # Test that we leave Thai letters stuck together. If we had better Thai support, # we would actually split this into a three-word phrase. - eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี']) - eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'), - ['การเล่นดนตรี', 'means', 'playing', 'music']) + assert tokenize('การเล่นดนตรี', 'th') == ['การเล่นดนตรี'] + assert tokenize('"การเล่นดนตรี" means "playing music"', 'en') == ['การเล่นดนตรี', 'means', 'playing', 'music'] # Test Khmer, a script similar to Thai - eq_(tokenize('សូមស្វាគមន៍', 'km'), ['សូមស្វាគមន៍']) + assert tokenize('សូមស្វាគមន៍', 'km') == ['សូមស្វាគមន៍'] # Test Hindi -- tokens split where there are spaces, and not where there aren't - eq_(tokenize('हिन्दी विक्षनरी', 'hi'), ['हिन्दी', 'विक्षनरी']) + assert tokenize('हिन्दी विक्षनरी', 'hi') == ['हिन्दी', 'विक्षनरी'] # Remove vowel points in Hebrew - eq_(tokenize('דֻּגְמָה', 'he'), ['דגמה']) + assert tokenize('דֻּגְמָה', 'he') == ['דגמה'] # Deal with commas, cedillas, and I's in Turkish - eq_(tokenize('kișinin', 'tr'), ['kişinin']) - eq_(tokenize('KİȘİNİN', 'tr'), ['kişinin']) + assert tokenize('kișinin', 'tr') == ['kişinin'] + assert tokenize('KİȘİNİN', 'tr') == ['kişinin'] # Deal with cedillas that should be commas-below in Romanian - eq_(tokenize('acelaşi', 'ro'), ['același']) - eq_(tokenize('ACELAŞI', 'ro'), ['același']) + assert tokenize('acelaşi', 'ro') == ['același'] + assert tokenize('ACELAŞI', 'ro') == ['același'] diff --git a/tests/test_chinese.py b/tests/test_chinese.py index 58df4a1..a5e9f51 100644 --- a/tests/test_chinese.py +++ b/tests/test_chinese.py @@ -1,5 +1,5 @@ -from nose.tools import eq_, assert_almost_equal, assert_greater from wordfreq import tokenize, word_frequency +import pytest def test_tokens(): @@ -17,64 +17,49 @@ def test_tokens(): # His name breaks into five pieces, with the only piece staying together # being the one that means 'Bart'. The dot is not included as a token. - eq_( - tokenize(hobart, 'zh'), - ['加', '勒', '特', '霍', '巴特'] - ) + assert tokenize(hobart, 'zh') == ['加', '勒', '特', '霍', '巴特'] - eq_( - tokenize(fact_simplified, 'zh'), - [ - # he / is / history / in / #6 / counter for people - '他', '是', '历史', '上', '第六', '位', - # during / term of office / in / die - '在', '任期', '内', '去世', - # of / U.S. / deputy / president - '的', '美国', '副', '总统' - ] - ) + assert tokenize(fact_simplified, 'zh') == [ + # he / is / history / in / #6 / counter for people + '他', '是', '历史', '上', '第六', '位', + # during / term of office / in / die + '在', '任期', '内', '去世', + # of / U.S. / deputy / president + '的', '美国', '副', '总统' + ] # Jieba's original tokenizer knows a lot of names, it seems. - eq_( - tokenize(hobart, 'zh', external_wordlist=True), - ['加勒特', '霍巴特'] - ) + assert tokenize(hobart, 'zh', external_wordlist=True) == ['加勒特', '霍巴特'] # We get almost the same tokens from the sentence using Jieba's own # wordlist, but it tokenizes "in history" as two words and # "sixth person" as one. - eq_( - tokenize(fact_simplified, 'zh', external_wordlist=True), - [ - # he / is / history / in / sixth person - '他', '是', '历史', '上', '第六位', - # during / term of office / in / die - '在', '任期', '内', '去世', - # of / U.S. / deputy / president - '的', '美国', '副', '总统' - ] - ) + assert tokenize(fact_simplified, 'zh', external_wordlist=True) == [ + # he / is / history / in / sixth person + '他', '是', '历史', '上', '第六位', + # during / term of office / in / die + '在', '任期', '内', '去世', + # of / U.S. / deputy / president + '的', '美国', '副', '总统' + ] # Check that Traditional Chinese works at all - assert_greater(word_frequency(fact_traditional, 'zh'), 0) + assert word_frequency(fact_traditional, 'zh') > 0 # You get the same token lengths if you look it up in Traditional Chinese, # but the words are different simp_tokens = tokenize(fact_simplified, 'zh', include_punctuation=True) trad_tokens = tokenize(fact_traditional, 'zh', include_punctuation=True) - eq_(''.join(simp_tokens), fact_simplified) - eq_(''.join(trad_tokens), fact_traditional) + assert ''.join(simp_tokens) == fact_simplified + assert ''.join(trad_tokens) == fact_traditional simp_lengths = [len(token) for token in simp_tokens] trad_lengths = [len(token) for token in trad_tokens] - eq_(simp_lengths, trad_lengths) + assert simp_lengths == trad_lengths def test_combination(): xiexie_freq = word_frequency('谢谢', 'zh') # "Thanks" - assert_almost_equal( - word_frequency('谢谢谢谢', 'zh'), - xiexie_freq / 20 - ) + assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20) def test_alternate_codes(): @@ -83,12 +68,12 @@ def test_alternate_codes(): tokens = ['谢谢', '谢谢'] # Code with a region attached - eq_(tokenize('谢谢谢谢', 'zh-CN'), tokens) + assert tokenize('谢谢谢谢', 'zh-CN') == tokens # Over-long codes for Chinese - eq_(tokenize('谢谢谢谢', 'chi'), tokens) - eq_(tokenize('谢谢谢谢', 'zho'), tokens) + assert tokenize('谢谢谢谢', 'chi') == tokens + assert tokenize('谢谢谢谢', 'zho') == tokens # Separate codes for Mandarin and Cantonese - eq_(tokenize('谢谢谢谢', 'cmn'), tokens) - eq_(tokenize('谢谢谢谢', 'yue'), tokens) + assert tokenize('谢谢谢谢', 'cmn') == tokens + assert tokenize('谢谢谢谢', 'yue') == tokens