port test.py and test_chinese.py to pytest

This commit is contained in:
Robyn Speer 2018-06-01 16:33:06 -04:00
parent 8907423147
commit 75b4d62084
3 changed files with 130 additions and 161 deletions

View File

@ -1,5 +1,2 @@
[nosetests] [aliases]
verbosity=2 test=pytest
with-doctest=1
with-coverage=0
cover-package=wordfreq

View File

@ -2,57 +2,51 @@ from wordfreq import (
word_frequency, available_languages, cB_to_freq, word_frequency, available_languages, cB_to_freq,
top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize
) )
from nose.tools import ( import pytest
eq_, assert_almost_equal, assert_greater, raises, assert_not_equal
)
def test_freq_examples(): def test_freq_examples():
# Stopwords are most common in the correct language # Stopwords are most common in the correct language
assert_greater(word_frequency('the', 'en'), assert word_frequency('the', 'en') > word_frequency('de', 'en')
word_frequency('de', 'en')) assert word_frequency('de', 'es') > word_frequency('the', 'es')
assert_greater(word_frequency('de', 'es'),
word_frequency('the', 'es'))
# We get word frequencies from the 'large' list when available # We get word frequencies from the 'large' list when available
assert_greater(word_frequency('infrequency', 'en'), 0.) assert word_frequency('infrequency', 'en') > 0.
def test_languages(): def test_languages():
# Make sure we get all the languages when looking for the default # Make sure we get all the languages when looking for the default
# 'best' wordlist # 'best' wordlist
avail = available_languages() avail = available_languages()
assert_greater(len(avail), 32) assert len(avail) > 32
# 'small' covers the same languages, but with some different lists # 'small' covers the same languages, but with some different lists
avail_small = available_languages('small') avail_small = available_languages('small')
eq_(len(avail_small), len(avail)) assert len(avail_small) == len(avail)
assert_not_equal(avail_small, avail) assert avail_small != avail
# 'combined' is the same as 'small' # 'combined' is the same as 'small'
avail_old_name = available_languages('combined') avail_old_name = available_languages('combined')
eq_(avail_old_name, avail_small) assert avail_old_name == avail_small
# 'large' covers fewer languages # 'large' covers fewer languages
avail_large = available_languages('large') avail_large = available_languages('large')
assert_greater(len(avail_large), 12) assert len(avail_large) > 12
assert_greater(len(avail), len(avail_large)) assert len(avail) > len(avail_large)
# Look up the digit '2' in the main word list for each language # Look up the digit '2' in the main word list for each language
for lang in avail: for lang in avail:
assert_greater(word_frequency('2', lang), 0, lang) assert word_frequency('2', lang) > 0
# Make up a weirdly verbose language code and make sure # Make up a weirdly verbose language code and make sure
# we still get it # we still get it
new_lang_code = '%s-001-x-fake-extension' % lang.upper() new_lang_code = '%s-001-x-fake-extension' % lang.upper()
assert_greater(word_frequency('2', new_lang_code), 0, new_lang_code) assert word_frequency('2', new_lang_code) > 0
def test_minimums(): def test_minimums():
eq_(word_frequency('esquivalience', 'en'), 0) assert word_frequency('esquivalience', 'en') == 0
eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6) assert word_frequency('esquivalience', 'en', minimum=1e-6) == 1e-6
eq_(word_frequency('the', 'en', minimum=1), 1) assert word_frequency('the', 'en', minimum=1) == 1
def test_most_common_words(): def test_most_common_words():
@ -65,141 +59,135 @@ def test_most_common_words():
""" """
return top_n_list(lang, 1)[0] return top_n_list(lang, 1)[0]
eq_(get_most_common('ar'), 'في') assert get_most_common('ar') == 'في'
eq_(get_most_common('de'), 'die') assert get_most_common('de') == 'die'
eq_(get_most_common('en'), 'the') assert get_most_common('en') == 'the'
eq_(get_most_common('es'), 'de') assert get_most_common('es') == 'de'
eq_(get_most_common('fr'), 'de') assert get_most_common('fr') == 'de'
eq_(get_most_common('it'), 'di') assert get_most_common('it') == 'di'
eq_(get_most_common('ja'), 'の') assert get_most_common('ja') == 'の'
eq_(get_most_common('nl'), 'de') assert get_most_common('nl') == 'de'
eq_(get_most_common('pl'), 'w') assert get_most_common('pl') == 'w'
eq_(get_most_common('pt'), 'de') assert get_most_common('pt') == 'de'
eq_(get_most_common('ru'), 'в') assert get_most_common('ru') == 'в'
eq_(get_most_common('tr'), 'bir') assert get_most_common('tr') == 'bir'
eq_(get_most_common('zh'), '的') assert get_most_common('zh') == '的'
def test_language_matching(): def test_language_matching():
freq = word_frequency('的', 'zh') freq = word_frequency('的', 'zh')
eq_(word_frequency('的', 'zh-TW'), freq) assert word_frequency('的', 'zh-TW') == freq
eq_(word_frequency('的', 'zh-CN'), freq) assert word_frequency('的', 'zh-CN') == freq
eq_(word_frequency('的', 'zh-Hant'), freq) assert word_frequency('的', 'zh-Hant') == freq
eq_(word_frequency('的', 'zh-Hans'), freq) assert word_frequency('的', 'zh-Hans') == freq
eq_(word_frequency('的', 'yue-HK'), freq) assert word_frequency('的', 'yue-HK') == freq
eq_(word_frequency('的', 'cmn'), freq) assert word_frequency('的', 'cmn') == freq
def test_cB_conversion(): def test_cB_conversion():
eq_(cB_to_freq(0), 1.) assert cB_to_freq(0) == 1.
assert_almost_equal(cB_to_freq(-100), 0.1) assert cB_to_freq(-100) == pytest.approx(0.1)
assert_almost_equal(cB_to_freq(-600), 1e-6) assert cB_to_freq(-600) == pytest.approx(1e-6)
@raises(ValueError)
def test_failed_cB_conversion(): def test_failed_cB_conversion():
cB_to_freq(1) with pytest.raises(ValueError):
cB_to_freq(1)
def test_tokenization(): def test_tokenization():
# We preserve apostrophes within words, so "can't" is a single word in the # We preserve apostrophes within words, so "can't" is a single word in the
# data # data
eq_(tokenize("I don't split at apostrophes, you see.", 'en'), assert (
['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']) tokenize("I don't split at apostrophes, you see.", 'en')
== ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']
)
eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True), assert (
['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.']) tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True)
== ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.']
)
# Certain punctuation does not inherently split a word. # Certain punctuation does not inherently split a word.
eq_(tokenize("Anything is possible at zombo.com", 'en'), assert (
['anything', 'is', 'possible', 'at', 'zombo.com']) tokenize("Anything is possible at zombo.com", 'en')
== ['anything', 'is', 'possible', 'at', 'zombo.com']
)
# Splits occur after symbols, and at splitting punctuation such as hyphens. # Splits occur after symbols, and at splitting punctuation such as hyphens.
eq_(tokenize('😂test', 'en'), ['😂', 'test']) assert tokenize('😂test', 'en') == ['😂', 'test']
assert tokenize("flip-flop", 'en') == ['flip', 'flop']
eq_(tokenize("flip-flop", 'en'), ['flip', 'flop']) assert (
tokenize('this text has... punctuation :)', 'en', include_punctuation=True)
eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True), == ['this', 'text', 'has', '...', 'punctuation', ':)']
['this', 'text', 'has', '...', 'punctuation', ':)']) )
# Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf' # Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
# and 'David Bowie' stay together, because our Unicode segmentation algorithm # and 'David Bowie' stay together, because our Unicode segmentation algorithm
# is up to date # is up to date
eq_(tokenize('emoji test 🧕🏽', 'en'), ['emoji', 'test', '🧕🏽']) assert tokenize('emoji test 🧕🏽', 'en') == ['emoji', 'test', '🧕🏽']
assert (
eq_(tokenize("👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en'), tokenize("👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en')
['👨‍🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's", == ['👨‍🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
'nothing', 'i', 'can', 'do', '🌎', '🚀']) 'nothing', 'i', 'can', 'do', '🌎', '🚀']
)
# Water wave, surfer, flag of California (indicates ridiculously complete support # Water wave, surfer, flag of California (indicates ridiculously complete support
# for Unicode 10 and Emoji 5.0) # for Unicode 10 and Emoji 5.0)
eq_(tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'",'en'), assert tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'",'en') == ["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"]
["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"])
def test_casefolding(): def test_casefolding():
eq_(tokenize('WEISS', 'de'), ['weiss']) assert tokenize('WEISS', 'de') == ['weiss']
eq_(tokenize('weiß', 'de'), ['weiss']) assert tokenize('weiß', 'de') == ['weiss']
eq_(tokenize('İstanbul', 'tr'), ['istanbul']) assert tokenize('İstanbul', 'tr') == ['istanbul']
eq_(tokenize('SIKISINCA', 'tr'), ['sıkısınca']) assert tokenize('SIKISINCA', 'tr') == ['sıkısınca']
def test_number_smashing(): def test_number_smashing():
eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'), assert tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['715', 'crσσks', 'by', 'bon', 'iver']
['715', 'crσσks', 'by', 'bon', 'iver']) assert lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['000', 'crσσks', 'by', 'bon', 'iver']
eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'), assert (
['000', 'crσσks', 'by', 'bon', 'iver']) lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True)
eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True), == ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver']
['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver']) )
eq_(lossy_tokenize('1', 'en'), ['1']) assert lossy_tokenize('1', 'en') == ['1']
eq_(lossy_tokenize('3.14', 'en'), ['0.00']) assert lossy_tokenize('3.14', 'en') == ['0.00']
eq_(lossy_tokenize('24601', 'en'), ['00000']) assert lossy_tokenize('24601', 'en') == ['00000']
eq_(word_frequency('24601', 'en'), word_frequency('90210', 'en')) assert word_frequency('24601', 'en') == word_frequency('90210', 'en')
def test_phrase_freq(): def test_phrase_freq():
ff = word_frequency("flip-flop", 'en') ff = word_frequency("flip-flop", 'en')
assert_greater(ff, 0) assert ff > 0
assert_almost_equal( phrase_freq = 1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
1.0 / ff, assert 1.0 / ff == pytest.approx(phrase_freq)
1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
)
def test_not_really_random(): def test_not_really_random():
# If your xkcd-style password comes out like this, maybe you shouldn't # If your xkcd-style password comes out like this, maybe you shouldn't
# use it # use it
eq_(random_words(nwords=4, lang='en', bits_per_word=0), assert random_words(nwords=4, lang='en', bits_per_word=0) == 'the the the the'
'the the the the')
# This not only tests random_ascii_words, it makes sure we didn't end # This not only tests random_ascii_words, it makes sure we didn't end
# up with 'eos' as a very common Japanese word # up with 'eos' as a very common Japanese word
eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0), assert random_ascii_words(nwords=4, lang='ja', bits_per_word=0) == '1 1 1 1'
'1 1 1 1')
@raises(ValueError)
def test_not_enough_ascii(): def test_not_enough_ascii():
random_ascii_words(lang='zh', bits_per_word=14) with pytest.raises(ValueError):
random_ascii_words(lang='zh', bits_per_word=14)
def test_arabic(): def test_arabic():
# Remove tatweels # Remove tatweels
eq_( assert tokenize('متــــــــعب', 'ar') == ['متعب']
tokenize('متــــــــعب', 'ar'),
['متعب']
)
# Remove combining marks # Remove combining marks
eq_( assert tokenize('حَرَكَات', 'ar') == ['حركات']
tokenize('حَرَكَات', 'ar'),
['حركات']
)
eq_( # An Arabic ligature that is affected by NFKC normalization
tokenize('\ufefb', 'ar'), # An Arabic ligature... assert tokenize('\ufefb', 'ar') == ['\u0644\u0627']
['\u0644\u0627'] # ...that is affected by NFKC normalization
)
def test_ideographic_fallback(): def test_ideographic_fallback():
@ -207,29 +195,28 @@ def test_ideographic_fallback():
# #
# More complex examples like this, involving the multiple scripts of Japanese, # More complex examples like this, involving the multiple scripts of Japanese,
# are in test_japanese.py. # are in test_japanese.py.
eq_(tokenize('中国文字', 'en'), ['中国文字']) assert tokenize('中国文字', 'en') == ['中国文字']
def test_other_languages(): def test_other_languages():
# Test that we leave Thai letters stuck together. If we had better Thai support, # Test that we leave Thai letters stuck together. If we had better Thai support,
# we would actually split this into a three-word phrase. # we would actually split this into a three-word phrase.
eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี']) assert tokenize('การเล่นดนตรี', 'th') == ['การเล่นดนตรี']
eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'), assert tokenize('"การเล่นดนตรี" means "playing music"', 'en') == ['การเล่นดนตรี', 'means', 'playing', 'music']
['การเล่นดนตรี', 'means', 'playing', 'music'])
# Test Khmer, a script similar to Thai # Test Khmer, a script similar to Thai
eq_(tokenize('សូមស្វាគមន៍', 'km'), ['សូមស្វាគមន៍']) assert tokenize('សូមស្វាគមន៍', 'km') == ['សូមស្វាគមន៍']
# Test Hindi -- tokens split where there are spaces, and not where there aren't # Test Hindi -- tokens split where there are spaces, and not where there aren't
eq_(tokenize('हिन्दी विक्षनरी', 'hi'), ['हिन्दी', 'विक्षनरी']) assert tokenize('हिन्दी विक्षनरी', 'hi') == ['हिन्दी', 'विक्षनरी']
# Remove vowel points in Hebrew # Remove vowel points in Hebrew
eq_(tokenize('דֻּגְמָה', 'he'), ['דגמה']) assert tokenize('דֻּגְמָה', 'he') == ['דגמה']
# Deal with commas, cedillas, and I's in Turkish # Deal with commas, cedillas, and I's in Turkish
eq_(tokenize('kișinin', 'tr'), ['kişinin']) assert tokenize('kișinin', 'tr') == ['kişinin']
eq_(tokenize('KİȘİNİN', 'tr'), ['kişinin']) assert tokenize('KİȘİNİN', 'tr') == ['kişinin']
# Deal with cedillas that should be commas-below in Romanian # Deal with cedillas that should be commas-below in Romanian
eq_(tokenize('acelaşi', 'ro'), ['același']) assert tokenize('acelaşi', 'ro') == ['același']
eq_(tokenize('ACELAŞI', 'ro'), ['același']) assert tokenize('ACELAŞI', 'ro') == ['același']

View File

@ -1,5 +1,5 @@
from nose.tools import eq_, assert_almost_equal, assert_greater
from wordfreq import tokenize, word_frequency from wordfreq import tokenize, word_frequency
import pytest
def test_tokens(): def test_tokens():
@ -17,64 +17,49 @@ def test_tokens():
# His name breaks into five pieces, with the only piece staying together # His name breaks into five pieces, with the only piece staying together
# being the one that means 'Bart'. The dot is not included as a token. # being the one that means 'Bart'. The dot is not included as a token.
eq_( assert tokenize(hobart, 'zh') == ['加', '勒', '特', '霍', '巴特']
tokenize(hobart, 'zh'),
['加', '勒', '特', '霍', '巴特']
)
eq_( assert tokenize(fact_simplified, 'zh') == [
tokenize(fact_simplified, 'zh'), # he / is / history / in / #6 / counter for people
[ '他', '是', '历史', '上', '第六', '位',
# he / is / history / in / #6 / counter for people # during / term of office / in / die
'他', '是', '历史', '上', '第六', '位', '在', '任期', '内', '去世',
# during / term of office / in / die # of / U.S. / deputy / president
'在', '任期', '内', '去世', '的', '美国', '副', '总统'
# of / U.S. / deputy / president ]
'的', '美国', '副', '总统'
]
)
# Jieba's original tokenizer knows a lot of names, it seems. # Jieba's original tokenizer knows a lot of names, it seems.
eq_( assert tokenize(hobart, 'zh', external_wordlist=True) == ['加勒特', '霍巴特']
tokenize(hobart, 'zh', external_wordlist=True),
['加勒特', '霍巴特']
)
# We get almost the same tokens from the sentence using Jieba's own # We get almost the same tokens from the sentence using Jieba's own
# wordlist, but it tokenizes "in history" as two words and # wordlist, but it tokenizes "in history" as two words and
# "sixth person" as one. # "sixth person" as one.
eq_( assert tokenize(fact_simplified, 'zh', external_wordlist=True) == [
tokenize(fact_simplified, 'zh', external_wordlist=True), # he / is / history / in / sixth person
[ '他', '是', '历史', '上', '第六位',
# he / is / history / in / sixth person # during / term of office / in / die
'他', '是', '历史', '上', '第六位', '在', '任期', '内', '去世',
# during / term of office / in / die # of / U.S. / deputy / president
'在', '任期', '内', '去世', '的', '美国', '副', '总统'
# of / U.S. / deputy / president ]
'的', '美国', '副', '总统'
]
)
# Check that Traditional Chinese works at all # Check that Traditional Chinese works at all
assert_greater(word_frequency(fact_traditional, 'zh'), 0) assert word_frequency(fact_traditional, 'zh') > 0
# You get the same token lengths if you look it up in Traditional Chinese, # You get the same token lengths if you look it up in Traditional Chinese,
# but the words are different # but the words are different
simp_tokens = tokenize(fact_simplified, 'zh', include_punctuation=True) simp_tokens = tokenize(fact_simplified, 'zh', include_punctuation=True)
trad_tokens = tokenize(fact_traditional, 'zh', include_punctuation=True) trad_tokens = tokenize(fact_traditional, 'zh', include_punctuation=True)
eq_(''.join(simp_tokens), fact_simplified) assert ''.join(simp_tokens) == fact_simplified
eq_(''.join(trad_tokens), fact_traditional) assert ''.join(trad_tokens) == fact_traditional
simp_lengths = [len(token) for token in simp_tokens] simp_lengths = [len(token) for token in simp_tokens]
trad_lengths = [len(token) for token in trad_tokens] trad_lengths = [len(token) for token in trad_tokens]
eq_(simp_lengths, trad_lengths) assert simp_lengths == trad_lengths
def test_combination(): def test_combination():
xiexie_freq = word_frequency('谢谢', 'zh') # "Thanks" xiexie_freq = word_frequency('谢谢', 'zh') # "Thanks"
assert_almost_equal( assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20)
word_frequency('谢谢谢谢', 'zh'),
xiexie_freq / 20
)
def test_alternate_codes(): def test_alternate_codes():
@ -83,12 +68,12 @@ def test_alternate_codes():
tokens = ['谢谢', '谢谢'] tokens = ['谢谢', '谢谢']
# Code with a region attached # Code with a region attached
eq_(tokenize('谢谢谢谢', 'zh-CN'), tokens) assert tokenize('谢谢谢谢', 'zh-CN') == tokens
# Over-long codes for Chinese # Over-long codes for Chinese
eq_(tokenize('谢谢谢谢', 'chi'), tokens) assert tokenize('谢谢谢谢', 'chi') == tokens
eq_(tokenize('谢谢谢谢', 'zho'), tokens) assert tokenize('谢谢谢谢', 'zho') == tokens
# Separate codes for Mandarin and Cantonese # Separate codes for Mandarin and Cantonese
eq_(tokenize('谢谢谢谢', 'cmn'), tokens) assert tokenize('谢谢谢谢', 'cmn') == tokens
eq_(tokenize('谢谢谢谢', 'yue'), tokens) assert tokenize('谢谢谢谢', 'yue') == tokens