From 75b4d62084fb9de3a1ba39ad34e01ac6467bca79 Mon Sep 17 00:00:00 2001
From: Robyn Speer <rspeer@luminoso.com>
Date: Fri, 1 Jun 2018 16:33:06 -0400
Subject: [PATCH] port test.py and test_chinese.py to pytest

---
 setup.cfg             |  12 +-
 tests/test.py         | 211 ++++++++++++++++++++----------------------
 tests/test_chinese.py |  73 ++++++---------
 3 files changed, 135 insertions(+), 161 deletions(-)

diff --git a/setup.cfg b/setup.cfg
index aadbdf3..b7e4789 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,7 @@
-[nosetests]
-verbosity=2
-with-doctest=1
-with-coverage=0
-cover-package=wordfreq
+[aliases]
+test=pytest
+
+[tool:pytest]
+# tests/test.py does not match pytest's default python_files patterns
+# (test_*.py and *_test.py), so widen them or it is never collected.
+python_files = test*.py
diff --git a/tests/test.py b/tests/test.py
index d7fb321..164ea83 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -2,57 +2,51 @@ from wordfreq import (
     word_frequency, available_languages, cB_to_freq,
     top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize
 )
-from nose.tools import (
-    eq_, assert_almost_equal, assert_greater, raises, assert_not_equal
-)
+import pytest
 
 
 def test_freq_examples():
     # Stopwords are most common in the correct language
-    assert_greater(word_frequency('the', 'en'),
-                   word_frequency('de', 'en'))
-
-    assert_greater(word_frequency('de', 'es'),
-                   word_frequency('the', 'es'))
-
+    assert word_frequency('the', 'en') > word_frequency('de', 'en')
+    assert word_frequency('de', 'es') > word_frequency('the', 'es')
     # We get word frequencies from the 'large' list when available
-    assert_greater(word_frequency('infrequency', 'en'), 0.)
+    assert word_frequency('infrequency', 'en') > 0.
 
 
 def test_languages():
     # Make sure we get all the languages when looking for the default
     # 'best' wordlist
     avail = available_languages()
-    assert_greater(len(avail), 32)
+    assert len(avail) > 32
 
     # 'small' covers the same languages, but with some different lists
     avail_small = available_languages('small')
-    eq_(len(avail_small), len(avail))
-    assert_not_equal(avail_small, avail)
+    assert len(avail_small) == len(avail)
+    assert avail_small != avail
 
     # 'combined' is the same as 'small'
     avail_old_name = available_languages('combined')
-    eq_(avail_old_name, avail_small)
+    assert avail_old_name == avail_small
 
     # 'large' covers fewer languages
     avail_large = available_languages('large')
-    assert_greater(len(avail_large), 12)
-    assert_greater(len(avail), len(avail_large))
+    assert len(avail_large) > 12
+    assert len(avail) > len(avail_large)
 
     # Look up the digit '2' in the main word list for each language
     for lang in avail:
-        assert_greater(word_frequency('2', lang), 0, lang)
+        assert word_frequency('2', lang) > 0, lang
 
         # Make up a weirdly verbose language code and make sure
         # we still get it
         new_lang_code = '%s-001-x-fake-extension' % lang.upper()
-        assert_greater(word_frequency('2', new_lang_code), 0, new_lang_code)
+        assert word_frequency('2', new_lang_code) > 0, new_lang_code
 
 
 def test_minimums():
-    eq_(word_frequency('esquivalience', 'en'), 0)
-    eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6)
-    eq_(word_frequency('the', 'en', minimum=1), 1)
+    assert word_frequency('esquivalience', 'en') == 0
+    assert word_frequency('esquivalience', 'en', minimum=1e-6) == 1e-6
+    assert word_frequency('the', 'en', minimum=1) == 1
 
 
 def test_most_common_words():
@@ -65,141 +59,135 @@ def test_most_common_words():
         """
         return top_n_list(lang, 1)[0]
 
-    eq_(get_most_common('ar'), 'في')
-    eq_(get_most_common('de'), 'die')
-    eq_(get_most_common('en'), 'the')
-    eq_(get_most_common('es'), 'de')
-    eq_(get_most_common('fr'), 'de')
-    eq_(get_most_common('it'), 'di')
-    eq_(get_most_common('ja'), 'の')
-    eq_(get_most_common('nl'), 'de')
-    eq_(get_most_common('pl'), 'w')
-    eq_(get_most_common('pt'), 'de')
-    eq_(get_most_common('ru'), 'в')
-    eq_(get_most_common('tr'), 'bir')
-    eq_(get_most_common('zh'), '的')
+    assert get_most_common('ar') == 'في'
+    assert get_most_common('de') == 'die'
+    assert get_most_common('en') == 'the'
+    assert get_most_common('es') == 'de'
+    assert get_most_common('fr') == 'de'
+    assert get_most_common('it') == 'di'
+    assert get_most_common('ja') == 'の'
+    assert get_most_common('nl') == 'de'
+    assert get_most_common('pl') == 'w'
+    assert get_most_common('pt') == 'de'
+    assert get_most_common('ru') == 'в'
+    assert get_most_common('tr') == 'bir'
+    assert get_most_common('zh') == '的'
 
 
 def test_language_matching():
     freq = word_frequency('的', 'zh')
-    eq_(word_frequency('的', 'zh-TW'), freq)
-    eq_(word_frequency('的', 'zh-CN'), freq)
-    eq_(word_frequency('的', 'zh-Hant'), freq)
-    eq_(word_frequency('的', 'zh-Hans'), freq)
-    eq_(word_frequency('的', 'yue-HK'), freq)
-    eq_(word_frequency('的', 'cmn'), freq)
+    assert word_frequency('的', 'zh-TW') == freq
+    assert word_frequency('的', 'zh-CN') == freq
+    assert word_frequency('的', 'zh-Hant') == freq
+    assert word_frequency('的', 'zh-Hans') == freq
+    assert word_frequency('的', 'yue-HK') == freq
+    assert word_frequency('的', 'cmn') == freq
 
 
 def test_cB_conversion():
-    eq_(cB_to_freq(0), 1.)
-    assert_almost_equal(cB_to_freq(-100), 0.1)
-    assert_almost_equal(cB_to_freq(-600), 1e-6)
+    assert cB_to_freq(0) == 1.
+    assert cB_to_freq(-100) == pytest.approx(0.1)
+    assert cB_to_freq(-600) == pytest.approx(1e-6)
 
 
-@raises(ValueError)
 def test_failed_cB_conversion():
-    cB_to_freq(1)
+    with pytest.raises(ValueError):
+        cB_to_freq(1)
 
 
 def test_tokenization():
     # We preserve apostrophes within words, so "can't" is a single word in the
     # data
-    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
-        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
+    assert (
+        tokenize("I don't split at apostrophes, you see.", 'en')
+        == ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']
+    )
 
-    eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
-        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
+    assert (
+        tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True)
+        == ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.']
+    )
 
     # Certain punctuation does not inherently split a word.
-    eq_(tokenize("Anything is possible at zombo.com", 'en'),
-        ['anything', 'is', 'possible', 'at', 'zombo.com'])
+    assert (
+        tokenize("Anything is possible at zombo.com", 'en')
+        == ['anything', 'is', 'possible', 'at', 'zombo.com']
+    )
 
     # Splits occur after symbols, and at splitting punctuation such as hyphens.
-    eq_(tokenize('😂test', 'en'), ['😂', 'test'])
-
-    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])
-
-    eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
-        ['this', 'text', 'has', '...', 'punctuation', ':)'])
+    assert tokenize('😂test', 'en') == ['😂', 'test']
+    assert tokenize("flip-flop", 'en') == ['flip', 'flop']
+    assert (
+        tokenize('this text has... punctuation :)', 'en', include_punctuation=True)
+        == ['this', 'text', 'has', '...', 'punctuation', ':)']
+    )
 
     # Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
     # and 'David Bowie' stay together, because our Unicode segmentation algorithm
     # is up to date
-    eq_(tokenize('emoji test 🧕🏽', 'en'), ['emoji', 'test', '🧕🏽'])
-
-    eq_(tokenize("👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en'),
-        ['👨‍🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
-         'nothing', 'i', 'can', 'do', '🌎', '🚀'])
+    assert tokenize('emoji test 🧕🏽', 'en') == ['emoji', 'test', '🧕🏽']
+    assert (
+        tokenize("👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en')
+        == ['👨‍🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
+            'nothing', 'i', 'can', 'do', '🌎', '🚀']
+    )
 
     # Water wave, surfer, flag of California (indicates ridiculously complete support
     # for Unicode 10 and Emoji 5.0)
-    eq_(tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'",'en'),
-        ["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"])
+    assert tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'",'en') == ["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"]
 
 
 def test_casefolding():
-    eq_(tokenize('WEISS', 'de'), ['weiss'])
-    eq_(tokenize('weiß', 'de'), ['weiss'])
-    eq_(tokenize('İstanbul', 'tr'), ['istanbul'])
-    eq_(tokenize('SIKISINCA', 'tr'), ['sıkısınca'])
+    assert tokenize('WEISS', 'de') == ['weiss']
+    assert tokenize('weiß', 'de') == ['weiss']
+    assert tokenize('İstanbul', 'tr') == ['istanbul']
+    assert tokenize('SIKISINCA', 'tr') == ['sıkısınca']
 
 
 def test_number_smashing():
-    eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'),
-        ['715', 'crσσks', 'by', 'bon', 'iver'])
-    eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'),
-        ['000', 'crσσks', 'by', 'bon', 'iver'])
-    eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True),
-        ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver'])
-    eq_(lossy_tokenize('1', 'en'), ['1'])
-    eq_(lossy_tokenize('3.14', 'en'), ['0.00'])
-    eq_(lossy_tokenize('24601', 'en'), ['00000'])
-    eq_(word_frequency('24601', 'en'), word_frequency('90210', 'en'))
+    assert tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['715', 'crσσks', 'by', 'bon', 'iver']
+    assert lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['000', 'crσσks', 'by', 'bon', 'iver']
+    assert (
+        lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True)
+        == ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver']
+    )
+    assert lossy_tokenize('1', 'en') == ['1']
+    assert lossy_tokenize('3.14', 'en') == ['0.00']
+    assert lossy_tokenize('24601', 'en') == ['00000']
+    assert word_frequency('24601', 'en') == word_frequency('90210', 'en')
 
 
 def test_phrase_freq():
     ff = word_frequency("flip-flop", 'en')
-    assert_greater(ff, 0)
-    assert_almost_equal(
-        1.0 / ff,
-        1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
-    )
+    assert ff > 0
+    inverse_freq_sum = 1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
+    assert 1.0 / ff == pytest.approx(inverse_freq_sum)
 
 
 def test_not_really_random():
     # If your xkcd-style password comes out like this, maybe you shouldn't
     # use it
-    eq_(random_words(nwords=4, lang='en', bits_per_word=0),
-        'the the the the')
+    assert random_words(nwords=4, lang='en', bits_per_word=0) == 'the the the the'
 
     # This not only tests random_ascii_words, it makes sure we didn't end
     # up with 'eos' as a very common Japanese word
-    eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),
-        '1 1 1 1')
+    assert random_ascii_words(nwords=4, lang='ja', bits_per_word=0) == '1 1 1 1'
 
 
-@raises(ValueError)
 def test_not_enough_ascii():
-    random_ascii_words(lang='zh', bits_per_word=14)
+    with pytest.raises(ValueError):
+        random_ascii_words(lang='zh', bits_per_word=14)
 
 
 def test_arabic():
     # Remove tatweels
-    eq_(
-        tokenize('متــــــــعب', 'ar'),
-        ['متعب']
-    )
+    assert tokenize('متــــــــعب', 'ar') == ['متعب']
 
     # Remove combining marks
-    eq_(
-        tokenize('حَرَكَات', 'ar'),
-        ['حركات']
-    )
+    assert tokenize('حَرَكَات', 'ar') == ['حركات']
 
-    eq_(
-        tokenize('\ufefb', 'ar'),  # An Arabic ligature...
-        ['\u0644\u0627']  # ...that is affected by NFKC normalization
-    )
+    # An Arabic ligature that is affected by NFKC normalization
+    assert tokenize('\ufefb', 'ar') == ['\u0644\u0627']
 
 
 def test_ideographic_fallback():
@@ -207,29 +195,28 @@ def test_ideographic_fallback():
     #
     # More complex examples like this, involving the multiple scripts of Japanese,
     # are in test_japanese.py.
-    eq_(tokenize('中国文字', 'en'), ['中国文字'])
+    assert tokenize('中国文字', 'en') == ['中国文字']
 
 
 def test_other_languages():
     # Test that we leave Thai letters stuck together. If we had better Thai support,
     # we would actually split this into a three-word phrase.
-    eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
-    eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
-        ['การเล่นดนตรี', 'means', 'playing', 'music'])
+    assert tokenize('การเล่นดนตรี', 'th') == ['การเล่นดนตรี']
+    assert tokenize('"การเล่นดนตรี" means "playing music"', 'en') == ['การเล่นดนตรี', 'means', 'playing', 'music']
 
     # Test Khmer, a script similar to Thai
-    eq_(tokenize('សូមស្វាគមន៍', 'km'), ['សូមស្វាគមន៍'])
+    assert tokenize('សូមស្វាគមន៍', 'km') == ['សូមស្វាគមន៍']
 
     # Test Hindi -- tokens split where there are spaces, and not where there aren't
-    eq_(tokenize('हिन्दी विक्षनरी', 'hi'), ['हिन्दी', 'विक्षनरी'])
+    assert tokenize('हिन्दी विक्षनरी', 'hi') == ['हिन्दी', 'विक्षनरी']
 
     # Remove vowel points in Hebrew
-    eq_(tokenize('דֻּגְמָה', 'he'), ['דגמה'])
+    assert tokenize('דֻּגְמָה', 'he') == ['דגמה']
 
     # Deal with commas, cedillas, and I's in Turkish
-    eq_(tokenize('kișinin', 'tr'), ['kişinin'])
-    eq_(tokenize('KİȘİNİN', 'tr'), ['kişinin'])
+    assert tokenize('kișinin', 'tr') == ['kişinin']
+    assert tokenize('KİȘİNİN', 'tr') == ['kişinin']
 
     # Deal with cedillas that should be commas-below in Romanian
-    eq_(tokenize('acelaşi', 'ro'), ['același'])
-    eq_(tokenize('ACELAŞI', 'ro'), ['același'])
+    assert tokenize('acelaşi', 'ro') == ['același']
+    assert tokenize('ACELAŞI', 'ro') == ['același']
diff --git a/tests/test_chinese.py b/tests/test_chinese.py
index 58df4a1..a5e9f51 100644
--- a/tests/test_chinese.py
+++ b/tests/test_chinese.py
@@ -1,5 +1,5 @@
-from nose.tools import eq_, assert_almost_equal, assert_greater
 from wordfreq import tokenize, word_frequency
+import pytest
 
 
 def test_tokens():
@@ -17,64 +17,49 @@ def test_tokens():
 
     # His name breaks into five pieces, with the only piece staying together
     # being the one that means 'Bart'. The dot is not included as a token.
-    eq_(
-        tokenize(hobart, 'zh'),
-        ['加', '勒', '特', '霍', '巴特']
-    )
+    assert tokenize(hobart, 'zh') == ['加', '勒', '特', '霍', '巴特']
 
-    eq_(
-        tokenize(fact_simplified, 'zh'),
-        [
-            # he / is / history / in / #6 / counter for people
-            '他', '是',  '历史', '上', '第六', '位',
-            # during / term of office / in / die
-            '在', '任期', '内', '去世',
-            # of / U.S. / deputy / president
-            '的', '美国', '副', '总统'
-        ]
-    )
+    assert tokenize(fact_simplified, 'zh') == [
+        # he / is / history / in / #6 / counter for people
+        '他', '是',  '历史', '上', '第六', '位',
+        # during / term of office / in / die
+        '在', '任期', '内', '去世',
+        # of / U.S. / deputy / president
+        '的', '美国', '副', '总统'
+    ]
 
     # Jieba's original tokenizer knows a lot of names, it seems.
-    eq_(
-        tokenize(hobart, 'zh', external_wordlist=True),
-        ['加勒特', '霍巴特']
-    )
+    assert tokenize(hobart, 'zh', external_wordlist=True) == ['加勒特', '霍巴特']
 
     # We get almost the same tokens from the sentence using Jieba's own
     # wordlist, but it tokenizes "in history" as two words and
     # "sixth person" as one.
-    eq_(
-        tokenize(fact_simplified, 'zh', external_wordlist=True),
-        [
-            # he / is / history / in / sixth person
-            '他', '是', '历史', '上', '第六位',
-            # during / term of office / in / die
-            '在', '任期', '内', '去世',
-            # of / U.S. / deputy / president
-            '的', '美国', '副', '总统'
-        ]
-    )
+    assert tokenize(fact_simplified, 'zh', external_wordlist=True) == [
+        # he / is / history / in / sixth person
+        '他', '是', '历史', '上', '第六位',
+        # during / term of office / in / die
+        '在', '任期', '内', '去世',
+        # of / U.S. / deputy / president
+        '的', '美国', '副', '总统'
+    ]
 
     # Check that Traditional Chinese works at all
-    assert_greater(word_frequency(fact_traditional, 'zh'), 0)
+    assert word_frequency(fact_traditional, 'zh') > 0
 
     # You get the same token lengths if you look it up in Traditional Chinese,
     # but the words are different
     simp_tokens = tokenize(fact_simplified, 'zh', include_punctuation=True)
     trad_tokens = tokenize(fact_traditional, 'zh', include_punctuation=True)
-    eq_(''.join(simp_tokens), fact_simplified)
-    eq_(''.join(trad_tokens), fact_traditional)
+    assert ''.join(simp_tokens) == fact_simplified
+    assert ''.join(trad_tokens) == fact_traditional
     simp_lengths = [len(token) for token in simp_tokens]
     trad_lengths = [len(token) for token in trad_tokens]
-    eq_(simp_lengths, trad_lengths)
+    assert simp_lengths == trad_lengths
 
 
 def test_combination():
     xiexie_freq = word_frequency('谢谢', 'zh')   # "Thanks"
-    assert_almost_equal(
-        word_frequency('谢谢谢谢', 'zh'),
-        xiexie_freq / 20
-    )
+    assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20)
 
 
 def test_alternate_codes():
@@ -83,12 +68,12 @@ def test_alternate_codes():
     tokens = ['谢谢', '谢谢']
 
     # Code with a region attached
-    eq_(tokenize('谢谢谢谢', 'zh-CN'), tokens)
+    assert tokenize('谢谢谢谢', 'zh-CN') == tokens
 
     # Over-long codes for Chinese
-    eq_(tokenize('谢谢谢谢', 'chi'), tokens)
-    eq_(tokenize('谢谢谢谢', 'zho'), tokens)
+    assert tokenize('谢谢谢谢', 'chi') == tokens
+    assert tokenize('谢谢谢谢', 'zho') == tokens
 
     # Separate codes for Mandarin and Cantonese
-    eq_(tokenize('谢谢谢谢', 'cmn'), tokens)
-    eq_(tokenize('谢谢谢谢', 'yue'), tokens)
+    assert tokenize('谢谢谢谢', 'cmn') == tokens
+    assert tokenize('谢谢谢谢', 'yue') == tokens