diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..df3eb51
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,2 @@
+[pytest]
+addopts = --doctest-modules
diff --git a/tests/test_french_and_related.py b/tests/test_french_and_related.py
index c347213..58b2d88 100644
--- a/tests/test_french_and_related.py
+++ b/tests/test_french_and_related.py
@@ -1,40 +1,32 @@
-from nose.tools import eq_, assert_almost_equal
 from wordfreq import tokenize, word_frequency
 
 
 def test_apostrophes():
     # Test that we handle apostrophes in French reasonably.
-    eq_(tokenize("qu'un", 'fr'), ['qu', 'un'])
-    eq_(tokenize("qu'un", 'fr', include_punctuation=True),
-        ["qu'", "un"])
-    eq_(tokenize("langues d'oïl", 'fr'),
-        ['langues', "d", 'oïl'])
-    eq_(tokenize("langues d'oïl", 'fr', include_punctuation=True),
-        ['langues', "d'", 'oïl'])
-    eq_(tokenize("l'heure", 'fr'),
-        ['l', 'heure'])
-    eq_(tokenize("l'heure", 'fr', include_punctuation=True),
-        ["l'", 'heure'])
-    eq_(tokenize("L'Hôpital", 'fr', include_punctuation=True),
-        ["l'", 'hôpital'])
-    eq_(tokenize("aujourd'hui", 'fr'), ["aujourd'hui"])
-    eq_(tokenize("This isn't French", 'en'),
-        ['this', "isn't", 'french'])
+    assert tokenize("qu'un", 'fr') == ['qu', 'un']
+    assert tokenize("qu'un", 'fr', include_punctuation=True) == ["qu'", "un"]
+    assert tokenize("langues d'oïl", 'fr') == ['langues', "d", 'oïl']
+    assert tokenize("langues d'oïl", 'fr', include_punctuation=True) == ['langues', "d'", 'oïl']
+    assert tokenize("l'heure", 'fr') == ['l', 'heure']
+    assert tokenize("l'heure", 'fr', include_punctuation=True) == ["l'", 'heure']
+    assert tokenize("L'Hôpital", 'fr', include_punctuation=True) == ["l'", 'hôpital']
+    assert tokenize("aujourd'hui", 'fr') == ["aujourd'hui"]
+    assert tokenize("This isn't French", 'en') == ['this', "isn't", 'french']
 
 
 def test_catastrophes():
     # More apostrophes, but this time they're in Catalan, and there's other
     # mid-word punctuation going on too.
-    eq_(tokenize("M'acabo d'instal·lar.", 'ca'),
-        ['m', 'acabo', 'd', 'instal·lar'])
-    eq_(tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True),
-        ["m'", 'acabo', "d'", 'instal·lar', '.'])
+    assert tokenize("M'acabo d'instal·lar.", 'ca') == ['m', 'acabo', 'd', 'instal·lar']
+    assert (
+        tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True) ==
+        ["m'", 'acabo', "d'", 'instal·lar', '.']
+    )
 
 
 def test_alternate_codes():
     # Try over-long language codes for French and Catalan
-    eq_(tokenize("qu'un", 'fra'), ['qu', 'un'])
-    eq_(tokenize("qu'un", 'fre'), ['qu', 'un'])
-    eq_(tokenize("M'acabo d'instal·lar.", 'cat'),
-        ['m', 'acabo', 'd', 'instal·lar'])
+    assert tokenize("qu'un", 'fra') == ['qu', 'un']
+    assert tokenize("qu'un", 'fre') == ['qu', 'un']
+    assert tokenize("M'acabo d'instal·lar.", 'cat') == ['m', 'acabo', 'd', 'instal·lar']
diff --git a/tests/test_japanese.py b/tests/test_japanese.py
index 1cd1efa..5e977cf 100644
--- a/tests/test_japanese.py
+++ b/tests/test_japanese.py
@@ -1,10 +1,9 @@
-from nose.tools import eq_, assert_almost_equal
 from wordfreq import tokenize, simple_tokenize, word_frequency
+import pytest
 
 
 def test_tokens():
-    eq_(tokenize('おはようございます', 'ja'),
-        ['おはよう', 'ござい', 'ます'])
+    assert tokenize('おはようございます', 'ja') == ['おはよう', 'ござい', 'ます']
 
 
 def test_simple_tokenize():
@@ -19,31 +18,29 @@
     # We used to try to infer word boundaries between hiragana and katakana,
     # but this leads to edge cases that are unsolvable without a dictionary.
     ja_text = 'ひらがなカタカナromaji'
-    eq_(
-        simple_tokenize(ja_text),
-        ['ひらがなカタカナ', 'romaji']
-    )
+    assert simple_tokenize(ja_text) == ['ひらがなカタカナ', 'romaji']
+
 
     # An example that would be multiple tokens if tokenized as 'ja' via MeCab,
     # but sticks together in simple_tokenize
-    eq_(simple_tokenize('おはようございます'), ['おはようございます'])
+    assert simple_tokenize('おはようございます') == ['おはようございます']
 
     # Names that use the weird possessive marker ヶ, which is technically a
     # katakana even though it's being used like a kanji, stay together as one
     # token
-    eq_(simple_tokenize("犬ヶ島"), ["犬ヶ島"])
+    assert simple_tokenize("犬ヶ島") == ["犬ヶ島"]
 
     # The word in ConceptNet that made me notice that simple_tokenize used
     # to have a problem with the character 々
-    eq_(simple_tokenize("晴々しい"), ["晴々しい"])
+    assert simple_tokenize("晴々しい") == ["晴々しい"]
 
     # Explicit word separators are still token boundaries, such as the dot
     # between "toner" and "cartridge" in "toner cartridge"
-    eq_(simple_tokenize("トナー・カートリッジ"), ["トナー", "カートリッジ"])
+    assert simple_tokenize("トナー・カートリッジ") == ["トナー", "カートリッジ"]
 
     # This word has multiple weird characters that aren't quite kanji in it,
     # and is in the dictionary
-    eq_(simple_tokenize("見ヶ〆料"), ["見ヶ〆料"])
+    assert simple_tokenize("見ヶ〆料") == ["見ヶ〆料"]
@@ -52,12 +49,11 @@ def test_combination():
     gozai_freq = word_frequency('ござい', 'ja')
     masu_freq = word_frequency('ます', 'ja')
 
-    assert_almost_equal(
-        word_frequency('おはようおはよう', 'ja'),
-        ohayou_freq / 2
-    )
-    assert_almost_equal(
-        1.0 / word_frequency('おはようございます', 'ja'),
-        1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
+    assert word_frequency('おはようおはよう', 'ja') == pytest.approx(ohayou_freq / 2)
+
+    assert (
+        1.0 / word_frequency('おはようございます', 'ja') ==
+        pytest.approx(1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq)
     )
+
diff --git a/tests/test_korean.py b/tests/test_korean.py
index bcbf29c..96d599a 100644
--- a/tests/test_korean.py
+++ b/tests/test_korean.py
@@ -1,22 +1,18 @@
-from nose.tools import eq_, assert_almost_equal
 from wordfreq import tokenize, word_frequency
+import pytest
 
 
 def test_tokens():
-    eq_(tokenize('감사합니다', 'ko'),
-        ['감사', '합니다'])
+    assert tokenize('감사합니다', 'ko') == ['감사', '합니다']
 
 
 def test_combination():
     gamsa_freq = word_frequency('감사', 'ko')
     habnida_freq = word_frequency('합니다', 'ko')
 
-    assert_almost_equal(
-        word_frequency('감사감사', 'ko'),
-        gamsa_freq / 2
-    )
-    assert_almost_equal(
-        1.0 / word_frequency('감사합니다', 'ko'),
-        1.0 / gamsa_freq + 1.0 / habnida_freq
+    assert word_frequency('감사감사', 'ko') == pytest.approx(gamsa_freq / 2)
+    assert (
+        1.0 / word_frequency('감사합니다', 'ko') ==
+        pytest.approx(1.0 / gamsa_freq + 1.0 / habnida_freq)
     )
diff --git a/tests/test_transliteration.py b/tests/test_transliteration.py
index d7e4455..08486b0 100644
--- a/tests/test_transliteration.py
+++ b/tests/test_transliteration.py
@@ -1,15 +1,18 @@
-from nose.tools import eq_
 from wordfreq import tokenize
 from wordfreq.preprocess import preprocess_text
 
 
 def test_transliteration():
     # "Well, there's a lot of things you do not understand."
-    # (from somewhere in OpenSubtitles)
-    eq_(tokenize("Па, има ту много ствари које не схваташ.", 'sr'),
-        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
-    eq_(tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr'),
-        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
+    # (from somewhere in OpenSubtitles)
+    assert (
+        tokenize("Па, има ту много ствари које не схваташ.", 'sr') ==
+        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
+    )
+    assert (
+        tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr') ==
+        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
+    )
 
     # I don't have examples of complete sentences in Azerbaijani that are
     # naturally in Cyrillic, because it turns out everyone writes Azerbaijani
@@ -17,14 +20,14 @@
     # So here are some individual words.
 
     # 'library' in Azerbaijani Cyrillic
-    eq_(preprocess_text('китабхана', 'az'), 'kitabxana')
-    eq_(preprocess_text('КИТАБХАНА', 'az'), 'kitabxana')
-    eq_(preprocess_text('KİTABXANA', 'az'), 'kitabxana')
+    assert preprocess_text('китабхана', 'az') == 'kitabxana'
+    assert preprocess_text('КИТАБХАНА', 'az') == 'kitabxana'
+    assert preprocess_text('KİTABXANA', 'az') == 'kitabxana'
 
     # 'scream' in Azerbaijani Cyrillic
-    eq_(preprocess_text('бағырты', 'az'), 'bağırtı')
-    eq_(preprocess_text('БАҒЫРТЫ', 'az'), 'bağırtı')
-    eq_(preprocess_text('BAĞIRTI', 'az'), 'bağırtı')
+    assert preprocess_text('бағырты', 'az') == 'bağırtı'
+    assert preprocess_text('БАҒЫРТЫ', 'az') == 'bağırtı'
+    assert preprocess_text('BAĞIRTI', 'az') == 'bağırtı'
 
 
 def test_actually_russian():
@@ -35,15 +38,13 @@ def test_actually_russian():
     # We make sure to handle this case so we don't end up with a mixed-script
     # word like "pacanы".
 
-    eq_(tokenize("сто из ста, пацаны!", 'sr'),
-        ['sto', 'iz', 'sta', 'pacany'])
-
-    eq_(tokenize("культуры", 'sr'), ["kul'tury"])
+    assert tokenize("сто из ста, пацаны!", 'sr') == ['sto', 'iz', 'sta', 'pacany']
+    assert tokenize("культуры", 'sr') == ["kul'tury"]
 
 
 def test_alternate_codes():
     # Try language codes for Serbo-Croatian that have been split, and now
     # are canonically mapped to Serbian
-    eq_(tokenize("культуры", 'sh'), ["kul'tury"])
-    eq_(tokenize("культуры", 'hbs'), ["kul'tury"])
+    assert tokenize("культуры", 'sh') == ["kul'tury"]
+    assert tokenize("культуры", 'hbs') == ["kul'tury"]
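Note on the conversion pattern used throughout this diff: `eq_(a, b)` maps directly to `assert a == b`, while `assert_almost_equal(a, b)` is replaced by `assert a == pytest.approx(b)`. The two float checks are not identical: unittest-style `assert_almost_equal` rounds the difference to 7 decimal places by default, while `pytest.approx` uses a relative tolerance (1e-6 by default, plus a small absolute floor). The sketch below is illustrative only and is not part of the diff; it assumes those documented defaults and uses made-up numbers.

```python
import pytest


def test_tolerance_equivalence_sketch():
    # nose/unittest style: the difference is rounded to 7 decimal places.
    assert round(2.0 / 3.0 - 0.6666667, 7) == 0

    # pytest style: approx compares with a relative tolerance of 1e-6
    # (and an absolute floor of 1e-12), so the tolerance scales with magnitude.
    assert 2.0 / 3.0 == pytest.approx(0.6666667)

    # For tiny values such as word frequencies, an explicit tolerance documents
    # the intent; rel=1e-6 here is an illustrative choice, not a project setting.
    assert 1.5e-9 == pytest.approx(1.5000001e-9, rel=1e-6)
```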