Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 09:21:37 +00:00)
Commit a95b360563
(file header not shown in this extract)
@@ -14,7 +14,7 @@ those cases we want to detect only the most obvious token boundaries.
 
 In this situation, we no longer try to detect script changes, such as between
 kanji and katakana, as token boundaries. This particularly allows us to keep
-together Japanese words where ヶ appears betwen kanji, as well as words that
+together Japanese words where ヶ appears between kanji, as well as words that
 use the iteration mark 々.
 
 This change does not affect any word frequencies. (The Japanese word list uses
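For context on the paragraph above: the behavior it describes is exercised by the Japanese tests later in this diff. A minimal sketch of the same checks, assuming wordfreq 2.1 is installed:

from wordfreq import simple_tokenize

# ヶ acts like a kanji here even though it is technically katakana; the word stays together.
assert simple_tokenize("犬ヶ島") == ["犬ヶ島"]

# The iteration mark 々 no longer introduces a token boundary either.
assert simple_tokenize("晴々しい") == ["晴々しい"]

# Explicit separators such as the middle dot are still boundaries.
assert simple_tokenize("トナー・カートリッジ") == ["トナー", "カートリッジ"]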
README.md (15 lines changed)
@@ -174,13 +174,13 @@ least 3 different sources of word frequencies:
 Bosnian bs [1] 3 - │ Yes Yes - - - Yes - -
 Bulgarian bg 3 - │ Yes Yes - - - Yes - -
 Catalan ca 4 - │ Yes Yes Yes - - Yes - -
-Chinese zh [3] 6 Yes │ Yes - Yes Yes Yes Yes - Jieba
+Chinese zh [3] 7 Yes │ Yes Yes Yes Yes Yes Yes - Jieba
 Croatian hr [1] 3 │ Yes Yes - - - Yes - -
-Czech cs 3 - │ Yes Yes - - - Yes - -
+Czech cs 5 Yes │ Yes Yes Yes - Yes Yes - -
 Danish da 3 - │ Yes Yes - - - Yes - -
-Dutch nl 4 Yes │ Yes Yes Yes - - Yes - -
+Dutch nl 5 Yes │ Yes Yes Yes - Yes Yes - -
 English en 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
-Finnish fi 5 Yes │ Yes Yes Yes - - Yes Yes -
+Finnish fi 6 Yes │ Yes Yes Yes - Yes Yes Yes -
 French fr 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
 German de 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
 Greek el 3 - │ Yes Yes - - Yes - - -
@@ -191,13 +191,14 @@ least 3 different sources of word frequencies:
 Italian it 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
 Japanese ja 5 Yes │ Yes Yes - - Yes Yes Yes -
 Korean ko 4 - │ Yes Yes - - - Yes Yes -
+Latvian lv 4 - │ Yes Yes - - Yes Yes - -
 Macedonian mk 3 - │ Yes Yes Yes - - - - -
 Malay ms 3 - │ Yes Yes - - - Yes - -
 Norwegian nb [2] 4 - │ Yes Yes - - - Yes Yes -
 Persian fa 3 - │ Yes Yes - - - Yes - -
-Polish pl 5 Yes │ Yes Yes Yes - - Yes Yes -
+Polish pl 6 Yes │ Yes Yes Yes - Yes Yes Yes -
 Portuguese pt 5 Yes │ Yes Yes Yes - Yes Yes - -
-Romanian ro 3 - │ Yes Yes - - - Yes - -
+Romanian ro 4 - │ Yes Yes - - Yes Yes - -
 Russian ru 6 Yes │ Yes Yes Yes Yes Yes Yes - -
 Serbian sr [1] 3 - │ Yes Yes - - - Yes - -
 Spanish es 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
@@ -219,7 +220,7 @@ Chinese, with primarily Mandarin Chinese vocabulary. See "Multi-script
 languages" below.
 
 Some languages provide 'large' wordlists, including words with a Zipf frequency
-between 1.0 and 3.0. These are available in 13 languages that are covered by
+between 1.0 and 3.0. These are available in 14 languages that are covered by
 enough data sources.
 
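A usage note, not part of this commit: the 'large' wordlists described above can be selected explicitly. A minimal sketch, assuming the wordfreq 2.1 API in which available_languages() and word_frequency() accept a wordlist argument (as exercised by the tests below) and zipf_frequency() reports the Zipf scale the README mentions:

from wordfreq import available_languages, word_frequency, zipf_frequency

# The README now says 14 languages have a 'large' list; the test below only
# checks that there are at least 14.
print(len(available_languages('large')))

# 'infrequency' is a rare word that comes from the large English list
# (see test_freq_examples in tests/test_general.py below).
print(word_frequency('infrequency', 'en', wordlist='large'))

# Zipf scale: log10 of frequency per billion words; 'large' lists reach down to Zipf 1.0.
print(zipf_frequency('infrequency', 'en'))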
pytest.ini (new file, 2 lines)
@@ -0,0 +1,2 @@
+[pytest]
+addopts = --doctest-modules

(separate configuration file; header not shown in this extract)
@@ -1,5 +1,2 @@
-[nosetests]
-verbosity=2
-with-doctest=1
-with-coverage=0
-cover-package=wordfreq
+[aliases]
+test=pytest
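The new pytest configuration turns on --doctest-modules, which makes pytest import each module and run the examples embedded in its docstrings as tests, alongside the test files; the [aliases] entry is the conventional way to point "python setup.py test" at pytest. A sketch of the kind of docstring this option picks up (the function here is hypothetical, not part of wordfreq):

def zipf_to_freq(zipf):
    """
    Convert a Zipf-scale value to a plain word frequency
    (Zipf is the base-10 log of frequency per billion words).

    >>> zipf_to_freq(9.0)
    1.0
    """
    return 10.0 ** (zipf - 9.0)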
setup.py (4 lines changed)
@@ -35,7 +35,7 @@ if sys.version_info < (3, 4):
 
 setup(
     name="wordfreq",
-    version='2.0.1',
+    version='2.1.0',
     maintainer='Luminoso Technologies, Inc.',
     maintainer_email='info@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
@@ -60,5 +60,5 @@ setup(
         'mecab': 'mecab-python3',
         'jieba': 'jieba'
     },
-    tests_require=['mecab-python3', 'jieba'],
+    tests_require=['pytest', 'mecab-python3', 'jieba'],
 )
tests/test.py (deleted file, 235 lines)
@@ -1,235 +0,0 @@
-from wordfreq import (
-    word_frequency, available_languages, cB_to_freq,
-    top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize
-)
-from nose.tools import (
-    eq_, assert_almost_equal, assert_greater, raises, assert_not_equal
-)
-
-
-def test_freq_examples():
-    # Stopwords are most common in the correct language
-    assert_greater(word_frequency('the', 'en'),
-                   word_frequency('de', 'en'))
-
-    assert_greater(word_frequency('de', 'es'),
-                   word_frequency('the', 'es'))
-
-    # We get word frequencies from the 'large' list when available
-    assert_greater(word_frequency('infrequency', 'en'), 0.)
-
-
-def test_languages():
-    # Make sure we get all the languages when looking for the default
-    # 'best' wordlist
-    avail = available_languages()
-    assert_greater(len(avail), 32)
-
-    # 'small' covers the same languages, but with some different lists
-    avail_small = available_languages('small')
-    eq_(len(avail_small), len(avail))
-    assert_not_equal(avail_small, avail)
-
-    # 'combined' is the same as 'small'
-    avail_old_name = available_languages('combined')
-    eq_(avail_old_name, avail_small)
-
-    # 'large' covers fewer languages
-    avail_large = available_languages('large')
-    assert_greater(len(avail_large), 12)
-    assert_greater(len(avail), len(avail_large))
-
-    # Look up the digit '2' in the main word list for each language
-    for lang in avail:
-        assert_greater(word_frequency('2', lang), 0, lang)
-
-        # Make up a weirdly verbose language code and make sure
-        # we still get it
-        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
-        assert_greater(word_frequency('2', new_lang_code), 0, new_lang_code)
-
-
-def test_minimums():
-    eq_(word_frequency('esquivalience', 'en'), 0)
-    eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6)
-    eq_(word_frequency('the', 'en', minimum=1), 1)
-
-
-def test_most_common_words():
-    # If something causes the most common words in well-supported languages to
-    # change, we should know.
-
-    def get_most_common(lang):
-        """
-        Return the single most common word in the language.
-        """
-        return top_n_list(lang, 1)[0]
-
-    eq_(get_most_common('ar'), 'في')
-    eq_(get_most_common('de'), 'die')
-    eq_(get_most_common('en'), 'the')
-    eq_(get_most_common('es'), 'de')
-    eq_(get_most_common('fr'), 'de')
-    eq_(get_most_common('it'), 'di')
-    eq_(get_most_common('ja'), 'の')
-    eq_(get_most_common('nl'), 'de')
-    eq_(get_most_common('pl'), 'w')
-    eq_(get_most_common('pt'), 'de')
-    eq_(get_most_common('ru'), 'в')
-    eq_(get_most_common('tr'), 'bir')
-    eq_(get_most_common('zh'), '的')
-
-
-def test_language_matching():
-    freq = word_frequency('的', 'zh')
-    eq_(word_frequency('的', 'zh-TW'), freq)
-    eq_(word_frequency('的', 'zh-CN'), freq)
-    eq_(word_frequency('的', 'zh-Hant'), freq)
-    eq_(word_frequency('的', 'zh-Hans'), freq)
-    eq_(word_frequency('的', 'yue-HK'), freq)
-    eq_(word_frequency('的', 'cmn'), freq)
-
-
-def test_cB_conversion():
-    eq_(cB_to_freq(0), 1.)
-    assert_almost_equal(cB_to_freq(-100), 0.1)
-    assert_almost_equal(cB_to_freq(-600), 1e-6)
-
-
-@raises(ValueError)
-def test_failed_cB_conversion():
-    cB_to_freq(1)
-
-
-def test_tokenization():
-    # We preserve apostrophes within words, so "can't" is a single word in the
-    # data
-    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
-        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
-
-    eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
-        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
-
-    # Certain punctuation does not inherently split a word.
-    eq_(tokenize("Anything is possible at zombo.com", 'en'),
-        ['anything', 'is', 'possible', 'at', 'zombo.com'])
-
-    # Splits occur after symbols, and at splitting punctuation such as hyphens.
-    eq_(tokenize('😂test', 'en'), ['😂', 'test'])
-
-    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])
-
-    eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
-        ['this', 'text', 'has', '...', 'punctuation', ':)'])
-
-    # Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
-    # and 'David Bowie' stay together, because our Unicode segmentation algorithm
-    # is up to date
-    eq_(tokenize('emoji test 🧕🏽', 'en'), ['emoji', 'test', '🧕🏽'])
-
-    eq_(tokenize("👨🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en'),
-        ['👨🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
-         'nothing', 'i', 'can', 'do', '🌎', '🚀'])
-
-    # Water wave, surfer, flag of California (indicates ridiculously complete support
-    # for Unicode 10 and Emoji 5.0)
-    eq_(tokenize("Surf's up 🌊🏄🏴'",'en'),
-        ["surf's", "up", "🌊", "🏄", "🏴"])
-
-
-def test_casefolding():
-    eq_(tokenize('WEISS', 'de'), ['weiss'])
-    eq_(tokenize('weiß', 'de'), ['weiss'])
-    eq_(tokenize('İstanbul', 'tr'), ['istanbul'])
-    eq_(tokenize('SIKISINCA', 'tr'), ['sıkısınca'])
-
-
-def test_number_smashing():
-    eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'),
-        ['715', 'crσσks', 'by', 'bon', 'iver'])
-    eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'),
-        ['000', 'crσσks', 'by', 'bon', 'iver'])
-    eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True),
-        ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver'])
-    eq_(lossy_tokenize('1', 'en'), ['1'])
-    eq_(lossy_tokenize('3.14', 'en'), ['0.00'])
-    eq_(lossy_tokenize('24601', 'en'), ['00000'])
-    eq_(word_frequency('24601', 'en'), word_frequency('90210', 'en'))
-
-
-def test_phrase_freq():
-    ff = word_frequency("flip-flop", 'en')
-    assert_greater(ff, 0)
-    assert_almost_equal(
-        1.0 / ff,
-        1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
-    )
-
-
-def test_not_really_random():
-    # If your xkcd-style password comes out like this, maybe you shouldn't
-    # use it
-    eq_(random_words(nwords=4, lang='en', bits_per_word=0),
-        'the the the the')
-
-    # This not only tests random_ascii_words, it makes sure we didn't end
-    # up with 'eos' as a very common Japanese word
-    eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),
-        '1 1 1 1')
-
-
-@raises(ValueError)
-def test_not_enough_ascii():
-    random_ascii_words(lang='zh', bits_per_word=14)
-
-
-def test_arabic():
-    # Remove tatweels
-    eq_(
-        tokenize('متــــــــعب', 'ar'),
-        ['متعب']
-    )
-
-    # Remove combining marks
-    eq_(
-        tokenize('حَرَكَات', 'ar'),
-        ['حركات']
-    )
-
-    eq_(
-        tokenize('\ufefb', 'ar'),  # An Arabic ligature...
-        ['\u0644\u0627']  # ...that is affected by NFKC normalization
-    )
-
-
-def test_ideographic_fallback():
-    # Try tokenizing Chinese text as English -- it should remain stuck together.
-    #
-    # More complex examples like this, involving the multiple scripts of Japanese,
-    # are in test_japanese.py.
-    eq_(tokenize('中国文字', 'en'), ['中国文字'])
-
-
-def test_other_languages():
-    # Test that we leave Thai letters stuck together. If we had better Thai support,
-    # we would actually split this into a three-word phrase.
-    eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
-    eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
-        ['การเล่นดนตรี', 'means', 'playing', 'music'])
-
-    # Test Khmer, a script similar to Thai
-    eq_(tokenize('សូមស្វាគមន៍', 'km'), ['សូមស្វាគមន៍'])
-
-    # Test Hindi -- tokens split where there are spaces, and not where there aren't
-    eq_(tokenize('हिन्दी विक्षनरी', 'hi'), ['हिन्दी', 'विक्षनरी'])
-
-    # Remove vowel points in Hebrew
-    eq_(tokenize('דֻּגְמָה', 'he'), ['דגמה'])
-
-    # Deal with commas, cedillas, and I's in Turkish
-    eq_(tokenize('kișinin', 'tr'), ['kişinin'])
-    eq_(tokenize('KİȘİNİN', 'tr'), ['kişinin'])
-
-    # Deal with cedillas that should be commas-below in Romanian
-    eq_(tokenize('acelaşi', 'ro'), ['același'])
-    eq_(tokenize('ACELAŞI', 'ro'), ['același'])
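The file above is the old nose-based test suite; the rest of this diff rewrites the same checks for pytest (the new tests/test_general.py appears below). The translation is mechanical; a sketch of the correspondence used throughout, with illustrative values that are not taken from wordfreq:

import pytest

# nose: eq_(a, b)                  -> pytest: assert a == b
assert 2 + 2 == 4

# nose: assert_almost_equal(a, b)  -> pytest: assert a == pytest.approx(b)
assert 0.1 + 0.2 == pytest.approx(0.3)

# nose: @raises(ValueError)        -> pytest: with pytest.raises(ValueError):
with pytest.raises(ValueError):
    int('not a number')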
(next file; header not shown in this extract)
@@ -1,5 +1,5 @@
-from nose.tools import eq_, assert_almost_equal, assert_greater
 from wordfreq import tokenize, word_frequency
+import pytest
 
 
 def test_tokens():
@@ -17,64 +17,49 @@ def test_tokens():
     # His name breaks into five pieces, with the only piece staying together
     # being the one that means 'Bart'. The dot is not included as a token.
-    eq_(
-        tokenize(hobart, 'zh'),
-        ['加', '勒', '特', '霍', '巴特']
-    )
+    assert tokenize(hobart, 'zh') == ['加', '勒', '特', '霍', '巴特']
 
-    eq_(
-        tokenize(fact_simplified, 'zh'),
-        [
-            # he / is / history / in / #6 / counter for people
-            '他', '是', '历史', '上', '第六', '位',
-            # during / term of office / in / die
-            '在', '任期', '内', '去世',
-            # of / U.S. / deputy / president
-            '的', '美国', '副', '总统'
-        ]
-    )
+    assert tokenize(fact_simplified, 'zh') == [
+        # he / is / history / in / #6 / counter for people
+        '他', '是', '历史', '上', '第六', '位',
+        # during / term of office / in / die
+        '在', '任期', '内', '去世',
+        # of / U.S. / deputy / president
+        '的', '美国', '副', '总统'
+    ]
 
     # Jieba's original tokenizer knows a lot of names, it seems.
-    eq_(
-        tokenize(hobart, 'zh', external_wordlist=True),
-        ['加勒特', '霍巴特']
-    )
+    assert tokenize(hobart, 'zh', external_wordlist=True) == ['加勒特', '霍巴特']
 
     # We get almost the same tokens from the sentence using Jieba's own
     # wordlist, but it tokenizes "in history" as two words and
     # "sixth person" as one.
-    eq_(
-        tokenize(fact_simplified, 'zh', external_wordlist=True),
-        [
-            # he / is / history / in / sixth person
-            '他', '是', '历史', '上', '第六位',
-            # during / term of office / in / die
-            '在', '任期', '内', '去世',
-            # of / U.S. / deputy / president
-            '的', '美国', '副', '总统'
-        ]
-    )
+    assert tokenize(fact_simplified, 'zh', external_wordlist=True) == [
+        # he / is / history / in / sixth person
+        '他', '是', '历史', '上', '第六位',
+        # during / term of office / in / die
+        '在', '任期', '内', '去世',
+        # of / U.S. / deputy / president
+        '的', '美国', '副', '总统'
+    ]
 
     # Check that Traditional Chinese works at all
-    assert_greater(word_frequency(fact_traditional, 'zh'), 0)
+    assert word_frequency(fact_traditional, 'zh') > 0
 
     # You get the same token lengths if you look it up in Traditional Chinese,
     # but the words are different
     simp_tokens = tokenize(fact_simplified, 'zh', include_punctuation=True)
     trad_tokens = tokenize(fact_traditional, 'zh', include_punctuation=True)
-    eq_(''.join(simp_tokens), fact_simplified)
-    eq_(''.join(trad_tokens), fact_traditional)
+    assert ''.join(simp_tokens) == fact_simplified
+    assert ''.join(trad_tokens) == fact_traditional
     simp_lengths = [len(token) for token in simp_tokens]
    trad_lengths = [len(token) for token in trad_tokens]
-    eq_(simp_lengths, trad_lengths)
+    assert simp_lengths == trad_lengths
 
 
 def test_combination():
     xiexie_freq = word_frequency('谢谢', 'zh')  # "Thanks"
-    assert_almost_equal(
-        word_frequency('谢谢谢谢', 'zh'),
-        xiexie_freq / 20
-    )
+    assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20)
 
 
 def test_alternate_codes():
@@ -83,12 +68,12 @@ def test_alternate_codes():
     tokens = ['谢谢', '谢谢']
 
     # Code with a region attached
-    eq_(tokenize('谢谢谢谢', 'zh-CN'), tokens)
+    assert tokenize('谢谢谢谢', 'zh-CN') == tokens
 
     # Over-long codes for Chinese
-    eq_(tokenize('谢谢谢谢', 'chi'), tokens)
-    eq_(tokenize('谢谢谢谢', 'zho'), tokens)
+    assert tokenize('谢谢谢谢', 'chi') == tokens
+    assert tokenize('谢谢谢谢', 'zho') == tokens
 
     # Separate codes for Mandarin and Cantonese
-    eq_(tokenize('谢谢谢谢', 'cmn'), tokens)
-    eq_(tokenize('谢谢谢谢', 'yue'), tokens)
+    assert tokenize('谢谢谢谢', 'cmn') == tokens
+    assert tokenize('谢谢谢谢', 'yue') == tokens
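A usage note on the external_wordlist flag exercised above (a sketch, assuming the 'jieba' extra from setup.py is installed, since wordfreq tokenizes Chinese through Jieba):

from wordfreq import tokenize

# By default, Jieba is run with wordfreq's own Chinese frequency dictionary:
print(tokenize('谢谢谢谢', 'zh'))                          # ['谢谢', '谢谢']

# external_wordlist=True asks Jieba to use its bundled dictionary instead,
# which the tests above show can segment names and phrases differently.
print(tokenize('谢谢谢谢', 'zh', external_wordlist=True))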
(next file; header not shown in this extract)
@@ -1,40 +1,32 @@
-from nose.tools import eq_, assert_almost_equal
 from wordfreq import tokenize, word_frequency
 
 
 def test_apostrophes():
     # Test that we handle apostrophes in French reasonably.
-    eq_(tokenize("qu'un", 'fr'), ['qu', 'un'])
-    eq_(tokenize("qu'un", 'fr', include_punctuation=True),
-        ["qu'", "un"])
-    eq_(tokenize("langues d'oïl", 'fr'),
-        ['langues', "d", 'oïl'])
-    eq_(tokenize("langues d'oïl", 'fr', include_punctuation=True),
-        ['langues', "d'", 'oïl'])
-    eq_(tokenize("l'heure", 'fr'),
-        ['l', 'heure'])
-    eq_(tokenize("l'heure", 'fr', include_punctuation=True),
-        ["l'", 'heure'])
-    eq_(tokenize("L'Hôpital", 'fr', include_punctuation=True),
-        ["l'", 'hôpital'])
-    eq_(tokenize("aujourd'hui", 'fr'), ["aujourd'hui"])
-    eq_(tokenize("This isn't French", 'en'),
-        ['this', "isn't", 'french'])
+    assert tokenize("qu'un", 'fr') == ['qu', 'un']
+    assert tokenize("qu'un", 'fr', include_punctuation=True) == ["qu'", "un"]
+    assert tokenize("langues d'oïl", 'fr') == ['langues', "d", 'oïl']
+    assert tokenize("langues d'oïl", 'fr', include_punctuation=True) == ['langues', "d'", 'oïl']
+    assert tokenize("l'heure", 'fr') == ['l', 'heure']
+    assert tokenize("l'heure", 'fr', include_punctuation=True) == ["l'", 'heure']
+    assert tokenize("L'Hôpital", 'fr', include_punctuation=True) == ["l'", 'hôpital']
+    assert tokenize("aujourd'hui", 'fr') == ["aujourd'hui"]
+    assert tokenize("This isn't French", 'en') == ['this', "isn't", 'french']
 
 
 def test_catastrophes():
     # More apostrophes, but this time they're in Catalan, and there's other
     # mid-word punctuation going on too.
-    eq_(tokenize("M'acabo d'instal·lar.", 'ca'),
-        ['m', 'acabo', 'd', 'instal·lar'])
-    eq_(tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True),
-        ["m'", 'acabo', "d'", 'instal·lar', '.'])
+    assert tokenize("M'acabo d'instal·lar.", 'ca') == ['m', 'acabo', 'd', 'instal·lar']
+    assert (
+        tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True) ==
+        ["m'", 'acabo', "d'", 'instal·lar', '.']
+    )
 
 
 def test_alternate_codes():
     # Try over-long language codes for French and Catalan
-    eq_(tokenize("qu'un", 'fra'), ['qu', 'un'])
-    eq_(tokenize("qu'un", 'fre'), ['qu', 'un'])
-    eq_(tokenize("M'acabo d'instal·lar.", 'cat'),
-        ['m', 'acabo', 'd', 'instal·lar'])
+    assert tokenize("qu'un", 'fra') == ['qu', 'un']
+    assert tokenize("qu'un", 'fre') == ['qu', 'un']
+    assert tokenize("M'acabo d'instal·lar.", 'cat') == ['m', 'acabo', 'd', 'instal·lar']
tests/test_general.py (new file, 223 lines)
@@ -0,0 +1,223 @@
+from wordfreq import (
+    word_frequency, available_languages, cB_to_freq,
+    top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize
+)
+import pytest
+
+
+def test_freq_examples():
+    # Stopwords are most common in the correct language
+    assert word_frequency('the', 'en') > word_frequency('de', 'en')
+    assert word_frequency('de', 'es') > word_frequency('the', 'es')
+    # We get word frequencies from the 'large' list when available
+    assert word_frequency('infrequency', 'en') > 0.
+
+
+def test_languages():
+    # Make sure we get all the languages when looking for the default
+    # 'best' wordlist
+    avail = available_languages()
+    assert len(avail) >= 34
+
+    # 'small' covers the same languages, but with some different lists
+    avail_small = available_languages('small')
+    assert len(avail_small) == len(avail)
+    assert avail_small != avail
+
+    # 'combined' is the same as 'small'
+    avail_old_name = available_languages('combined')
+    assert avail_old_name == avail_small
+
+    # 'large' covers fewer languages
+    avail_large = available_languages('large')
+    assert len(avail_large) >= 14
+    assert len(avail) > len(avail_large)
+
+    # Look up the digit '2' in the main word list for each language
+    for lang in avail:
+        assert word_frequency('2', lang) > 0
+
+        # Make up a weirdly verbose language code and make sure
+        # we still get it
+        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
+        assert word_frequency('2', new_lang_code) > 0
+
+
+def test_minimums():
+    assert word_frequency('esquivalience', 'en') == 0
+    assert word_frequency('esquivalience', 'en', minimum=1e-6) == 1e-6
+    assert word_frequency('the', 'en', minimum=1) == 1
+
+
+def test_most_common_words():
+    # If something causes the most common words in well-supported languages to
+    # change, we should know.
+
+    def get_most_common(lang):
+        """
+        Return the single most common word in the language.
+        """
+        return top_n_list(lang, 1)[0]
+
+    assert get_most_common('ar') == 'في'
+    assert get_most_common('cs') == 'a'
+    assert get_most_common('de') == 'die'
+    assert get_most_common('en') == 'the'
+    assert get_most_common('es') == 'de'
+    assert get_most_common('fr') == 'de'
+    assert get_most_common('it') == 'di'
+    assert get_most_common('ja') == 'の'
+    assert get_most_common('nl') == 'de'
+    assert get_most_common('pl') == 'w'
+    assert get_most_common('pt') == 'de'
+    assert get_most_common('ru') == 'в'
+    assert get_most_common('tr') == 'bir'
+    assert get_most_common('zh') == '的'
+
+
+def test_language_matching():
+    freq = word_frequency('的', 'zh')
+    assert word_frequency('的', 'zh-TW') == freq
+    assert word_frequency('的', 'zh-CN') == freq
+    assert word_frequency('的', 'zh-Hant') == freq
+    assert word_frequency('的', 'zh-Hans') == freq
+    assert word_frequency('的', 'yue-HK') == freq
+    assert word_frequency('的', 'cmn') == freq
+
+
+def test_cB_conversion():
+    assert cB_to_freq(0) == 1.
+    assert cB_to_freq(-100) == pytest.approx(0.1)
+    assert cB_to_freq(-600) == pytest.approx(1e-6)
+
+
+def test_failed_cB_conversion():
+    with pytest.raises(ValueError):
+        cB_to_freq(1)
+
+
+def test_tokenization():
+    # We preserve apostrophes within words, so "can't" is a single word in the
+    # data
+    assert (
+        tokenize("I don't split at apostrophes, you see.", 'en')
+        == ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']
+    )
+
+    assert (
+        tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True)
+        == ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.']
+    )
+
+    # Certain punctuation does not inherently split a word.
+    assert (
+        tokenize("Anything is possible at zombo.com", 'en')
+        == ['anything', 'is', 'possible', 'at', 'zombo.com']
+    )
+
+    # Splits occur after symbols, and at splitting punctuation such as hyphens.
+    assert tokenize('😂test', 'en') == ['😂', 'test']
+    assert tokenize("flip-flop", 'en') == ['flip', 'flop']
+    assert (
+        tokenize('this text has... punctuation :)', 'en', include_punctuation=True)
+        == ['this', 'text', 'has', '...', 'punctuation', ':)']
+    )
+
+    # Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
+    # and 'David Bowie' stay together, because our Unicode segmentation algorithm
+    # is up to date
+    assert tokenize('emoji test 🧕🏽', 'en') == ['emoji', 'test', '🧕🏽']
+    assert (
+        tokenize("👨🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en')
+        == ['👨🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
+            'nothing', 'i', 'can', 'do', '🌎', '🚀']
+    )
+
+    # Water wave, surfer, flag of California (indicates ridiculously complete support
+    # for Unicode 10 and Emoji 5.0)
+    assert tokenize("Surf's up 🌊🏄🏴'",'en') == ["surf's", "up", "🌊", "🏄", "🏴"]
+
+
+def test_casefolding():
+    assert tokenize('WEISS', 'de') == ['weiss']
+    assert tokenize('weiß', 'de') == ['weiss']
+    assert tokenize('İstanbul', 'tr') == ['istanbul']
+    assert tokenize('SIKISINCA', 'tr') == ['sıkısınca']
+
+
+def test_number_smashing():
+    assert tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['715', 'crσσks', 'by', 'bon', 'iver']
+    assert lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['000', 'crσσks', 'by', 'bon', 'iver']
+    assert (
+        lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True)
+        == ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver']
+    )
+    assert lossy_tokenize('1', 'en') == ['1']
+    assert lossy_tokenize('3.14', 'en') == ['0.00']
+    assert lossy_tokenize('24601', 'en') == ['00000']
+    assert word_frequency('24601', 'en') == word_frequency('90210', 'en')
+
+
+def test_phrase_freq():
+    ff = word_frequency("flip-flop", 'en')
+    assert ff > 0
+    phrase_freq = 1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
+    assert 1.0 / ff == pytest.approx(phrase_freq)
+
+
+def test_not_really_random():
+    # If your xkcd-style password comes out like this, maybe you shouldn't
+    # use it
+    assert random_words(nwords=4, lang='en', bits_per_word=0) == 'the the the the'
+
+    # This not only tests random_ascii_words, it makes sure we didn't end
+    # up with 'eos' as a very common Japanese word
+    assert random_ascii_words(nwords=4, lang='ja', bits_per_word=0) == '00 00 00 00'
+
+
+def test_not_enough_ascii():
+    with pytest.raises(ValueError):
+        random_ascii_words(lang='zh', bits_per_word=16)
+
+
+def test_arabic():
+    # Remove tatweels
+    assert tokenize('متــــــــعب', 'ar') == ['متعب']
+
+    # Remove combining marks
+    assert tokenize('حَرَكَات', 'ar') == ['حركات']
+
+    # An Arabic ligature that is affected by NFKC normalization
+    assert tokenize('\ufefb', 'ar') == ['\u0644\u0627']
+
+
+def test_ideographic_fallback():
+    # Try tokenizing Chinese text as English -- it should remain stuck together.
+    #
+    # More complex examples like this, involving the multiple scripts of Japanese,
+    # are in test_japanese.py.
+    assert tokenize('中国文字', 'en') == ['中国文字']
+
+
+def test_other_languages():
+    # Test that we leave Thai letters stuck together. If we had better Thai support,
+    # we would actually split this into a three-word phrase.
+    assert tokenize('การเล่นดนตรี', 'th') == ['การเล่นดนตรี']
+    assert tokenize('"การเล่นดนตรี" means "playing music"', 'en') == ['การเล่นดนตรี', 'means', 'playing', 'music']
+
+    # Test Khmer, a script similar to Thai
+    assert tokenize('សូមស្វាគមន៍', 'km') == ['សូមស្វាគមន៍']
+
+    # Test Hindi -- tokens split where there are spaces, and not where there aren't
+    assert tokenize('हिन्दी विक्षनरी', 'hi') == ['हिन्दी', 'विक्षनरी']
+
+    # Remove vowel points in Hebrew
+    assert tokenize('דֻּגְמָה', 'he') == ['דגמה']
+
+    # Deal with commas, cedillas, and I's in Turkish
+    assert tokenize('kișinin', 'tr') == ['kişinin']
+    assert tokenize('KİȘİNİN', 'tr') == ['kişinin']
+
+    # Deal with cedillas that should be commas-below in Romanian
+    assert tokenize('acelaşi', 'ro') == ['același']
+    assert tokenize('ACELAŞI', 'ro') == ['același']
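One relationship worth spelling out from test_phrase_freq above: when a query such as "flip-flop" is split into several tokens, wordfreq combines the token frequencies so that their reciprocals add (a half harmonic mean for two tokens). A small sketch of exactly the identity the test asserts:

from wordfreq import word_frequency

f_flip = word_frequency('flip', 'en')
f_flop = word_frequency('flop', 'en')
f_phrase = word_frequency('flip-flop', 'en')

# test_phrase_freq asserts (up to floating-point tolerance):
#   1 / f_phrase == 1 / f_flip + 1 / f_flop
combined = 1.0 / (1.0 / f_flip + 1.0 / f_flop)
print(f_phrase, combined)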
(next file; header not shown in this extract)
@@ -1,10 +1,9 @@
-from nose.tools import eq_, assert_almost_equal
 from wordfreq import tokenize, simple_tokenize, word_frequency
+import pytest
 
 
 def test_tokens():
-    eq_(tokenize('おはようございます', 'ja'),
-        ['おはよう', 'ござい', 'ます'])
+    assert tokenize('おはようございます', 'ja') == ['おはよう', 'ござい', 'ます']
 
 
 def test_simple_tokenize():
@@ -19,31 +18,29 @@ def test_simple_tokenize():
     # We used to try to infer word boundaries between hiragana and katakana,
     # but this leads to edge cases that are unsolvable without a dictionary.
     ja_text = 'ひらがなカタカナromaji'
-    eq_(
-        simple_tokenize(ja_text),
-        ['ひらがなカタカナ', 'romaji']
-    )
+    assert simple_tokenize(ja_text) == ['ひらがなカタカナ', 'romaji']
 
     # An example that would be multiple tokens if tokenized as 'ja' via MeCab,
     # but sticks together in simple_tokenize
-    eq_(simple_tokenize('おはようございます'), ['おはようございます'])
+    assert simple_tokenize('おはようございます') == ['おはようございます']
 
     # Names that use the weird possessive marker ヶ, which is technically a
     # katakana even though it's being used like a kanji, stay together as one
     # token
-    eq_(simple_tokenize("犬ヶ島"), ["犬ヶ島"])
+    assert simple_tokenize("犬ヶ島") == ["犬ヶ島"]
 
     # The word in ConceptNet that made me notice that simple_tokenize used
     # to have a problem with the character 々
-    eq_(simple_tokenize("晴々しい"), ["晴々しい"])
+    assert simple_tokenize("晴々しい") == ["晴々しい"]
 
     # Explicit word separators are still token boundaries, such as the dot
     # between "toner" and "cartridge" in "toner cartridge"
-    eq_(simple_tokenize("トナー・カートリッジ"), ["トナー", "カートリッジ"])
+    assert simple_tokenize("トナー・カートリッジ") == ["トナー", "カートリッジ"]
 
     # This word has multiple weird characters that aren't quite kanji in it,
     # and is in the dictionary
-    eq_(simple_tokenize("見ヶ〆料"), ["見ヶ〆料"])
+    assert simple_tokenize("見ヶ〆料") == ["見ヶ〆料"]
 
 
@@ -52,12 +49,11 @@ def test_combination():
     gozai_freq = word_frequency('ござい', 'ja')
     masu_freq = word_frequency('ます', 'ja')
 
-    assert_almost_equal(
-        word_frequency('おはようおはよう', 'ja'),
-        ohayou_freq / 2
-    )
-    assert_almost_equal(
-        1.0 / word_frequency('おはようございます', 'ja'),
-        1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
+    assert word_frequency('おはようおはよう', 'ja') == pytest.approx(ohayou_freq / 2)
+
+    assert (
+        1.0 / word_frequency('おはようございます', 'ja') ==
+        pytest.approx(1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq)
     )
(next file; header not shown in this extract)
@@ -1,22 +1,18 @@
-from nose.tools import eq_, assert_almost_equal
 from wordfreq import tokenize, word_frequency
+import pytest
 
 
 def test_tokens():
-    eq_(tokenize('감사합니다', 'ko'),
-        ['감사', '합니다'])
+    assert tokenize('감사합니다', 'ko') == ['감사', '합니다']
 
 
 def test_combination():
     gamsa_freq = word_frequency('감사', 'ko')
     habnida_freq = word_frequency('합니다', 'ko')
 
-    assert_almost_equal(
-        word_frequency('감사감사', 'ko'),
-        gamsa_freq / 2
-    )
-    assert_almost_equal(
-        1.0 / word_frequency('감사합니다', 'ko'),
-        1.0 / gamsa_freq + 1.0 / habnida_freq
+    assert word_frequency('감사감사', 'ko') == pytest.approx(gamsa_freq / 2)
+    assert (
+        1.0 / word_frequency('감사합니다', 'ko') ==
+        pytest.approx(1.0 / gamsa_freq + 1.0 / habnida_freq)
     )
(next file; header not shown in this extract)
@@ -1,15 +1,18 @@
-from nose.tools import eq_
 from wordfreq import tokenize
 from wordfreq.preprocess import preprocess_text
 
 
 def test_transliteration():
     # "Well, there's a lot of things you do not understand."
-    # (from somewhere in OpenSubtitles)
-    eq_(tokenize("Па, има ту много ствари које не схваташ.", 'sr'),
-        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
-    eq_(tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr'),
-        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
+    # (from somewhere in OpenSubtitles
+    assert (
+        tokenize("Па, има ту много ствари које не схваташ.", 'sr') ==
+        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
+    )
+    assert (
+        tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr') ==
+        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
+    )
 
     # I don't have examples of complete sentences in Azerbaijani that are
     # naturally in Cyrillic, because it turns out everyone writes Azerbaijani
@@ -17,14 +20,14 @@ def test_transliteration():
     # So here are some individual words.
 
     # 'library' in Azerbaijani Cyrillic
-    eq_(preprocess_text('китабхана', 'az'), 'kitabxana')
-    eq_(preprocess_text('КИТАБХАНА', 'az'), 'kitabxana')
-    eq_(preprocess_text('KİTABXANA', 'az'), 'kitabxana')
+    assert preprocess_text('китабхана', 'az') == 'kitabxana'
+    assert preprocess_text('КИТАБХАНА', 'az') == 'kitabxana'
+    assert preprocess_text('KİTABXANA', 'az') == 'kitabxana'
 
     # 'scream' in Azerbaijani Cyrillic
-    eq_(preprocess_text('бағырты', 'az'), 'bağırtı')
-    eq_(preprocess_text('БАҒЫРТЫ', 'az'), 'bağırtı')
-    eq_(preprocess_text('BAĞIRTI', 'az'), 'bağırtı')
+    assert preprocess_text('бағырты', 'az') == 'bağırtı'
+    assert preprocess_text('БАҒЫРТЫ', 'az') == 'bağırtı'
+    assert preprocess_text('BAĞIRTI', 'az') == 'bağırtı'
 
 
 def test_actually_russian():
@@ -35,15 +38,13 @@ def test_actually_russian():
     # We make sure to handle this case so we don't end up with a mixed-script
     # word like "pacanы".
 
-    eq_(tokenize("сто из ста, пацаны!", 'sr'),
-        ['sto', 'iz', 'sta', 'pacany'])
-
-    eq_(tokenize("культуры", 'sr'), ["kul'tury"])
+    assert tokenize("сто из ста, пацаны!", 'sr') == ['sto', 'iz', 'sta', 'pacany']
+    assert tokenize("культуры", 'sr') == ["kul'tury"]
 
 
 def test_alternate_codes():
     # Try language codes for Serbo-Croatian that have been split, and now
     # are canonically mapped to Serbian
-    eq_(tokenize("культуры", 'sh'), ["kul'tury"])
-    eq_(tokenize("культуры", 'hbs'), ["kul'tury"])
+    assert tokenize("культуры", 'sh') == ["kul'tury"]
+    assert tokenize("культуры", 'hbs') == ["kul'tury"]
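A brief usage note on the preprocessing exercised above (a sketch of the behavior these tests assert, not additional API): wordfreq.preprocess.preprocess_text transliterates and casefolds text before lookup, so Cyrillic Serbian and Azerbaijani map onto the Latin-alphabet wordlists.

from wordfreq import tokenize
from wordfreq.preprocess import preprocess_text

# Russian text tagged as Serbian is transliterated rather than left in Cyrillic,
# avoiding mixed-script tokens like "pacanы".
print(tokenize("культуры", 'sr'))          # ["kul'tury"]

# Azerbaijani Cyrillic is transliterated and casefolded the same way.
print(preprocess_text('КИТАБХАНА', 'az'))  # 'kitabxana'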
(File diff suppressed because it is too large.)
BIN  wordfreq/data/large_cs.msgpack.gz (new file; binary file not shown)
BIN  wordfreq/data/small_lv.msgpack.gz (new file; binary file not shown)
(The remaining changed binary data files are not shown.)