mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
port remaining tests to pytest
This commit is contained in:
parent
863d5be522
commit
96a01b9685
2
pytest.ini
Normal file
2
pytest.ini
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
[pytest]
|
||||||
|
addopts = --doctest-modules
|
@ -1,40 +1,32 @@
|
|||||||
from nose.tools import eq_, assert_almost_equal
|
|
||||||
from wordfreq import tokenize, word_frequency
|
from wordfreq import tokenize, word_frequency
|
||||||
|
|
||||||
|
|
||||||
def test_apostrophes():
|
def test_apostrophes():
|
||||||
# Test that we handle apostrophes in French reasonably.
|
# Test that we handle apostrophes in French reasonably.
|
||||||
eq_(tokenize("qu'un", 'fr'), ['qu', 'un'])
|
assert tokenize("qu'un", 'fr') == ['qu', 'un']
|
||||||
eq_(tokenize("qu'un", 'fr', include_punctuation=True),
|
assert tokenize("qu'un", 'fr', include_punctuation=True) == ["qu'", "un"]
|
||||||
["qu'", "un"])
|
assert tokenize("langues d'oïl", 'fr') == ['langues', "d", 'oïl']
|
||||||
eq_(tokenize("langues d'oïl", 'fr'),
|
assert tokenize("langues d'oïl", 'fr', include_punctuation=True) == ['langues', "d'", 'oïl']
|
||||||
['langues', "d", 'oïl'])
|
assert tokenize("l'heure", 'fr') == ['l', 'heure']
|
||||||
eq_(tokenize("langues d'oïl", 'fr', include_punctuation=True),
|
assert tokenize("l'heure", 'fr', include_punctuation=True) == ["l'", 'heure']
|
||||||
['langues', "d'", 'oïl'])
|
assert tokenize("L'Hôpital", 'fr', include_punctuation=True) == ["l'", 'hôpital']
|
||||||
eq_(tokenize("l'heure", 'fr'),
|
assert tokenize("aujourd'hui", 'fr') == ["aujourd'hui"]
|
||||||
['l', 'heure'])
|
assert tokenize("This isn't French", 'en') == ['this', "isn't", 'french']
|
||||||
eq_(tokenize("l'heure", 'fr', include_punctuation=True),
|
|
||||||
["l'", 'heure'])
|
|
||||||
eq_(tokenize("L'Hôpital", 'fr', include_punctuation=True),
|
|
||||||
["l'", 'hôpital'])
|
|
||||||
eq_(tokenize("aujourd'hui", 'fr'), ["aujourd'hui"])
|
|
||||||
eq_(tokenize("This isn't French", 'en'),
|
|
||||||
['this', "isn't", 'french'])
|
|
||||||
|
|
||||||
|
|
||||||
def test_catastrophes():
|
def test_catastrophes():
|
||||||
# More apostrophes, but this time they're in Catalan, and there's other
|
# More apostrophes, but this time they're in Catalan, and there's other
|
||||||
# mid-word punctuation going on too.
|
# mid-word punctuation going on too.
|
||||||
eq_(tokenize("M'acabo d'instal·lar.", 'ca'),
|
assert tokenize("M'acabo d'instal·lar.", 'ca') == ['m', 'acabo', 'd', 'instal·lar']
|
||||||
['m', 'acabo', 'd', 'instal·lar'])
|
assert (
|
||||||
eq_(tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True),
|
tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True) ==
|
||||||
["m'", 'acabo', "d'", 'instal·lar', '.'])
|
["m'", 'acabo', "d'", 'instal·lar', '.']
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_alternate_codes():
|
def test_alternate_codes():
|
||||||
# Try over-long language codes for French and Catalan
|
# Try over-long language codes for French and Catalan
|
||||||
eq_(tokenize("qu'un", 'fra'), ['qu', 'un'])
|
assert tokenize("qu'un", 'fra') == ['qu', 'un']
|
||||||
eq_(tokenize("qu'un", 'fre'), ['qu', 'un'])
|
assert tokenize("qu'un", 'fre') == ['qu', 'un']
|
||||||
eq_(tokenize("M'acabo d'instal·lar.", 'cat'),
|
assert tokenize("M'acabo d'instal·lar.", 'cat') == ['m', 'acabo', 'd', 'instal·lar']
|
||||||
['m', 'acabo', 'd', 'instal·lar'])
|
|
||||||
|
|
||||||
|
@ -1,10 +1,9 @@
|
|||||||
from nose.tools import eq_, assert_almost_equal
|
|
||||||
from wordfreq import tokenize, simple_tokenize, word_frequency
|
from wordfreq import tokenize, simple_tokenize, word_frequency
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_tokens():
|
def test_tokens():
|
||||||
eq_(tokenize('おはようございます', 'ja'),
|
assert tokenize('おはようございます', 'ja') == ['おはよう', 'ござい', 'ます']
|
||||||
['おはよう', 'ござい', 'ます'])
|
|
||||||
|
|
||||||
|
|
||||||
def test_simple_tokenize():
|
def test_simple_tokenize():
|
||||||
@ -19,31 +18,29 @@ def test_simple_tokenize():
|
|||||||
# We used to try to infer word boundaries between hiragana and katakana,
|
# We used to try to infer word boundaries between hiragana and katakana,
|
||||||
# but this leads to edge cases that are unsolvable without a dictionary.
|
# but this leads to edge cases that are unsolvable without a dictionary.
|
||||||
ja_text = 'ひらがなカタカナromaji'
|
ja_text = 'ひらがなカタカナromaji'
|
||||||
eq_(
|
assert simple_tokenize(ja_text) == ['ひらがなカタカナ', 'romaji']
|
||||||
simple_tokenize(ja_text),
|
|
||||||
['ひらがなカタカナ', 'romaji']
|
|
||||||
)
|
|
||||||
|
|
||||||
# An example that would be multiple tokens if tokenized as 'ja' via MeCab,
|
# An example that would be multiple tokens if tokenized as 'ja' via MeCab,
|
||||||
# but sticks together in simple_tokenize
|
# but sticks together in simple_tokenize
|
||||||
eq_(simple_tokenize('おはようございます'), ['おはようございます'])
|
assert simple_tokenize('おはようございます') == ['おはようございます']
|
||||||
|
|
||||||
# Names that use the weird possessive marker ヶ, which is technically a
|
# Names that use the weird possessive marker ヶ, which is technically a
|
||||||
# katakana even though it's being used like a kanji, stay together as one
|
# katakana even though it's being used like a kanji, stay together as one
|
||||||
# token
|
# token
|
||||||
eq_(simple_tokenize("犬ヶ島"), ["犬ヶ島"])
|
assert simple_tokenize("犬ヶ島") == ["犬ヶ島"]
|
||||||
|
|
||||||
# The word in ConceptNet that made me notice that simple_tokenize used
|
# The word in ConceptNet that made me notice that simple_tokenize used
|
||||||
# to have a problem with the character 々
|
# to have a problem with the character 々
|
||||||
eq_(simple_tokenize("晴々しい"), ["晴々しい"])
|
assert simple_tokenize("晴々しい") == ["晴々しい"]
|
||||||
|
|
||||||
# Explicit word separators are still token boundaries, such as the dot
|
# Explicit word separators are still token boundaries, such as the dot
|
||||||
# between "toner" and "cartridge" in "toner cartridge"
|
# between "toner" and "cartridge" in "toner cartridge"
|
||||||
eq_(simple_tokenize("トナー・カートリッジ"), ["トナー", "カートリッジ"])
|
assert simple_tokenize("トナー・カートリッジ") == ["トナー", "カートリッジ"]
|
||||||
|
|
||||||
# This word has multiple weird characters that aren't quite kanji in it,
|
# This word has multiple weird characters that aren't quite kanji in it,
|
||||||
# and is in the dictionary
|
# and is in the dictionary
|
||||||
eq_(simple_tokenize("見ヶ〆料"), ["見ヶ〆料"])
|
assert simple_tokenize("見ヶ〆料") == ["見ヶ〆料"]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -52,12 +49,11 @@ def test_combination():
|
|||||||
gozai_freq = word_frequency('ござい', 'ja')
|
gozai_freq = word_frequency('ござい', 'ja')
|
||||||
masu_freq = word_frequency('ます', 'ja')
|
masu_freq = word_frequency('ます', 'ja')
|
||||||
|
|
||||||
assert_almost_equal(
|
assert word_frequency('おはようおはよう', 'ja') == pytest.approx(ohayou_freq / 2)
|
||||||
word_frequency('おはようおはよう', 'ja'),
|
|
||||||
ohayou_freq / 2
|
assert (
|
||||||
)
|
1.0 / word_frequency('おはようございます', 'ja') ==
|
||||||
assert_almost_equal(
|
pytest.approx(1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq)
|
||||||
1.0 / word_frequency('おはようございます', 'ja'),
|
|
||||||
1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,22 +1,18 @@
|
|||||||
from nose.tools import eq_, assert_almost_equal
|
|
||||||
from wordfreq import tokenize, word_frequency
|
from wordfreq import tokenize, word_frequency
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
def test_tokens():
|
def test_tokens():
|
||||||
eq_(tokenize('감사합니다', 'ko'),
|
assert tokenize('감사합니다', 'ko') == ['감사', '합니다']
|
||||||
['감사', '합니다'])
|
|
||||||
|
|
||||||
|
|
||||||
def test_combination():
|
def test_combination():
|
||||||
gamsa_freq = word_frequency('감사', 'ko')
|
gamsa_freq = word_frequency('감사', 'ko')
|
||||||
habnida_freq = word_frequency('합니다', 'ko')
|
habnida_freq = word_frequency('합니다', 'ko')
|
||||||
|
|
||||||
assert_almost_equal(
|
assert word_frequency('감사감사', 'ko') == pytest.approx(gamsa_freq / 2)
|
||||||
word_frequency('감사감사', 'ko'),
|
assert (
|
||||||
gamsa_freq / 2
|
1.0 / word_frequency('감사합니다', 'ko') ==
|
||||||
)
|
pytest.approx(1.0 / gamsa_freq + 1.0 / habnida_freq)
|
||||||
assert_almost_equal(
|
|
||||||
1.0 / word_frequency('감사합니다', 'ko'),
|
|
||||||
1.0 / gamsa_freq + 1.0 / habnida_freq
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -1,15 +1,18 @@
|
|||||||
from nose.tools import eq_
|
|
||||||
from wordfreq import tokenize
|
from wordfreq import tokenize
|
||||||
from wordfreq.preprocess import preprocess_text
|
from wordfreq.preprocess import preprocess_text
|
||||||
|
|
||||||
|
|
||||||
def test_transliteration():
|
def test_transliteration():
|
||||||
# "Well, there's a lot of things you do not understand."
|
# "Well, there's a lot of things you do not understand."
|
||||||
# (from somewhere in OpenSubtitles)
|
# (from somewhere in OpenSubtitles)
|
||||||
eq_(tokenize("Па, има ту много ствари које не схваташ.", 'sr'),
|
assert (
|
||||||
['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
|
tokenize("Па, има ту много ствари које не схваташ.", 'sr') ==
|
||||||
eq_(tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr'),
|
['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
|
||||||
['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
|
)
|
||||||
|
assert (
|
||||||
|
tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr') ==
|
||||||
|
['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
|
||||||
|
)
|
||||||
|
|
||||||
# I don't have examples of complete sentences in Azerbaijani that are
|
# I don't have examples of complete sentences in Azerbaijani that are
|
||||||
# naturally in Cyrillic, because it turns out everyone writes Azerbaijani
|
# naturally in Cyrillic, because it turns out everyone writes Azerbaijani
|
||||||
@ -17,14 +20,14 @@ def test_transliteration():
|
|||||||
# So here are some individual words.
|
# So here are some individual words.
|
||||||
|
|
||||||
# 'library' in Azerbaijani Cyrillic
|
# 'library' in Azerbaijani Cyrillic
|
||||||
eq_(preprocess_text('китабхана', 'az'), 'kitabxana')
|
assert preprocess_text('китабхана', 'az') == 'kitabxana'
|
||||||
eq_(preprocess_text('КИТАБХАНА', 'az'), 'kitabxana')
|
assert preprocess_text('КИТАБХАНА', 'az') == 'kitabxana'
|
||||||
eq_(preprocess_text('KİTABXANA', 'az'), 'kitabxana')
|
assert preprocess_text('KİTABXANA', 'az') == 'kitabxana'
|
||||||
|
|
||||||
# 'scream' in Azerbaijani Cyrillic
|
# 'scream' in Azerbaijani Cyrillic
|
||||||
eq_(preprocess_text('бағырты', 'az'), 'bağırtı')
|
assert preprocess_text('бағырты', 'az') == 'bağırtı'
|
||||||
eq_(preprocess_text('БАҒЫРТЫ', 'az'), 'bağırtı')
|
assert preprocess_text('БАҒЫРТЫ', 'az') == 'bağırtı'
|
||||||
eq_(preprocess_text('BAĞIRTI', 'az'), 'bağırtı')
|
assert preprocess_text('BAĞIRTI', 'az') == 'bağırtı'
|
||||||
|
|
||||||
|
|
||||||
def test_actually_russian():
|
def test_actually_russian():
|
||||||
@ -35,15 +38,13 @@ def test_actually_russian():
|
|||||||
# We make sure to handle this case so we don't end up with a mixed-script
|
# We make sure to handle this case so we don't end up with a mixed-script
|
||||||
# word like "pacanы".
|
# word like "pacanы".
|
||||||
|
|
||||||
eq_(tokenize("сто из ста, пацаны!", 'sr'),
|
assert tokenize("сто из ста, пацаны!", 'sr') == ['sto', 'iz', 'sta', 'pacany']
|
||||||
['sto', 'iz', 'sta', 'pacany'])
|
assert tokenize("культуры", 'sr') == ["kul'tury"]
|
||||||
|
|
||||||
eq_(tokenize("культуры", 'sr'), ["kul'tury"])
|
|
||||||
|
|
||||||
|
|
||||||
def test_alternate_codes():
|
def test_alternate_codes():
|
||||||
# Try language codes for Serbo-Croatian that have been split, and now
|
# Try language codes for Serbo-Croatian that have been split, and now
|
||||||
# are canonically mapped to Serbian
|
# are canonically mapped to Serbian
|
||||||
eq_(tokenize("культуры", 'sh'), ["kul'tury"])
|
assert tokenize("культуры", 'sh') == ["kul'tury"]
|
||||||
eq_(tokenize("культуры", 'hbs'), ["kul'tury"])
|
assert tokenize("культуры", 'hbs') == ["kul'tury"]
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user