wordfreq/tests/test.py
2018-06-01 16:33:06 -04:00

223 lines
8.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from wordfreq import (
word_frequency, available_languages, cB_to_freq,
top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize
)
import pytest
def test_freq_examples():
    """Spot-check relative and absolute word frequencies in English and Spanish."""
    # A stopword should be more frequent in its own language than a stopword
    # that belongs to a different language.
    the_in_english = word_frequency('the', 'en')
    de_in_english = word_frequency('de', 'en')
    assert the_in_english > de_in_english

    de_in_spanish = word_frequency('de', 'es')
    the_in_spanish = word_frequency('the', 'es')
    assert de_in_spanish > the_in_spanish

    # Frequencies come from the 'large' list when it is available, so a
    # fairly rare word still has a nonzero frequency.
    infrequency = word_frequency('infrequency', 'en')
    assert infrequency > 0.
def test_languages():
    """Check the language coverage of each named wordlist and that every
    language of the default list can be looked up."""
    # With no argument, we get the languages of the default 'best' wordlist.
    best = available_languages()
    assert len(best) > 32

    # 'small' covers the same number of languages, but some of the
    # underlying lists differ.
    small = available_languages('small')
    assert len(small) == len(best)
    assert small != best

    # 'combined' must give exactly what 'small' gives.
    combined = available_languages('combined')
    assert combined == small

    # 'large' exists for fewer languages.
    large = available_languages('large')
    assert len(large) > 12
    assert len(best) > len(large)

    for lang in best:
        # The digit '2' should appear in the main word list of each language.
        assert word_frequency('2', lang) > 0

        # A made-up, weirdly verbose language code should still find the
        # same language.
        verbose_code = '{}-001-x-fake-extension'.format(lang.upper())
        assert word_frequency('2', verbose_code) > 0
def test_minimums():
    """Check how the `minimum` argument of word_frequency affects the result."""
    unknown_word = 'esquivalience'

    # Without a minimum, a word that is not in the list has frequency 0.
    assert word_frequency(unknown_word, 'en') == 0

    # With a minimum, the result is that minimum instead.
    floor = 1e-6
    assert word_frequency(unknown_word, 'en', minimum=floor) == floor

    # The minimum also wins over the real frequency of a very common word.
    assert word_frequency('the', 'en', minimum=1) == 1
def test_most_common_words():
    """Pin the single most common word of several well-supported languages.

    If something causes the most common words in these languages to change,
    we should know about it.
    """
    def get_most_common(lang):
        """
        Return the single most common word in the language.
        """
        return top_n_list(lang, 1)[0]

    assert get_most_common('ar') == 'في'
    assert get_most_common('de') == 'die'
    assert get_most_common('en') == 'the'
    assert get_most_common('es') == 'de'
    assert get_most_common('fr') == 'de'
    assert get_most_common('it') == 'di'
    # The Japanese and Chinese expectations are single CJK characters; an
    # empty string could never be the top entry of a word list.
    assert get_most_common('ja') == 'の'
    assert get_most_common('nl') == 'de'
    assert get_most_common('pl') == 'w'
    assert get_most_common('pt') == 'de'
    assert get_most_common('ru') == 'в'
    assert get_most_common('tr') == 'bir'
    assert get_most_common('zh') == '的'
def test_language_matching():
    """Regional, script, and related language codes should all give the same
    frequency as plain 'zh' for a common Chinese word."""
    word = '的'
    freq = word_frequency(word, 'zh')

    # Make sure the comparisons below are meaningful: if the word were
    # missing, every lookup would trivially agree.
    assert freq > 0

    for code in ('zh-TW', 'zh-CN', 'zh-Hant', 'zh-Hans', 'yue-HK', 'cmn'):
        assert word_frequency(word, code) == freq
def test_cB_conversion():
    """Check cB_to_freq at three known points: 0, -100 and -600."""
    # Zero maps to a frequency of exactly 1.
    assert cB_to_freq(0) == 1.

    # The other values are only compared approximately, as they are
    # floating-point results.
    for centibels, expected in ((-100, 0.1), (-600, 1e-6)):
        assert cB_to_freq(centibels) == pytest.approx(expected)
def test_failed_cB_conversion():
    """A positive input to cB_to_freq must be rejected with ValueError."""
    pytest.raises(ValueError, cB_to_freq, 1)
def test_tokenization():
    """Exercise `tokenize` on English text: apostrophes, punctuation, emoji."""
    sentence = "I don't split at apostrophes, you see."

    # Apostrophes within words are preserved, so "don't" is a single word,
    # just as "can't" is a single word in the data.
    words_only = tokenize(sentence, 'en')
    assert words_only == ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']

    words_and_punct = tokenize(sentence, 'en', include_punctuation=True)
    assert words_and_punct == [
        'i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'
    ]

    # Some punctuation, such as the dot in a domain name, does not
    # inherently split a word.
    domain_tokens = tokenize("Anything is possible at zombo.com", 'en')
    assert domain_tokens == ['anything', 'is', 'possible', 'at', 'zombo.com']

    # Splits happen after symbols, and at splitting punctuation such as hyphens.
    assert tokenize('😂test', 'en') == ['😂', 'test']
    assert tokenize("flip-flop", 'en') == ['flip', 'flop']

    punctuated = tokenize(
        'this text has... punctuation :)', 'en', include_punctuation=True
    )
    assert punctuated == ['this', 'text', 'has', '...', 'punctuation', ':)']

    # Emoji made of several codepoints, such as 'medium-skinned woman with
    # headscarf' and 'David Bowie', stay in one token, because our Unicode
    # segmentation algorithm is up to date.
    assert tokenize('emoji test 🧕🏽', 'en') == ['emoji', 'test', '🧕🏽']

    lyric = "👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀"
    assert tokenize(lyric, 'en') == [
        '👨‍🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
        'nothing', 'i', 'can', 'do', '🌎', '🚀',
    ]

    # Water wave, surfer, flag of California (indicates ridiculously complete
    # support for Unicode 10 and Emoji 5.0)
    surf_tokens = tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'", 'en')
    assert surf_tokens == ["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"]
def test_casefolding():
    """Check language-specific case folding in German and Turkish tokens."""
    # Each entry is (input text, language code, expected tokens).
    cases = [
        ('WEISS', 'de', ['weiss']),
        ('weiß', 'de', ['weiss']),
        ('İstanbul', 'tr', ['istanbul']),
        ('SIKISINCA', 'tr', ['sıkısınca']),
    ]
    for text, lang, expected in cases:
        assert tokenize(text, lang) == expected
def test_number_smashing():
    """Compare `tokenize` with `lossy_tokenize`, which replaces the digits of
    multi-digit numbers with zeroes."""
    title = '"715 - CRΣΣKS" by Bon Iver'

    # The plain tokenizer keeps the number as written.
    assert tokenize(title, 'en') == ['715', 'crσσks', 'by', 'bon', 'iver']

    # The lossy tokenizer turns its digits into zeroes, with or without
    # punctuation tokens in the output.
    assert lossy_tokenize(title, 'en') == ['000', 'crσσks', 'by', 'bon', 'iver']
    lossy_with_punct = lossy_tokenize(title, 'en', include_punctuation=True)
    assert lossy_with_punct == ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver']

    # A single digit is left as it is; longer numbers keep their shape.
    assert lossy_tokenize('1', 'en') == ['1']
    assert lossy_tokenize('3.14', 'en') == ['0.00']
    assert lossy_tokenize('24601', 'en') == ['00000']

    # Two different five-digit numbers end up with the same frequency.
    first_number = word_frequency('24601', 'en')
    second_number = word_frequency('90210', 'en')
    assert first_number == second_number
def test_phrase_freq():
    """The reciprocal of a phrase's frequency should equal the sum of the
    reciprocals of the frequencies of its tokens."""
    flip_flop = word_frequency("flip-flop", 'en')
    assert flip_flop > 0

    flip = word_frequency('flip', 'en')
    flop = word_frequency('flop', 'en')
    expected_inverse = 1.0 / flip + 1.0 / flop
    assert 1.0 / flip_flop == pytest.approx(expected_inverse)
def test_not_really_random():
    """With bits_per_word=0, the word generators are expected to return a
    fixed phrase."""
    # If your xkcd-style password comes out like this, maybe you shouldn't
    # use it.
    english_phrase = random_words(nwords=4, lang='en', bits_per_word=0)
    assert english_phrase == 'the the the the'

    # Besides testing random_ascii_words, this makes sure we didn't end up
    # with 'eos' as a very common Japanese word.
    japanese_phrase = random_ascii_words(nwords=4, lang='ja', bits_per_word=0)
    assert japanese_phrase == '1 1 1 1'
def test_not_enough_ascii():
    """Asking for 14 bits per word of ASCII words in Chinese must raise
    ValueError (presumably too few ASCII words are available)."""
    request = {'lang': 'zh', 'bits_per_word': 14}
    with pytest.raises(ValueError):
        random_ascii_words(**request)
def test_arabic():
    """Check the normalizations applied when tokenizing Arabic text."""
    # Tatweels (elongation characters) are removed.
    stretched = tokenize('متــــــــعب', 'ar')
    assert stretched == ['متعب']

    # Combining marks are removed.
    vocalized = tokenize('حَرَكَات', 'ar')
    assert vocalized == ['حركات']

    # An Arabic ligature that is affected by NFKC normalization comes out
    # as its two separate letters.
    ligature = tokenize('\ufefb', 'ar')
    assert ligature == ['\u0644\u0627']
def test_ideographic_fallback():
    """Chinese text tokenized as English should stay in one piece."""
    # More complex examples like this, involving the multiple scripts of
    # Japanese, are in test_japanese.py.
    chinese_text = '中国文字'
    tokens = tokenize(chinese_text, 'en')
    assert tokens == ['中国文字']
def test_other_languages():
    """Check tokenization in Thai, Khmer, Hindi, Hebrew, Turkish and Romanian."""
    # Thai letters are left stuck together. If we had better Thai support,
    # we would actually split this into a three-word phrase.
    thai_tokens = tokenize('การเล่นดนตรี', 'th')
    assert thai_tokens == ['การเล่นดนตรี']

    mixed_tokens = tokenize('"การเล่นดนตรี" means "playing music"', 'en')
    assert mixed_tokens == ['การเล่นดนตรี', 'means', 'playing', 'music']

    # Khmer is a script similar to Thai.
    khmer_tokens = tokenize('សូមស្វាគមន៍', 'km')
    assert khmer_tokens == ['សូមស្វាគមន៍']

    # In Hindi, tokens split where there are spaces, and not where there aren't.
    hindi_tokens = tokenize('हिन्दी विक्षनरी', 'hi')
    assert hindi_tokens == ['हिन्दी', 'विक्षनरी']

    # Vowel points are removed in Hebrew.
    hebrew_tokens = tokenize('דֻּגְמָה', 'he')
    assert hebrew_tokens == ['דגמה']

    # Turkish: deal with commas, cedillas, and I's. Both spellings of the
    # input come out as the same token.
    for turkish_text in ('kișinin', 'KİȘİNİN'):
        assert tokenize(turkish_text, 'tr') == ['kişinin']

    # Romanian: cedillas that should be commas-below are replaced.
    for romanian_text in ('acelaşi', 'ACELAŞI'):
        assert tokenize(romanian_text, 'ro') == ['același']