2015-05-22 00:34:17 +00:00
|
|
|
|
from wordfreq import (
|
2015-07-07 20:21:22 +00:00
|
|
|
|
word_frequency, available_languages, cB_to_freq,
|
2018-03-08 21:25:45 +00:00
|
|
|
|
top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize
|
2015-05-22 00:34:17 +00:00
|
|
|
|
)
|
2018-06-01 20:33:06 +00:00
|
|
|
|
import pytest
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_freq_examples():
    """Spot-check a few relative and absolute word frequencies."""
    # Stopwords are most common in the correct language
    for lang, native, foreign in (('en', 'the', 'de'), ('es', 'de', 'the')):
        assert word_frequency(native, lang) > word_frequency(foreign, lang)

    # We get word frequencies from the 'large' list when available
    rare_word_freq = word_frequency('infrequency', 'en')
    assert rare_word_freq > 0.
|
2018-03-08 23:09:57 +00:00
|
|
|
|
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
|
|
|
|
def test_languages():
    """Check language coverage of each wordlist and lookup by verbose codes."""
    # Make sure we get all the languages when looking for the default
    # 'best' wordlist
    best_langs = available_languages()
    assert len(best_langs) > 32

    # 'small' covers the same languages, but with some different lists
    small_langs = available_languages('small')
    assert len(small_langs) == len(best_langs)
    assert small_langs != best_langs

    # 'combined' is the same as 'small'
    combined_langs = available_languages('combined')
    assert combined_langs == small_langs

    # 'large' covers fewer languages
    large_langs = available_languages('large')
    assert len(large_langs) > 12
    assert len(large_langs) < len(best_langs)

    for code in best_langs:
        # Look up the digit '2' in the main word list for each language
        assert word_frequency('2', code) > 0

        # Make up a weirdly verbose language code and make sure
        # we still get it
        verbose_code = '{}-001-x-fake-extension'.format(code.upper())
        assert word_frequency('2', verbose_code) > 0
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
|
|
|
|
|
2015-07-07 19:46:33 +00:00
|
|
|
|
def test_minimums():
    """Check that `minimum` is returned when the real frequency is lower."""
    zero_freq_word = 'esquivalience'

    # Without a minimum, this word has no frequency at all.
    assert word_frequency(zero_freq_word, 'en') == 0

    # With a minimum, the minimum itself comes back.
    assert word_frequency(zero_freq_word, 'en', minimum=1e-6) == 1e-6
    assert word_frequency('the', 'en', minimum=1) == 1
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
2016-07-29 16:19:12 +00:00
|
|
|
|
|
2015-05-22 00:34:17 +00:00
|
|
|
|
def test_most_common_words():
    """Pin the single most common word in each well-supported language.

    If something causes the most common words in these languages to
    change, we should know.
    """
    expected_top_words = [
        ('ar', 'في'),
        ('de', 'die'),
        ('en', 'the'),
        ('es', 'de'),
        ('fr', 'de'),
        ('it', 'di'),
        ('ja', 'の'),
        ('nl', 'de'),
        ('pl', 'w'),
        ('pt', 'de'),
        ('ru', 'в'),
        ('tr', 'bir'),
        ('zh', '的'),
    ]
    for lang, top_word in expected_top_words:
        # The first entry of a length-1 top list is the most common word.
        assert top_n_list(lang, 1)[0] == top_word
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_language_matching():
    """Check that each listed code gives '的' the same frequency as 'zh'."""
    reference_freq = word_frequency('的', 'zh')
    variant_codes = ('zh-TW', 'zh-CN', 'zh-Hant', 'zh-Hans', 'yue-HK', 'cmn')
    for code in variant_codes:
        assert word_frequency('的', code) == reference_freq
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
|
|
|
|
|
2015-06-22 21:36:30 +00:00
|
|
|
|
def test_cB_conversion():
    """Check cB_to_freq at the inputs 0, -100 and -600."""
    # Zero converts to exactly 1; the others are compared approximately.
    assert cB_to_freq(0) == 1.
    for cb_value, expected_freq in ((-100, 0.1), (-600, 1e-6)):
        assert cB_to_freq(cb_value) == pytest.approx(expected_freq)
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
|
|
|
|
|
2015-06-22 21:36:30 +00:00
|
|
|
|
def test_failed_cB_conversion():
    """Check that cB_to_freq rejects the input 1 with a ValueError."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        cB_to_freq(1)
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_tokenization():
    """Exercise English tokenization on apostrophes, punctuation and emoji."""
    # We preserve apostrophes within words, so "can't" is a single word in the
    # data
    sentence = "I don't split at apostrophes, you see."
    assert tokenize(sentence, 'en') == [
        'i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'
    ]
    assert tokenize(sentence, 'en', include_punctuation=True) == [
        'i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'
    ]

    # Certain punctuation does not inherently split a word.
    assert tokenize("Anything is possible at zombo.com", 'en') == [
        'anything', 'is', 'possible', 'at', 'zombo.com'
    ]

    # Splits occur after symbols, and at splitting punctuation such as hyphens.
    assert tokenize('😂test', 'en') == ['😂', 'test']
    assert tokenize("flip-flop", 'en') == ['flip', 'flop']
    punctuated = tokenize(
        'this text has... punctuation :)', 'en', include_punctuation=True
    )
    assert punctuated == ['this', 'text', 'has', '...', 'punctuation', ':)']

    # Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
    # and 'David Bowie' stay together, because our Unicode segmentation algorithm
    # is up to date
    assert tokenize('emoji test 🧕🏽', 'en') == ['emoji', 'test', '🧕🏽']
    lyric = "👨🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀"
    assert tokenize(lyric, 'en') == [
        '👨🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
        'nothing', 'i', 'can', 'do', '🌎', '🚀'
    ]

    # Water wave, surfer, flag of California (indicates ridiculously complete support
    # for Unicode 10 and Emoji 5.0)
    surf_tokens = tokenize("Surf's up 🌊🏄🏴'", 'en')
    assert surf_tokens == ["surf's", "up", "🌊", "🏄", "🏴"]
|
2017-08-08 15:35:28 +00:00
|
|
|
|
|
2015-06-30 19:14:02 +00:00
|
|
|
|
|
|
|
|
|
def test_casefolding():
    """Check the case-folded form tokenize produces for German and Turkish."""
    # Each row is (input text, language code, expected single token).
    folding_cases = [
        ('WEISS', 'de', 'weiss'),
        ('weiß', 'de', 'weiss'),
        ('İstanbul', 'tr', 'istanbul'),
        ('SIKISINCA', 'tr', 'sıkısınca'),
    ]
    for text, lang, folded in folding_cases:
        assert tokenize(text, lang) == [folded]
|
2015-06-30 19:14:02 +00:00
|
|
|
|
|
|
|
|
|
|
2017-01-05 22:59:26 +00:00
|
|
|
|
def test_number_smashing():
    """Compare how tokenize and lossy_tokenize treat digits.

    The assertions show tokenize keeping digits as written, while
    lossy_tokenize replaces the digits of multi-digit numbers with zeros
    and leaves a lone digit alone.
    """
    title = '"715 - CRΣΣKS" by Bon Iver'
    assert tokenize(title, 'en') == ['715', 'crσσks', 'by', 'bon', 'iver']
    assert lossy_tokenize(title, 'en') == ['000', 'crσσks', 'by', 'bon', 'iver']
    assert lossy_tokenize(title, 'en', include_punctuation=True) == [
        '"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver'
    ]

    for number_text, smashed in (('1', '1'), ('3.14', '0.00'), ('24601', '00000')):
        assert lossy_tokenize(number_text, 'en') == [smashed]

    # Two different five-digit numbers must get the same frequency.
    assert word_frequency('24601', 'en') == word_frequency('90210', 'en')
|
2017-01-05 22:59:26 +00:00
|
|
|
|
|
|
|
|
|
|
2015-06-25 15:25:51 +00:00
|
|
|
|
def test_phrase_freq():
    """Check how the frequency of "flip-flop" relates to its two parts.

    The reciprocal of the phrase's frequency should approximately equal the
    sum of the reciprocals of the frequencies of 'flip' and 'flop'.
    """
    combined_freq = word_frequency("flip-flop", 'en')
    assert combined_freq > 0

    reciprocal_sum = sum(
        1.0 / word_frequency(part, 'en') for part in ('flip', 'flop')
    )
    assert 1.0 / combined_freq == pytest.approx(reciprocal_sum)
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_not_really_random():
    """Check the fixed output of the random word pickers at zero bits per word."""
    # If your xkcd-style password comes out like this, maybe you shouldn't
    # use it
    english_phrase = random_words(nwords=4, lang='en', bits_per_word=0)
    assert english_phrase == 'the the the the'

    # This not only tests random_ascii_words, it makes sure we didn't end
    # up with 'eos' as a very common Japanese word
    japanese_phrase = random_ascii_words(nwords=4, lang='ja', bits_per_word=0)
    assert japanese_phrase == '1 1 1 1'
|
2015-05-22 00:34:17 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_not_enough_ascii():
    """Check that asking for 14-bit ASCII words in 'zh' raises ValueError."""
    request = dict(lang='zh', bits_per_word=14)
    with pytest.raises(ValueError):
        random_ascii_words(**request)
|
2015-07-07 19:10:59 +00:00
|
|
|
|
|
2015-07-20 20:48:36 +00:00
|
|
|
|
|
2016-07-15 19:10:25 +00:00
|
|
|
|
def test_arabic():
    """Check the cleaned-up single token tokenize returns for Arabic input."""
    arabic_cases = [
        # Remove tatweels
        ('متــــــــعب', 'متعب'),
        # Remove combining marks
        ('حَرَكَات', 'حركات'),
        # An Arabic ligature that is affected by NFKC normalization
        ('\ufefb', '\u0644\u0627'),
    ]
    for raw_text, cleaned in arabic_cases:
        assert tokenize(raw_text, 'ar') == [cleaned]
|
2015-08-24 20:24:49 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_ideographic_fallback():
    """Check that Chinese text tokenized as English stays in one token."""
    # Try tokenizing Chinese text as English -- it should remain stuck together.
    #
    # More complex examples like this, involving the multiple scripts of Japanese,
    # are in test_japanese.py.
    chinese_text = '中国文字'
    assert tokenize(chinese_text, 'en') == [chinese_text]
|
2015-08-24 20:24:49 +00:00
|
|
|
|
|
2018-03-08 21:25:45 +00:00
|
|
|
|
|
2016-07-15 19:10:25 +00:00
|
|
|
|
def test_other_languages():
    """Check tokenization of Thai, Khmer, Hindi, Hebrew, Turkish and Romanian."""
    # Test that we leave Thai letters stuck together. If we had better Thai support,
    # we would actually split this into a three-word phrase.
    thai_phrase = 'การเล่นดนตรี'
    assert tokenize(thai_phrase, 'th') == [thai_phrase]
    quoted_thai_tokens = tokenize('"การเล่นดนตรี" means "playing music"', 'en')
    assert quoted_thai_tokens == [thai_phrase, 'means', 'playing', 'music']

    # Test Khmer, a script similar to Thai
    khmer_phrase = 'សូមស្វាគមន៍'
    assert tokenize(khmer_phrase, 'km') == [khmer_phrase]

    # Test Hindi -- tokens split where there are spaces, and not where there aren't
    hindi_tokens = tokenize('हिन्दी विक्षनरी', 'hi')
    assert hindi_tokens == ['हिन्दी', 'विक्षनरी']

    # Remove vowel points in Hebrew
    hebrew_tokens = tokenize('דֻּגְמָה', 'he')
    assert hebrew_tokens == ['דגמה']

    # Deal with commas, cedillas, and I's in Turkish
    for turkish_text in ('kișinin', 'KİȘİNİN'):
        assert tokenize(turkish_text, 'tr') == ['kişinin']

    # Deal with cedillas that should be commas-below in Romanian
    for romanian_text in ('acelaşi', 'ACELAŞI'):
        assert tokenize(romanian_text, 'ro') == ['același']
|