mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
f885a60bf0
There are Unicode normalization problems with Malayalam -- as best I understand it, Unicode simply neglected to include normalization forms for Malayalam "chillu" characters even though they changed how they're represented in Unicode 5.1 and again in Unicode 9. The result is that words that print the same end up with multiple entries, with different codepoint sequences that don't normalize to each other. I certainly don't know how to resolve this, and it would need to be resolved to have something that we could reasonably call Malayalam word frequencies.
251 lines
9.5 KiB
Python
251 lines
9.5 KiB
Python
from wordfreq import (
|
||
word_frequency, available_languages, cB_to_freq,
|
||
top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize
|
||
)
|
||
import pytest
|
||
|
||
|
||
def test_freq_examples():
|
||
# Stopwords are most common in the correct language
|
||
assert word_frequency('the', 'en') > word_frequency('de', 'en')
|
||
assert word_frequency('de', 'es') > word_frequency('the', 'es')
|
||
# We get word frequencies from the 'large' list when available
|
||
assert word_frequency('infrequency', 'en') > 0.
|
||
|
||
|
||
def test_languages():
|
||
# Make sure we get all the languages when looking for the default
|
||
# 'best' wordlist
|
||
avail = available_languages()
|
||
assert len(avail) >= 34
|
||
|
||
# 'small' covers the same languages, but with some different lists
|
||
avail_small = available_languages('small')
|
||
assert len(avail_small) == len(avail)
|
||
assert avail_small != avail
|
||
|
||
# 'combined' is the same as 'small'
|
||
avail_old_name = available_languages('combined')
|
||
assert avail_old_name == avail_small
|
||
|
||
# 'large' covers fewer languages
|
||
avail_large = available_languages('large')
|
||
assert len(avail_large) >= 14
|
||
assert len(avail) > len(avail_large)
|
||
|
||
# Look up the digit '2' in the main word list for each language
|
||
for lang in avail:
|
||
assert word_frequency('2', lang) > 0
|
||
|
||
# Make up a weirdly verbose language code and make sure
|
||
# we still get it
|
||
new_lang_code = '%s-001-x-fake-extension' % lang.upper()
|
||
assert word_frequency('2', new_lang_code) > 0
|
||
|
||
|
||
def test_minimums():
|
||
assert word_frequency('esquivalience', 'en') == 0
|
||
assert word_frequency('esquivalience', 'en', minimum=1e-6) == 1e-6
|
||
assert word_frequency('the', 'en', minimum=1) == 1
|
||
|
||
|
||
def test_most_common_words():
|
||
# If something causes the most common words in well-supported languages to
|
||
# change, we should know.
|
||
|
||
def get_most_common(lang):
|
||
"""
|
||
Return the single most common word in the language.
|
||
"""
|
||
return top_n_list(lang, 1)[0]
|
||
|
||
assert get_most_common('ar') == 'في'
|
||
assert get_most_common('bg') == 'на'
|
||
assert get_most_common('bn') == 'না'
|
||
assert get_most_common('ca') == 'de'
|
||
assert get_most_common('cs') == 'a'
|
||
assert get_most_common('da') == 'i'
|
||
assert get_most_common('el') == 'και'
|
||
assert get_most_common('de') == 'die'
|
||
assert get_most_common('en') == 'the'
|
||
assert get_most_common('es') == 'de'
|
||
assert get_most_common('fi') == 'ja'
|
||
assert get_most_common('fil') == 'sa'
|
||
assert get_most_common('fr') == 'de'
|
||
assert get_most_common('he') == 'את'
|
||
assert get_most_common('hi') == 'के'
|
||
assert get_most_common('hu') == 'a'
|
||
assert get_most_common('id') == 'yang'
|
||
assert get_most_common('is') == 'og'
|
||
assert get_most_common('it') == 'di'
|
||
assert get_most_common('ja') == 'の'
|
||
assert get_most_common('ko') == '이'
|
||
assert get_most_common('lt') == 'ir'
|
||
assert get_most_common('lv') == 'un'
|
||
assert get_most_common('mk') == 'на'
|
||
assert get_most_common('ms') == 'yang'
|
||
assert get_most_common('nb') == 'i'
|
||
assert get_most_common('nl') == 'de'
|
||
assert get_most_common('pl') == 'w'
|
||
assert get_most_common('pt') == 'de'
|
||
assert get_most_common('ro') == 'de'
|
||
assert get_most_common('ru') == 'в'
|
||
assert get_most_common('sh') == 'je'
|
||
assert get_most_common('sk') == 'a'
|
||
assert get_most_common('sl') == 'je'
|
||
assert get_most_common('sv') == 'är'
|
||
assert get_most_common('ta') == 'ஒரு'
|
||
assert get_most_common('tr') == 've'
|
||
assert get_most_common('uk') == 'в'
|
||
assert get_most_common('ur') == 'کے'
|
||
assert get_most_common('vi') == 'là'
|
||
assert get_most_common('zh') == '的'
|
||
|
||
|
||
def test_language_matching():
|
||
freq = word_frequency('的', 'zh')
|
||
assert word_frequency('的', 'zh-TW') == freq
|
||
assert word_frequency('的', 'zh-CN') == freq
|
||
assert word_frequency('的', 'zh-Hant') == freq
|
||
assert word_frequency('的', 'zh-Hans') == freq
|
||
assert word_frequency('的', 'yue-HK') == freq
|
||
assert word_frequency('的', 'cmn') == freq
|
||
|
||
|
||
def test_cB_conversion():
|
||
assert cB_to_freq(0) == 1.
|
||
assert cB_to_freq(-100) == pytest.approx(0.1)
|
||
assert cB_to_freq(-600) == pytest.approx(1e-6)
|
||
|
||
|
||
def test_failed_cB_conversion():
|
||
with pytest.raises(ValueError):
|
||
cB_to_freq(1)
|
||
|
||
|
||
def test_tokenization():
|
||
# We preserve apostrophes within words, so "can't" is a single word in the
|
||
# data
|
||
assert (
|
||
tokenize("I don't split at apostrophes, you see.", 'en')
|
||
== ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']
|
||
)
|
||
|
||
assert (
|
||
tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True)
|
||
== ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.']
|
||
)
|
||
|
||
# Certain punctuation does not inherently split a word.
|
||
assert (
|
||
tokenize("Anything is possible at zombo.com", 'en')
|
||
== ['anything', 'is', 'possible', 'at', 'zombo.com']
|
||
)
|
||
|
||
# Splits occur after symbols, and at splitting punctuation such as hyphens.
|
||
assert tokenize('😂test', 'en') == ['😂', 'test']
|
||
assert tokenize("flip-flop", 'en') == ['flip', 'flop']
|
||
assert (
|
||
tokenize('this text has... punctuation :)', 'en', include_punctuation=True)
|
||
== ['this', 'text', 'has', '...', 'punctuation', ':)']
|
||
)
|
||
|
||
# Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
|
||
# and 'David Bowie' stay together, because our Unicode segmentation algorithm
|
||
# is up to date
|
||
assert tokenize('emoji test 🧕🏽', 'en') == ['emoji', 'test', '🧕🏽']
|
||
assert (
|
||
tokenize("👨🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en')
|
||
== ['👨🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
|
||
'nothing', 'i', 'can', 'do', '🌎', '🚀']
|
||
)
|
||
|
||
# Water wave, surfer, flag of California (indicates ridiculously complete support
|
||
# for Unicode 10 and Emoji 5.0)
|
||
assert tokenize("Surf's up 🌊🏄🏴'",'en') == ["surf's", "up", "🌊", "🏄", "🏴"]
|
||
|
||
|
||
def test_casefolding():
|
||
assert tokenize('WEISS', 'de') == ['weiss']
|
||
assert tokenize('weiß', 'de') == ['weiss']
|
||
assert tokenize('İstanbul', 'tr') == ['istanbul']
|
||
assert tokenize('SIKISINCA', 'tr') == ['sıkısınca']
|
||
|
||
|
||
def test_number_smashing():
|
||
assert tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['715', 'crσσks', 'by', 'bon', 'iver']
|
||
assert lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['000', 'crσσks', 'by', 'bon', 'iver']
|
||
assert (
|
||
lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True)
|
||
== ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver']
|
||
)
|
||
assert lossy_tokenize('1', 'en') == ['1']
|
||
assert lossy_tokenize('3.14', 'en') == ['0.00']
|
||
assert lossy_tokenize('24601', 'en') == ['00000']
|
||
assert word_frequency('24601', 'en') == word_frequency('90210', 'en')
|
||
|
||
|
||
def test_phrase_freq():
|
||
ff = word_frequency("flip-flop", 'en')
|
||
assert ff > 0
|
||
phrase_freq = 1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
|
||
assert 1.0 / ff == pytest.approx(phrase_freq, rel=0.01)
|
||
|
||
|
||
def test_not_really_random():
|
||
# If your xkcd-style password comes out like this, maybe you shouldn't
|
||
# use it
|
||
assert random_words(nwords=4, lang='en', bits_per_word=0) == 'the the the the'
|
||
|
||
# This not only tests random_ascii_words, it makes sure we didn't end
|
||
# up with 'eos' as a very common Japanese word
|
||
assert random_ascii_words(nwords=4, lang='ja', bits_per_word=0) == '00 00 00 00'
|
||
|
||
|
||
def test_not_enough_ascii():
|
||
with pytest.raises(ValueError):
|
||
random_ascii_words(lang='zh', bits_per_word=16)
|
||
|
||
|
||
def test_arabic():
|
||
# Remove tatweels
|
||
assert tokenize('متــــــــعب', 'ar') == ['متعب']
|
||
|
||
# Remove combining marks
|
||
assert tokenize('حَرَكَات', 'ar') == ['حركات']
|
||
|
||
# An Arabic ligature that is affected by NFKC normalization
|
||
assert tokenize('\ufefb', 'ar') == ['\u0644\u0627']
|
||
|
||
|
||
def test_ideographic_fallback():
|
||
# Try tokenizing Chinese text as English -- it should remain stuck together.
|
||
#
|
||
# More complex examples like this, involving the multiple scripts of Japanese,
|
||
# are in test_japanese.py.
|
||
assert tokenize('中国文字', 'en') == ['中国文字']
|
||
|
||
|
||
def test_other_languages():
|
||
# Test that we leave Thai letters stuck together. If we had better Thai support,
|
||
# we would actually split this into a three-word phrase.
|
||
assert tokenize('การเล่นดนตรี', 'th') == ['การเล่นดนตรี']
|
||
assert tokenize('"การเล่นดนตรี" means "playing music"', 'en') == ['การเล่นดนตรี', 'means', 'playing', 'music']
|
||
|
||
# Test Khmer, a script similar to Thai
|
||
assert tokenize('សូមស្វាគមន៍', 'km') == ['សូមស្វាគមន៍']
|
||
|
||
# Test Hindi -- tokens split where there are spaces, and not where there aren't
|
||
assert tokenize('हिन्दी विक्षनरी', 'hi') == ['हिन्दी', 'विक्षनरी']
|
||
|
||
# Remove vowel points in Hebrew
|
||
assert tokenize('דֻּגְמָה', 'he') == ['דגמה']
|
||
|
||
# Deal with commas, cedillas, and I's in Turkish
|
||
assert tokenize('kișinin', 'tr') == ['kişinin']
|
||
assert tokenize('KİȘİNİN', 'tr') == ['kişinin']
|
||
|
||
# Deal with cedillas that should be commas-below in Romanian
|
||
assert tokenize('acelaşi', 'ro') == ['același']
|
||
assert tokenize('ACELAŞI', 'ro') == ['același']
|