Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-25 10:15:23 +00:00

Commit 9758c69ff0
This changes the version from 1.4.2 to 1.5. This update includes:
* Include Common Crawl as a source; support 11 more languages
* New frequency-merging strategy
* New sources: Chinese from Wikipedia (mostly Traditional), a large Dutch wordlist
* Remove low-quality sources, namely Greek Twitter (kaomoji are too often detected as Greek) and Ukrainian Common Crawl. As a result, Ukrainian is dropped as an available language, and Greek is no longer a 'large' language after all.
* Add Korean tokenization, and include the MeCab files in the data
* Remove marks from more languages
* Deal with commas and cedillas in Turkish and Romanian
Former-commit-id: e6a8f028e3
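As a quick, illustrative sketch (not part of the commit itself), here is how the changes above surface through wordfreq's public API once version 1.5 is installed; the inputs are made-up examples, and the expected results in the comments follow the descriptions above rather than verified output:

from wordfreq import tokenize, word_frequency, available_languages

# Korean tokenization is new in 1.5, backed by the bundled MeCab data files
print(tokenize('안녕하세요', 'ko'))

# Romanian cedillas are normalized to commas-below before lookup
print(tokenize('acelaşi', 'ro'))        # expected: ['același']

# Ukrainian was dropped, so it should no longer be listed
print('uk' in available_languages())    # expected: False

# Frequencies come from the merged wordlists, which now include Common Crawl
print(word_frequency('the', 'en') > 0)  # expected: True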
229 lines
7.5 KiB
Python
from wordfreq import (
    word_frequency, available_languages, cB_to_freq,
    top_n_list, random_words, random_ascii_words, tokenize
)
from nose.tools import (
    eq_, assert_almost_equal, assert_greater, raises
)


def test_freq_examples():
    # Stopwords are most common in the correct language
    assert_greater(word_frequency('the', 'en'),
                   word_frequency('de', 'en'))

    assert_greater(word_frequency('de', 'es'),
                   word_frequency('the', 'es'))


def test_languages():
    # Make sure the number of available languages doesn't decrease
    avail = available_languages()
    assert_greater(len(avail), 26)

    avail_twitter = available_languages('twitter')
    assert_greater(len(avail_twitter), 15)

    # Look up a word representing laughter in each language, and make sure
    # it has a non-zero frequency in the informal 'twitter' list.
    for lang in avail_twitter:
        if lang == 'zh' or lang == 'ja':
            text = '笑'
        elif lang == 'ko':
            text = 'ᄏᄏᄏ'
        elif lang == 'ar':
            text = 'ههههه'
        elif lang == 'ca' or lang == 'es':
            text = 'jaja'
        elif lang in {'de', 'nb', 'sv', 'da'}:
            text = 'haha'
        elif lang == 'pt':
            text = 'kkkk'
        elif lang == 'he':
            text = 'חחח'
        elif lang == 'ru':
            text = 'лол'
        elif lang == 'bg':
            text = 'хаха'
        elif lang == 'ro':
            text = 'haha'
        elif lang == 'el':
            text = 'χαχα'
        else:
            text = 'lol'
        assert_greater(word_frequency(text, lang, wordlist='twitter'), 0, (text, lang))

        # Make up a weirdly verbose language code and make sure
        # we still get it
        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
        assert_greater(word_frequency(text, new_lang_code, wordlist='twitter'), 0, (text, new_lang_code))


def test_twitter():
    avail = available_languages('twitter')
    assert_greater(len(avail), 14)

    for lang in avail:
        assert_greater(word_frequency('rt', lang, 'twitter'),
                       word_frequency('rt', lang, 'combined'))


def test_minimums():
    eq_(word_frequency('esquivalience', 'en'), 0)
    eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6)
    eq_(word_frequency('the', 'en', minimum=1), 1)


def test_most_common_words():
    # If something causes the most common words in well-supported languages to
    # change, we should know.

    def get_most_common(lang):
        """
        Return the single most common word in the language.
        """
        return top_n_list(lang, 1)[0]

    eq_(get_most_common('ar'), 'من')
    eq_(get_most_common('de'), 'die')
    eq_(get_most_common('en'), 'the')
    eq_(get_most_common('es'), 'de')
    eq_(get_most_common('fr'), 'de')
    eq_(get_most_common('it'), 'di')
    eq_(get_most_common('ja'), 'の')
    eq_(get_most_common('nl'), 'de')
    eq_(get_most_common('pt'), 'de')
    eq_(get_most_common('ru'), 'в')
    eq_(get_most_common('tr'), 'bir')
    eq_(get_most_common('zh'), '的')


def test_language_matching():
    freq = word_frequency('的', 'zh')
    eq_(word_frequency('的', 'zh-TW'), freq)
    eq_(word_frequency('的', 'zh-CN'), freq)
    eq_(word_frequency('的', 'zh-Hant'), freq)
    eq_(word_frequency('的', 'zh-Hans'), freq)
    eq_(word_frequency('的', 'yue-HK'), freq)
    eq_(word_frequency('的', 'cmn'), freq)


def test_cB_conversion():
    eq_(cB_to_freq(0), 1.)
    assert_almost_equal(cB_to_freq(-100), 0.1)
    assert_almost_equal(cB_to_freq(-600), 1e-6)


@raises(ValueError)
def test_failed_cB_conversion():
    cB_to_freq(1)


def test_tokenization():
    # We preserve apostrophes within words, so "can't" is a single word in the
    # data
    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])

    eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])

    # Certain punctuation does not inherently split a word.
    eq_(tokenize("Anything is possible at zombo.com", 'en'),
        ['anything', 'is', 'possible', 'at', 'zombo.com'])

    # Splits occur after symbols, and at splitting punctuation such as hyphens.
    eq_(tokenize('😂test', 'en'), ['😂', 'test'])

    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])

    eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
        ['this', 'text', 'has', '...', 'punctuation', ':)'])


def test_casefolding():
    eq_(tokenize('WEISS', 'de'), ['weiss'])
    eq_(tokenize('weiß', 'de'), ['weiss'])
    eq_(tokenize('İstanbul', 'tr'), ['istanbul'])
    eq_(tokenize('SIKISINCA', 'tr'), ['sıkısınca'])


def test_phrase_freq():
    ff = word_frequency("flip-flop", 'en')
    assert_greater(ff, 0)
    assert_almost_equal(
        1.0 / ff,
        1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
    )


def test_not_really_random():
    # If your xkcd-style password comes out like this, maybe you shouldn't
    # use it
    eq_(random_words(nwords=4, lang='en', bits_per_word=0),
        'the the the the')

    # This not only tests random_ascii_words, it makes sure we didn't end
    # up with 'eos' as a very common Japanese word
    eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),
        '1 1 1 1')


@raises(ValueError)
def test_not_enough_ascii():
    random_ascii_words(lang='zh', bits_per_word=14)


def test_arabic():
    # Remove tatweels
    eq_(
        tokenize('متــــــــعب', 'ar'),
        ['متعب']
    )

    # Remove combining marks
    eq_(
        tokenize('حَرَكَات', 'ar'),
        ['حركات']
    )

    eq_(
        tokenize('\ufefb', 'ar'),  # An Arabic ligature...
        ['\u0644\u0627']  # ...that is affected by NFKC normalization
    )


def test_ideographic_fallback():
    # Try tokenizing Chinese text as English -- it should remain stuck together.
    eq_(tokenize('中国文字', 'en'), ['中国文字'])

    # When Japanese is tagged with the wrong language, it will be split
    # at script boundaries.
    ja_text = 'ひらがなカタカナromaji'
    eq_(
        tokenize(ja_text, 'en'),
        ['ひらがな', 'カタカナ', 'romaji']
    )


def test_other_languages():
    # Test that we leave Thai letters stuck together. If we had better Thai support,
    # we would actually split this into a three-word phrase.
    eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
    eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
        ['การเล่นดนตรี', 'means', 'playing', 'music'])

    # Test Khmer, a script similar to Thai
    eq_(tokenize('សូមស្វាគមន៍', 'km'), ['សូមស្វាគមន៍'])

    # Test Hindi -- tokens split where there are spaces, and not where there aren't
    eq_(tokenize('हिन्दी विक्षनरी', 'hi'), ['हिन्दी', 'विक्षनरी'])

    # Remove vowel points in Hebrew
    eq_(tokenize('דֻּגְמָה', 'he'), ['דגמה'])

    # Deal with commas, cedillas, and I's in Turkish
    eq_(tokenize('kișinin', 'tr'), ['kişinin'])
    eq_(tokenize('KİȘİNİN', 'tr'), ['kişinin'])

    # Deal with cedillas that should be commas-below in Romanian
    eq_(tokenize('acelaşi', 'ro'), ['același'])
    eq_(tokenize('ACELAŞI', 'ro'), ['același'])