wordfreq/tests/test.py
Robyn Speer 9dac967ca3 Tokenize by graphemes, not codepoints (#50)
* Tokenize by graphemes, not codepoints

* Add more documentation to TOKEN_RE

* Remove extra line break

* Update docstring - Brahmic scripts are no longer an exception

* approve using version 2017.07.28 of regex
2017-08-08 11:35:28 -04:00

255 lines
8.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from wordfreq import (
word_frequency, available_languages, cB_to_freq,
top_n_list, random_words, random_ascii_words, tokenize
)
from nose.tools import (
eq_, assert_almost_equal, assert_greater, raises
)
def test_freq_examples():
# Stopwords are most common in the correct language
assert_greater(word_frequency('the', 'en'),
word_frequency('de', 'en'))
assert_greater(word_frequency('de', 'es'),
word_frequency('the', 'es'))
# To test the reasonableness of the Twitter list, we want to look up a
# common word representing laughter in each language. The default for
# languages not listed here is 'haha'.
LAUGHTER_WORDS = {
'en': 'lol',
'hi': 'lol',
'cs': 'lol',
'ru': 'лол',
'zh': '',
'ja': '',
'ar': '',
'fa': 'خخخخ',
'ca': 'jaja',
'es': 'jaja',
'fr': 'ptdr',
'pt': 'kkkk',
'he': 'חחח',
'bg': 'ахаха',
'uk': 'хаха',
}
def test_languages():
# Make sure the number of available languages doesn't decrease
avail = available_languages()
assert_greater(len(avail), 26)
# Look up the digit '2' in the main word list for each language
for lang in avail:
assert_greater(word_frequency('2', lang), 0, lang)
# Make up a weirdly verbose language code and make sure
# we still get it
new_lang_code = '%s-001-x-fake-extension' % lang.upper()
assert_greater(word_frequency('2', new_lang_code), 0, new_lang_code)
def test_twitter():
avail = available_languages('twitter')
assert_greater(len(avail), 15)
for lang in avail:
assert_greater(word_frequency('rt', lang, 'twitter'),
word_frequency('rt', lang, 'combined'))
text = LAUGHTER_WORDS.get(lang, 'haha')
assert_greater(word_frequency(text, lang, wordlist='twitter'), 0, (text, lang))
def test_minimums():
eq_(word_frequency('esquivalience', 'en'), 0)
eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6)
eq_(word_frequency('the', 'en', minimum=1), 1)
def test_most_common_words():
# If something causes the most common words in well-supported languages to
# change, we should know.
def get_most_common(lang):
"""
Return the single most common word in the language.
"""
return top_n_list(lang, 1)[0]
eq_(get_most_common('ar'), 'في')
eq_(get_most_common('de'), 'die')
eq_(get_most_common('en'), 'the')
eq_(get_most_common('es'), 'de')
eq_(get_most_common('fr'), 'de')
eq_(get_most_common('it'), 'di')
eq_(get_most_common('ja'), '')
eq_(get_most_common('nl'), 'de')
eq_(get_most_common('pl'), 'w')
eq_(get_most_common('pt'), 'de')
eq_(get_most_common('ru'), 'в')
eq_(get_most_common('tr'), 'bir')
eq_(get_most_common('zh'), '')
def test_language_matching():
freq = word_frequency('', 'zh')
eq_(word_frequency('', 'zh-TW'), freq)
eq_(word_frequency('', 'zh-CN'), freq)
eq_(word_frequency('', 'zh-Hant'), freq)
eq_(word_frequency('', 'zh-Hans'), freq)
eq_(word_frequency('', 'yue-HK'), freq)
eq_(word_frequency('', 'cmn'), freq)
def test_cB_conversion():
eq_(cB_to_freq(0), 1.)
assert_almost_equal(cB_to_freq(-100), 0.1)
assert_almost_equal(cB_to_freq(-600), 1e-6)
@raises(ValueError)
def test_failed_cB_conversion():
cB_to_freq(1)
def test_tokenization():
# We preserve apostrophes within words, so "can't" is a single word in the
# data
eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
# Certain punctuation does not inherently split a word.
eq_(tokenize("Anything is possible at zombo.com", 'en'),
['anything', 'is', 'possible', 'at', 'zombo.com'])
# Splits occur after symbols, and at splitting punctuation such as hyphens.
eq_(tokenize('😂test', 'en'), ['😂', 'test'])
eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])
eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
['this', 'text', 'has', '...', 'punctuation', ':)'])
# Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
# and 'David Bowie' stay together, because our Unicode segmentation algorithm
# is up to date
eq_(tokenize('emoji test 🧕🏽', 'en'), ['emoji', 'test', '🧕🏽'])
eq_(tokenize("👨‍🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en'),
['👨‍🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
'nothing', 'i', 'can', 'do', '🌎', '🚀'])
# Water wave, surfer, flag of California (indicates ridiculously complete support
# for Unicode 10 and Emoji 5.0)
eq_(tokenize("Surf's up 🌊🏄🏴󠁵󠁳󠁣󠁡󠁿'",'en'),
["surf's", "up", "🌊", "🏄", "🏴󠁵󠁳󠁣󠁡󠁿"])
def test_casefolding():
eq_(tokenize('WEISS', 'de'), ['weiss'])
eq_(tokenize('weiß', 'de'), ['weiss'])
eq_(tokenize('İstanbul', 'tr'), ['istanbul'])
eq_(tokenize('SIKISINCA', 'tr'), ['sıkısınca'])
def test_number_smashing():
eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'),
['715', 'crσσks', 'by', 'bon', 'iver'])
eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', combine_numbers=True),
['000', 'crσσks', 'by', 'bon', 'iver'])
eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', combine_numbers=True, include_punctuation=True),
['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver'])
eq_(tokenize('1', 'en', combine_numbers=True), ['1'])
eq_(tokenize('3.14', 'en', combine_numbers=True), ['0.00'])
eq_(tokenize('24601', 'en', combine_numbers=True), ['00000'])
eq_(word_frequency('24601', 'en'), word_frequency('90210', 'en'))
def test_phrase_freq():
ff = word_frequency("flip-flop", 'en')
assert_greater(ff, 0)
assert_almost_equal(
1.0 / ff,
1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
)
def test_not_really_random():
# If your xkcd-style password comes out like this, maybe you shouldn't
# use it
eq_(random_words(nwords=4, lang='en', bits_per_word=0),
'the the the the')
# This not only tests random_ascii_words, it makes sure we didn't end
# up with 'eos' as a very common Japanese word
eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),
'00 00 00 00')
@raises(ValueError)
def test_not_enough_ascii():
random_ascii_words(lang='zh', bits_per_word=14)
def test_arabic():
# Remove tatweels
eq_(
tokenize('متــــــــعب', 'ar'),
['متعب']
)
# Remove combining marks
eq_(
tokenize('حَرَكَات', 'ar'),
['حركات']
)
eq_(
tokenize('\ufefb', 'ar'), # An Arabic ligature...
['\u0644\u0627'] # ...that is affected by NFKC normalization
)
def test_ideographic_fallback():
# Try tokenizing Chinese text as English -- it should remain stuck together.
eq_(tokenize('中国文字', 'en'), ['中国文字'])
# When Japanese is tagged with the wrong language, it will be split
# at script boundaries.
ja_text = 'ひらがなカタカナromaji'
eq_(
tokenize(ja_text, 'en'),
['ひらがな', 'カタカナ', 'romaji']
)
def test_other_languages():
# Test that we leave Thai letters stuck together. If we had better Thai support,
# we would actually split this into a three-word phrase.
eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
['การเล่นดนตรี', 'means', 'playing', 'music'])
# Test Khmer, a script similar to Thai
eq_(tokenize('សូមស្វាគមន៍', 'km'), ['សូមស្វាគមន៍'])
# Test Hindi -- tokens split where there are spaces, and not where there aren't
eq_(tokenize('हिन्दी विक्षनरी', 'hi'), ['हिन्दी', 'विक्षनरी'])
# Remove vowel points in Hebrew
eq_(tokenize('דֻּגְמָה', 'he'), ['דגמה'])
# Deal with commas, cedillas, and I's in Turkish
eq_(tokenize('kișinin', 'tr'), ['kişinin'])
eq_(tokenize('KİȘİNİN', 'tr'), ['kişinin'])
# Deal with cedillas that should be commas-below in Romanian
eq_(tokenize('acelaşi', 'ro'), ['același'])
eq_(tokenize('ACELAŞI', 'ro'), ['același'])