Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 09:21:37 +00:00)
Commit a95b360563
(file header not shown in this extract)
@@ -14,7 +14,7 @@ those cases we want to detect only the most obvious token boundaries.
 
 In this situation, we no longer try to detect script changes, such as between
 kanji and katakana, as token boundaries. This particularly allows us to keep
-together Japanese words where ヶ appears betwen kanji, as well as words that
+together Japanese words where ヶ appears between kanji, as well as words that
 use the iteration mark 々.
 
 This change does not affect any word frequencies. (The Japanese word list uses
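For context on the paragraph above: the behavior it describes is exercised by the Japanese tests later in this diff. A minimal sketch of the same checks, assuming wordfreq 2.1 is installed:

from wordfreq import simple_tokenize

# ヶ acts like a kanji here even though it is technically katakana; the word stays together.
assert simple_tokenize("犬ヶ島") == ["犬ヶ島"]

# The iteration mark 々 no longer introduces a token boundary either.
assert simple_tokenize("晴々しい") == ["晴々しい"]

# Explicit separators such as the middle dot are still boundaries.
assert simple_tokenize("トナー・カートリッジ") == ["トナー", "カートリッジ"]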
README.md (15 lines changed)
@@ -174,13 +174,13 @@ least 3 different sources of word frequencies:
 Bosnian bs [1] 3 - │ Yes Yes - - - Yes - -
 Bulgarian bg 3 - │ Yes Yes - - - Yes - -
 Catalan ca 4 - │ Yes Yes Yes - - Yes - -
-Chinese zh [3] 6 Yes │ Yes - Yes Yes Yes Yes - Jieba
+Chinese zh [3] 7 Yes │ Yes Yes Yes Yes Yes Yes - Jieba
 Croatian hr [1] 3 │ Yes Yes - - - Yes - -
-Czech cs 3 - │ Yes Yes - - - Yes - -
+Czech cs 5 Yes │ Yes Yes Yes - Yes Yes - -
 Danish da 3 - │ Yes Yes - - - Yes - -
-Dutch nl 4 Yes │ Yes Yes Yes - - Yes - -
+Dutch nl 5 Yes │ Yes Yes Yes - Yes Yes - -
 English en 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
-Finnish fi 5 Yes │ Yes Yes Yes - - Yes Yes -
+Finnish fi 6 Yes │ Yes Yes Yes - Yes Yes Yes -
 French fr 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
 German de 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
 Greek el 3 - │ Yes Yes - - Yes - - -
@@ -191,13 +191,14 @@ least 3 different sources of word frequencies:
 Italian it 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
 Japanese ja 5 Yes │ Yes Yes - - Yes Yes Yes -
 Korean ko 4 - │ Yes Yes - - - Yes Yes -
+Latvian lv 4 - │ Yes Yes - - Yes Yes - -
 Macedonian mk 3 - │ Yes Yes Yes - - - - -
 Malay ms 3 - │ Yes Yes - - - Yes - -
 Norwegian nb [2] 4 - │ Yes Yes - - - Yes Yes -
 Persian fa 3 - │ Yes Yes - - - Yes - -
-Polish pl 5 Yes │ Yes Yes Yes - - Yes Yes -
+Polish pl 6 Yes │ Yes Yes Yes - Yes Yes Yes -
 Portuguese pt 5 Yes │ Yes Yes Yes - Yes Yes - -
-Romanian ro 3 - │ Yes Yes - - - Yes - -
+Romanian ro 4 - │ Yes Yes - - Yes Yes - -
 Russian ru 6 Yes │ Yes Yes Yes Yes Yes Yes - -
 Serbian sr [1] 3 - │ Yes Yes - - - Yes - -
 Spanish es 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
@@ -219,7 +220,7 @@ Chinese, with primarily Mandarin Chinese vocabulary. See "Multi-script
 languages" below.
 
 Some languages provide 'large' wordlists, including words with a Zipf frequency
-between 1.0 and 3.0. These are available in 13 languages that are covered by
+between 1.0 and 3.0. These are available in 14 languages that are covered by
 enough data sources.
 
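A usage note, not part of this commit: the 'large' wordlists described above can be selected explicitly. A minimal sketch, assuming the wordfreq 2.1 API in which available_languages() and word_frequency() accept a wordlist argument (as exercised by the tests below) and zipf_frequency() reports the Zipf scale the README mentions:

from wordfreq import available_languages, word_frequency, zipf_frequency

# The README now says 14 languages have a 'large' list; the test below only
# checks that there are at least 14.
print(len(available_languages('large')))

# 'infrequency' is a rare word that comes from the large English list
# (see test_freq_examples in tests/test_general.py below).
print(word_frequency('infrequency', 'en', wordlist='large'))

# Zipf scale: log10 of frequency per billion words; 'large' lists reach down to Zipf 1.0.
print(zipf_frequency('infrequency', 'en'))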
pytest.ini (new file, 2 lines)
@@ -0,0 +1,2 @@
+[pytest]
+addopts = --doctest-modules

(separate configuration file; header not shown in this extract)
@@ -1,5 +1,2 @@
-[nosetests]
-verbosity=2
-with-doctest=1
-with-coverage=0
-cover-package=wordfreq
+[aliases]
+test=pytest
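The new pytest configuration turns on --doctest-modules, which makes pytest import each module and run the examples embedded in its docstrings as tests, alongside the test files; the [aliases] entry is the conventional way to point "python setup.py test" at pytest. A sketch of the kind of docstring this option picks up (the function here is hypothetical, not part of wordfreq):

def zipf_to_freq(zipf):
    """
    Convert a Zipf-scale value to a plain word frequency
    (Zipf is the base-10 log of frequency per billion words).

    >>> zipf_to_freq(9.0)
    1.0
    """
    return 10.0 ** (zipf - 9.0)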
setup.py (4 lines changed)
@@ -35,7 +35,7 @@ if sys.version_info < (3, 4):
 
 setup(
     name="wordfreq",
-    version='2.0.1',
+    version='2.1.0',
     maintainer='Luminoso Technologies, Inc.',
     maintainer_email='info@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
@@ -60,5 +60,5 @@ setup(
         'mecab': 'mecab-python3',
         'jieba': 'jieba'
     },
-    tests_require=['mecab-python3', 'jieba'],
+    tests_require=['pytest', 'mecab-python3', 'jieba'],
 )
tests/test.py (deleted file, 235 lines)
@@ -1,235 +0,0 @@
-from wordfreq import (
-    word_frequency, available_languages, cB_to_freq,
-    top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize
-)
-from nose.tools import (
-    eq_, assert_almost_equal, assert_greater, raises, assert_not_equal
-)
-
-
-def test_freq_examples():
-    # Stopwords are most common in the correct language
-    assert_greater(word_frequency('the', 'en'),
-                   word_frequency('de', 'en'))
-
-    assert_greater(word_frequency('de', 'es'),
-                   word_frequency('the', 'es'))
-
-    # We get word frequencies from the 'large' list when available
-    assert_greater(word_frequency('infrequency', 'en'), 0.)
-
-
-def test_languages():
-    # Make sure we get all the languages when looking for the default
-    # 'best' wordlist
-    avail = available_languages()
-    assert_greater(len(avail), 32)
-
-    # 'small' covers the same languages, but with some different lists
-    avail_small = available_languages('small')
-    eq_(len(avail_small), len(avail))
-    assert_not_equal(avail_small, avail)
-
-    # 'combined' is the same as 'small'
-    avail_old_name = available_languages('combined')
-    eq_(avail_old_name, avail_small)
-
-    # 'large' covers fewer languages
-    avail_large = available_languages('large')
-    assert_greater(len(avail_large), 12)
-    assert_greater(len(avail), len(avail_large))
-
-    # Look up the digit '2' in the main word list for each language
-    for lang in avail:
-        assert_greater(word_frequency('2', lang), 0, lang)
-
-        # Make up a weirdly verbose language code and make sure
-        # we still get it
-        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
-        assert_greater(word_frequency('2', new_lang_code), 0, new_lang_code)
-
-
-def test_minimums():
-    eq_(word_frequency('esquivalience', 'en'), 0)
-    eq_(word_frequency('esquivalience', 'en', minimum=1e-6), 1e-6)
-    eq_(word_frequency('the', 'en', minimum=1), 1)
-
-
-def test_most_common_words():
-    # If something causes the most common words in well-supported languages to
-    # change, we should know.
-
-    def get_most_common(lang):
-        """
-        Return the single most common word in the language.
-        """
-        return top_n_list(lang, 1)[0]
-
-    eq_(get_most_common('ar'), 'في')
-    eq_(get_most_common('de'), 'die')
-    eq_(get_most_common('en'), 'the')
-    eq_(get_most_common('es'), 'de')
-    eq_(get_most_common('fr'), 'de')
-    eq_(get_most_common('it'), 'di')
-    eq_(get_most_common('ja'), 'の')
-    eq_(get_most_common('nl'), 'de')
-    eq_(get_most_common('pl'), 'w')
-    eq_(get_most_common('pt'), 'de')
-    eq_(get_most_common('ru'), 'в')
-    eq_(get_most_common('tr'), 'bir')
-    eq_(get_most_common('zh'), '的')
-
-
-def test_language_matching():
-    freq = word_frequency('的', 'zh')
-    eq_(word_frequency('的', 'zh-TW'), freq)
-    eq_(word_frequency('的', 'zh-CN'), freq)
-    eq_(word_frequency('的', 'zh-Hant'), freq)
-    eq_(word_frequency('的', 'zh-Hans'), freq)
-    eq_(word_frequency('的', 'yue-HK'), freq)
-    eq_(word_frequency('的', 'cmn'), freq)
-
-
-def test_cB_conversion():
-    eq_(cB_to_freq(0), 1.)
-    assert_almost_equal(cB_to_freq(-100), 0.1)
-    assert_almost_equal(cB_to_freq(-600), 1e-6)
-
-
-@raises(ValueError)
-def test_failed_cB_conversion():
-    cB_to_freq(1)
-
-
-def test_tokenization():
-    # We preserve apostrophes within words, so "can't" is a single word in the
-    # data
-    eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
-        ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
-
-    eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
-        ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
-
-    # Certain punctuation does not inherently split a word.
-    eq_(tokenize("Anything is possible at zombo.com", 'en'),
-        ['anything', 'is', 'possible', 'at', 'zombo.com'])
-
-    # Splits occur after symbols, and at splitting punctuation such as hyphens.
-    eq_(tokenize('😂test', 'en'), ['😂', 'test'])
-
-    eq_(tokenize("flip-flop", 'en'), ['flip', 'flop'])
-
-    eq_(tokenize('this text has... punctuation :)', 'en', include_punctuation=True),
-        ['this', 'text', 'has', '...', 'punctuation', ':)'])
-
-    # Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
-    # and 'David Bowie' stay together, because our Unicode segmentation algorithm
-    # is up to date
-    eq_(tokenize('emoji test 🧕🏽', 'en'), ['emoji', 'test', '🧕🏽'])
-
-    eq_(tokenize("👨🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en'),
-        ['👨🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
-         'nothing', 'i', 'can', 'do', '🌎', '🚀'])
-
-    # Water wave, surfer, flag of California (indicates ridiculously complete support
-    # for Unicode 10 and Emoji 5.0)
-    eq_(tokenize("Surf's up 🌊🏄🏴'",'en'),
-        ["surf's", "up", "🌊", "🏄", "🏴"])
-
-
-def test_casefolding():
-    eq_(tokenize('WEISS', 'de'), ['weiss'])
-    eq_(tokenize('weiß', 'de'), ['weiss'])
-    eq_(tokenize('İstanbul', 'tr'), ['istanbul'])
-    eq_(tokenize('SIKISINCA', 'tr'), ['sıkısınca'])
-
-
-def test_number_smashing():
-    eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'),
-        ['715', 'crσσks', 'by', 'bon', 'iver'])
-    eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'),
-        ['000', 'crσσks', 'by', 'bon', 'iver'])
-    eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True),
-        ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver'])
-    eq_(lossy_tokenize('1', 'en'), ['1'])
-    eq_(lossy_tokenize('3.14', 'en'), ['0.00'])
-    eq_(lossy_tokenize('24601', 'en'), ['00000'])
-    eq_(word_frequency('24601', 'en'), word_frequency('90210', 'en'))
-
-
-def test_phrase_freq():
-    ff = word_frequency("flip-flop", 'en')
-    assert_greater(ff, 0)
-    assert_almost_equal(
-        1.0 / ff,
-        1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
-    )
-
-
-def test_not_really_random():
-    # If your xkcd-style password comes out like this, maybe you shouldn't
-    # use it
-    eq_(random_words(nwords=4, lang='en', bits_per_word=0),
-        'the the the the')
-
-    # This not only tests random_ascii_words, it makes sure we didn't end
-    # up with 'eos' as a very common Japanese word
-    eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),
-        '1 1 1 1')
-
-
-@raises(ValueError)
-def test_not_enough_ascii():
-    random_ascii_words(lang='zh', bits_per_word=14)
-
-
-def test_arabic():
-    # Remove tatweels
-    eq_(
-        tokenize('متــــــــعب', 'ar'),
-        ['متعب']
-    )
-
-    # Remove combining marks
-    eq_(
-        tokenize('حَرَكَات', 'ar'),
-        ['حركات']
-    )
-
-    eq_(
-        tokenize('\ufefb', 'ar'),  # An Arabic ligature...
-        ['\u0644\u0627']  # ...that is affected by NFKC normalization
-    )
-
-
-def test_ideographic_fallback():
-    # Try tokenizing Chinese text as English -- it should remain stuck together.
-    #
-    # More complex examples like this, involving the multiple scripts of Japanese,
-    # are in test_japanese.py.
-    eq_(tokenize('中国文字', 'en'), ['中国文字'])
-
-
-def test_other_languages():
-    # Test that we leave Thai letters stuck together. If we had better Thai support,
-    # we would actually split this into a three-word phrase.
-    eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
-    eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
-        ['การเล่นดนตรี', 'means', 'playing', 'music'])
-
-    # Test Khmer, a script similar to Thai
-    eq_(tokenize('សូមស្វាគមន៍', 'km'), ['សូមស្វាគមន៍'])
-
-    # Test Hindi -- tokens split where there are spaces, and not where there aren't
-    eq_(tokenize('हिन्दी विक्षनरी', 'hi'), ['हिन्दी', 'विक्षनरी'])
-
-    # Remove vowel points in Hebrew
-    eq_(tokenize('דֻּגְמָה', 'he'), ['דגמה'])
-
-    # Deal with commas, cedillas, and I's in Turkish
-    eq_(tokenize('kișinin', 'tr'), ['kişinin'])
-    eq_(tokenize('KİȘİNİN', 'tr'), ['kişinin'])
-
-    # Deal with cedillas that should be commas-below in Romanian
-    eq_(tokenize('acelaşi', 'ro'), ['același'])
-    eq_(tokenize('ACELAŞI', 'ro'), ['același'])
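The file above is the old nose-based test suite; the rest of this diff rewrites the same checks for pytest (the new tests/test_general.py appears below). The translation is mechanical; a sketch of the correspondence used throughout, with illustrative values that are not taken from wordfreq:

import pytest

# nose: eq_(a, b)                  -> pytest: assert a == b
assert 2 + 2 == 4

# nose: assert_almost_equal(a, b)  -> pytest: assert a == pytest.approx(b)
assert 0.1 + 0.2 == pytest.approx(0.3)

# nose: @raises(ValueError)        -> pytest: with pytest.raises(ValueError):
with pytest.raises(ValueError):
    int('not a number')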
(next file; header not shown in this extract)
@@ -1,5 +1,5 @@
-from nose.tools import eq_, assert_almost_equal, assert_greater
 from wordfreq import tokenize, word_frequency
+import pytest
 
 
 def test_tokens():
@@ -17,64 +17,49 @@ def test_tokens():
     # His name breaks into five pieces, with the only piece staying together
     # being the one that means 'Bart'. The dot is not included as a token.
-    eq_(
-        tokenize(hobart, 'zh'),
-        ['加', '勒', '特', '霍', '巴特']
-    )
+    assert tokenize(hobart, 'zh') == ['加', '勒', '特', '霍', '巴特']
 
-    eq_(
-        tokenize(fact_simplified, 'zh'),
-        [
-            # he / is / history / in / #6 / counter for people
-            '他', '是', '历史', '上', '第六', '位',
-            # during / term of office / in / die
-            '在', '任期', '内', '去世',
-            # of / U.S. / deputy / president
-            '的', '美国', '副', '总统'
-        ]
-    )
+    assert tokenize(fact_simplified, 'zh') == [
+        # he / is / history / in / #6 / counter for people
+        '他', '是', '历史', '上', '第六', '位',
+        # during / term of office / in / die
+        '在', '任期', '内', '去世',
+        # of / U.S. / deputy / president
+        '的', '美国', '副', '总统'
+    ]
 
     # Jieba's original tokenizer knows a lot of names, it seems.
-    eq_(
-        tokenize(hobart, 'zh', external_wordlist=True),
-        ['加勒特', '霍巴特']
-    )
+    assert tokenize(hobart, 'zh', external_wordlist=True) == ['加勒特', '霍巴特']
 
     # We get almost the same tokens from the sentence using Jieba's own
     # wordlist, but it tokenizes "in history" as two words and
     # "sixth person" as one.
-    eq_(
-        tokenize(fact_simplified, 'zh', external_wordlist=True),
-        [
-            # he / is / history / in / sixth person
-            '他', '是', '历史', '上', '第六位',
-            # during / term of office / in / die
-            '在', '任期', '内', '去世',
-            # of / U.S. / deputy / president
-            '的', '美国', '副', '总统'
-        ]
-    )
+    assert tokenize(fact_simplified, 'zh', external_wordlist=True) == [
+        # he / is / history / in / sixth person
+        '他', '是', '历史', '上', '第六位',
+        # during / term of office / in / die
+        '在', '任期', '内', '去世',
+        # of / U.S. / deputy / president
+        '的', '美国', '副', '总统'
+    ]
 
     # Check that Traditional Chinese works at all
-    assert_greater(word_frequency(fact_traditional, 'zh'), 0)
+    assert word_frequency(fact_traditional, 'zh') > 0
 
     # You get the same token lengths if you look it up in Traditional Chinese,
     # but the words are different
     simp_tokens = tokenize(fact_simplified, 'zh', include_punctuation=True)
     trad_tokens = tokenize(fact_traditional, 'zh', include_punctuation=True)
-    eq_(''.join(simp_tokens), fact_simplified)
-    eq_(''.join(trad_tokens), fact_traditional)
+    assert ''.join(simp_tokens) == fact_simplified
+    assert ''.join(trad_tokens) == fact_traditional
     simp_lengths = [len(token) for token in simp_tokens]
    trad_lengths = [len(token) for token in trad_tokens]
-    eq_(simp_lengths, trad_lengths)
+    assert simp_lengths == trad_lengths
 
 
 def test_combination():
     xiexie_freq = word_frequency('谢谢', 'zh')  # "Thanks"
-    assert_almost_equal(
-        word_frequency('谢谢谢谢', 'zh'),
-        xiexie_freq / 20
-    )
+    assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20)
 
 
 def test_alternate_codes():
@@ -83,12 +68,12 @@ def test_alternate_codes():
     tokens = ['谢谢', '谢谢']
 
     # Code with a region attached
-    eq_(tokenize('谢谢谢谢', 'zh-CN'), tokens)
+    assert tokenize('谢谢谢谢', 'zh-CN') == tokens
 
     # Over-long codes for Chinese
-    eq_(tokenize('谢谢谢谢', 'chi'), tokens)
-    eq_(tokenize('谢谢谢谢', 'zho'), tokens)
+    assert tokenize('谢谢谢谢', 'chi') == tokens
+    assert tokenize('谢谢谢谢', 'zho') == tokens
 
     # Separate codes for Mandarin and Cantonese
-    eq_(tokenize('谢谢谢谢', 'cmn'), tokens)
-    eq_(tokenize('谢谢谢谢', 'yue'), tokens)
+    assert tokenize('谢谢谢谢', 'cmn') == tokens
+    assert tokenize('谢谢谢谢', 'yue') == tokens
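A usage note on the external_wordlist flag exercised above (a sketch, assuming the 'jieba' extra from setup.py is installed, since wordfreq tokenizes Chinese through Jieba):

from wordfreq import tokenize

# By default, Jieba is run with wordfreq's own Chinese frequency dictionary:
print(tokenize('谢谢谢谢', 'zh'))                          # ['谢谢', '谢谢']

# external_wordlist=True asks Jieba to use its bundled dictionary instead,
# which the tests above show can segment names and phrases differently.
print(tokenize('谢谢谢谢', 'zh', external_wordlist=True))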
(next file; header not shown in this extract)
@@ -1,40 +1,32 @@
-from nose.tools import eq_, assert_almost_equal
 from wordfreq import tokenize, word_frequency
 
 
 def test_apostrophes():
     # Test that we handle apostrophes in French reasonably.
-    eq_(tokenize("qu'un", 'fr'), ['qu', 'un'])
-    eq_(tokenize("qu'un", 'fr', include_punctuation=True),
-        ["qu'", "un"])
-    eq_(tokenize("langues d'oïl", 'fr'),
-        ['langues', "d", 'oïl'])
-    eq_(tokenize("langues d'oïl", 'fr', include_punctuation=True),
-        ['langues', "d'", 'oïl'])
-    eq_(tokenize("l'heure", 'fr'),
-        ['l', 'heure'])
-    eq_(tokenize("l'heure", 'fr', include_punctuation=True),
-        ["l'", 'heure'])
-    eq_(tokenize("L'Hôpital", 'fr', include_punctuation=True),
-        ["l'", 'hôpital'])
-    eq_(tokenize("aujourd'hui", 'fr'), ["aujourd'hui"])
-    eq_(tokenize("This isn't French", 'en'),
-        ['this', "isn't", 'french'])
+    assert tokenize("qu'un", 'fr') == ['qu', 'un']
+    assert tokenize("qu'un", 'fr', include_punctuation=True) == ["qu'", "un"]
+    assert tokenize("langues d'oïl", 'fr') == ['langues', "d", 'oïl']
+    assert tokenize("langues d'oïl", 'fr', include_punctuation=True) == ['langues', "d'", 'oïl']
+    assert tokenize("l'heure", 'fr') == ['l', 'heure']
+    assert tokenize("l'heure", 'fr', include_punctuation=True) == ["l'", 'heure']
+    assert tokenize("L'Hôpital", 'fr', include_punctuation=True) == ["l'", 'hôpital']
+    assert tokenize("aujourd'hui", 'fr') == ["aujourd'hui"]
+    assert tokenize("This isn't French", 'en') == ['this', "isn't", 'french']
 
 
 def test_catastrophes():
     # More apostrophes, but this time they're in Catalan, and there's other
     # mid-word punctuation going on too.
-    eq_(tokenize("M'acabo d'instal·lar.", 'ca'),
-        ['m', 'acabo', 'd', 'instal·lar'])
-    eq_(tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True),
-        ["m'", 'acabo', "d'", 'instal·lar', '.'])
+    assert tokenize("M'acabo d'instal·lar.", 'ca') == ['m', 'acabo', 'd', 'instal·lar']
+    assert (
+        tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True) ==
+        ["m'", 'acabo', "d'", 'instal·lar', '.']
+    )
 
 
 def test_alternate_codes():
     # Try over-long language codes for French and Catalan
-    eq_(tokenize("qu'un", 'fra'), ['qu', 'un'])
-    eq_(tokenize("qu'un", 'fre'), ['qu', 'un'])
-    eq_(tokenize("M'acabo d'instal·lar.", 'cat'),
-        ['m', 'acabo', 'd', 'instal·lar'])
+    assert tokenize("qu'un", 'fra') == ['qu', 'un']
+    assert tokenize("qu'un", 'fre') == ['qu', 'un']
+    assert tokenize("M'acabo d'instal·lar.", 'cat') == ['m', 'acabo', 'd', 'instal·lar']
tests/test_general.py (new file, 223 lines)
@@ -0,0 +1,223 @@
+from wordfreq import (
+    word_frequency, available_languages, cB_to_freq,
+    top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize
+)
+import pytest
+
+
+def test_freq_examples():
+    # Stopwords are most common in the correct language
+    assert word_frequency('the', 'en') > word_frequency('de', 'en')
+    assert word_frequency('de', 'es') > word_frequency('the', 'es')
+    # We get word frequencies from the 'large' list when available
+    assert word_frequency('infrequency', 'en') > 0.
+
+
+def test_languages():
+    # Make sure we get all the languages when looking for the default
+    # 'best' wordlist
+    avail = available_languages()
+    assert len(avail) >= 34
+
+    # 'small' covers the same languages, but with some different lists
+    avail_small = available_languages('small')
+    assert len(avail_small) == len(avail)
+    assert avail_small != avail
+
+    # 'combined' is the same as 'small'
+    avail_old_name = available_languages('combined')
+    assert avail_old_name == avail_small
+
+    # 'large' covers fewer languages
+    avail_large = available_languages('large')
+    assert len(avail_large) >= 14
+    assert len(avail) > len(avail_large)
+
+    # Look up the digit '2' in the main word list for each language
+    for lang in avail:
+        assert word_frequency('2', lang) > 0
+
+        # Make up a weirdly verbose language code and make sure
+        # we still get it
+        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
+        assert word_frequency('2', new_lang_code) > 0
+
+
+def test_minimums():
+    assert word_frequency('esquivalience', 'en') == 0
+    assert word_frequency('esquivalience', 'en', minimum=1e-6) == 1e-6
+    assert word_frequency('the', 'en', minimum=1) == 1
+
+
+def test_most_common_words():
+    # If something causes the most common words in well-supported languages to
+    # change, we should know.
+
+    def get_most_common(lang):
+        """
+        Return the single most common word in the language.
+        """
+        return top_n_list(lang, 1)[0]
+
+    assert get_most_common('ar') == 'في'
+    assert get_most_common('cs') == 'a'
+    assert get_most_common('de') == 'die'
+    assert get_most_common('en') == 'the'
+    assert get_most_common('es') == 'de'
+    assert get_most_common('fr') == 'de'
+    assert get_most_common('it') == 'di'
+    assert get_most_common('ja') == 'の'
+    assert get_most_common('nl') == 'de'
+    assert get_most_common('pl') == 'w'
+    assert get_most_common('pt') == 'de'
+    assert get_most_common('ru') == 'в'
+    assert get_most_common('tr') == 'bir'
+    assert get_most_common('zh') == '的'
+
+
+def test_language_matching():
+    freq = word_frequency('的', 'zh')
+    assert word_frequency('的', 'zh-TW') == freq
+    assert word_frequency('的', 'zh-CN') == freq
+    assert word_frequency('的', 'zh-Hant') == freq
+    assert word_frequency('的', 'zh-Hans') == freq
+    assert word_frequency('的', 'yue-HK') == freq
+    assert word_frequency('的', 'cmn') == freq
+
+
+def test_cB_conversion():
+    assert cB_to_freq(0) == 1.
+    assert cB_to_freq(-100) == pytest.approx(0.1)
+    assert cB_to_freq(-600) == pytest.approx(1e-6)
+
+
+def test_failed_cB_conversion():
+    with pytest.raises(ValueError):
+        cB_to_freq(1)
+
+
+def test_tokenization():
+    # We preserve apostrophes within words, so "can't" is a single word in the
+    # data
+    assert (
+        tokenize("I don't split at apostrophes, you see.", 'en')
+        == ['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see']
+    )
+
+    assert (
+        tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True)
+        == ['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.']
+    )
+
+    # Certain punctuation does not inherently split a word.
+    assert (
+        tokenize("Anything is possible at zombo.com", 'en')
+        == ['anything', 'is', 'possible', 'at', 'zombo.com']
+    )
+
+    # Splits occur after symbols, and at splitting punctuation such as hyphens.
+    assert tokenize('😂test', 'en') == ['😂', 'test']
+    assert tokenize("flip-flop", 'en') == ['flip', 'flop']
+    assert (
+        tokenize('this text has... punctuation :)', 'en', include_punctuation=True)
+        == ['this', 'text', 'has', '...', 'punctuation', ':)']
+    )
+
+    # Multi-codepoint emoji sequences such as 'medium-skinned woman with headscarf'
+    # and 'David Bowie' stay together, because our Unicode segmentation algorithm
+    # is up to date
+    assert tokenize('emoji test 🧕🏽', 'en') == ['emoji', 'test', '🧕🏽']
+    assert (
+        tokenize("👨🎤 Planet Earth is blue, and there's nothing I can do 🌎🚀", 'en')
+        == ['👨🎤', 'planet', 'earth', 'is', 'blue', 'and', "there's",
+            'nothing', 'i', 'can', 'do', '🌎', '🚀']
+    )
+
+    # Water wave, surfer, flag of California (indicates ridiculously complete support
+    # for Unicode 10 and Emoji 5.0)
+    assert tokenize("Surf's up 🌊🏄🏴'",'en') == ["surf's", "up", "🌊", "🏄", "🏴"]
+
+
+def test_casefolding():
+    assert tokenize('WEISS', 'de') == ['weiss']
+    assert tokenize('weiß', 'de') == ['weiss']
+    assert tokenize('İstanbul', 'tr') == ['istanbul']
+    assert tokenize('SIKISINCA', 'tr') == ['sıkısınca']
+
+
+def test_number_smashing():
+    assert tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['715', 'crσσks', 'by', 'bon', 'iver']
+    assert lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['000', 'crσσks', 'by', 'bon', 'iver']
+    assert (
+        lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True)
+        == ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver']
+    )
+    assert lossy_tokenize('1', 'en') == ['1']
+    assert lossy_tokenize('3.14', 'en') == ['0.00']
+    assert lossy_tokenize('24601', 'en') == ['00000']
+    assert word_frequency('24601', 'en') == word_frequency('90210', 'en')
+
+
+def test_phrase_freq():
+    ff = word_frequency("flip-flop", 'en')
+    assert ff > 0
+    phrase_freq = 1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
+    assert 1.0 / ff == pytest.approx(phrase_freq)
+
+
+def test_not_really_random():
+    # If your xkcd-style password comes out like this, maybe you shouldn't
+    # use it
+    assert random_words(nwords=4, lang='en', bits_per_word=0) == 'the the the the'
+
+    # This not only tests random_ascii_words, it makes sure we didn't end
+    # up with 'eos' as a very common Japanese word
+    assert random_ascii_words(nwords=4, lang='ja', bits_per_word=0) == '00 00 00 00'
+
+
+def test_not_enough_ascii():
+    with pytest.raises(ValueError):
+        random_ascii_words(lang='zh', bits_per_word=16)
+
+
+def test_arabic():
+    # Remove tatweels
+    assert tokenize('متــــــــعب', 'ar') == ['متعب']
+
+    # Remove combining marks
+    assert tokenize('حَرَكَات', 'ar') == ['حركات']
+
+    # An Arabic ligature that is affected by NFKC normalization
+    assert tokenize('\ufefb', 'ar') == ['\u0644\u0627']
+
+
+def test_ideographic_fallback():
+    # Try tokenizing Chinese text as English -- it should remain stuck together.
+    #
+    # More complex examples like this, involving the multiple scripts of Japanese,
+    # are in test_japanese.py.
+    assert tokenize('中国文字', 'en') == ['中国文字']
+
+
+def test_other_languages():
+    # Test that we leave Thai letters stuck together. If we had better Thai support,
+    # we would actually split this into a three-word phrase.
+    assert tokenize('การเล่นดนตรี', 'th') == ['การเล่นดนตรี']
+    assert tokenize('"การเล่นดนตรี" means "playing music"', 'en') == ['การเล่นดนตรี', 'means', 'playing', 'music']
+
+    # Test Khmer, a script similar to Thai
+    assert tokenize('សូមស្វាគមន៍', 'km') == ['សូមស្វាគមន៍']
+
+    # Test Hindi -- tokens split where there are spaces, and not where there aren't
+    assert tokenize('हिन्दी विक्षनरी', 'hi') == ['हिन्दी', 'विक्षनरी']
+
+    # Remove vowel points in Hebrew
+    assert tokenize('דֻּגְמָה', 'he') == ['דגמה']
+
+    # Deal with commas, cedillas, and I's in Turkish
+    assert tokenize('kișinin', 'tr') == ['kişinin']
+    assert tokenize('KİȘİNİN', 'tr') == ['kişinin']
+
+    # Deal with cedillas that should be commas-below in Romanian
+    assert tokenize('acelaşi', 'ro') == ['același']
+    assert tokenize('ACELAŞI', 'ro') == ['același']
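One relationship worth spelling out from test_phrase_freq above: when a query such as "flip-flop" is split into several tokens, wordfreq combines the token frequencies so that their reciprocals add (a half harmonic mean for two tokens). A small sketch of exactly the identity the test asserts:

from wordfreq import word_frequency

f_flip = word_frequency('flip', 'en')
f_flop = word_frequency('flop', 'en')
f_phrase = word_frequency('flip-flop', 'en')

# test_phrase_freq asserts (up to floating-point tolerance):
#   1 / f_phrase == 1 / f_flip + 1 / f_flop
combined = 1.0 / (1.0 / f_flip + 1.0 / f_flop)
print(f_phrase, combined)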
(next file; header not shown in this extract)
@@ -1,10 +1,9 @@
-from nose.tools import eq_, assert_almost_equal
 from wordfreq import tokenize, simple_tokenize, word_frequency
+import pytest
 
 
 def test_tokens():
-    eq_(tokenize('おはようございます', 'ja'),
-        ['おはよう', 'ござい', 'ます'])
+    assert tokenize('おはようございます', 'ja') == ['おはよう', 'ござい', 'ます']
 
 
 def test_simple_tokenize():
@@ -19,31 +18,29 @@ def test_simple_tokenize():
     # We used to try to infer word boundaries between hiragana and katakana,
     # but this leads to edge cases that are unsolvable without a dictionary.
     ja_text = 'ひらがなカタカナromaji'
-    eq_(
-        simple_tokenize(ja_text),
-        ['ひらがなカタカナ', 'romaji']
-    )
+    assert simple_tokenize(ja_text) == ['ひらがなカタカナ', 'romaji']
 
     # An example that would be multiple tokens if tokenized as 'ja' via MeCab,
     # but sticks together in simple_tokenize
-    eq_(simple_tokenize('おはようございます'), ['おはようございます'])
+    assert simple_tokenize('おはようございます') == ['おはようございます']
 
     # Names that use the weird possessive marker ヶ, which is technically a
     # katakana even though it's being used like a kanji, stay together as one
     # token
-    eq_(simple_tokenize("犬ヶ島"), ["犬ヶ島"])
+    assert simple_tokenize("犬ヶ島") == ["犬ヶ島"]
 
     # The word in ConceptNet that made me notice that simple_tokenize used
     # to have a problem with the character 々
-    eq_(simple_tokenize("晴々しい"), ["晴々しい"])
+    assert simple_tokenize("晴々しい") == ["晴々しい"]
 
     # Explicit word separators are still token boundaries, such as the dot
     # between "toner" and "cartridge" in "toner cartridge"
-    eq_(simple_tokenize("トナー・カートリッジ"), ["トナー", "カートリッジ"])
+    assert simple_tokenize("トナー・カートリッジ") == ["トナー", "カートリッジ"]
 
     # This word has multiple weird characters that aren't quite kanji in it,
     # and is in the dictionary
-    eq_(simple_tokenize("見ヶ〆料"), ["見ヶ〆料"])
+    assert simple_tokenize("見ヶ〆料") == ["見ヶ〆料"]
 
 
@@ -52,12 +49,11 @@ def test_combination():
     gozai_freq = word_frequency('ござい', 'ja')
     masu_freq = word_frequency('ます', 'ja')
 
-    assert_almost_equal(
-        word_frequency('おはようおはよう', 'ja'),
-        ohayou_freq / 2
-    )
-    assert_almost_equal(
-        1.0 / word_frequency('おはようございます', 'ja'),
-        1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
+    assert word_frequency('おはようおはよう', 'ja') == pytest.approx(ohayou_freq / 2)
+
+    assert (
+        1.0 / word_frequency('おはようございます', 'ja') ==
+        pytest.approx(1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq)
     )
(next file; header not shown in this extract)
@@ -1,22 +1,18 @@
-from nose.tools import eq_, assert_almost_equal
 from wordfreq import tokenize, word_frequency
+import pytest
 
 
 def test_tokens():
-    eq_(tokenize('감사합니다', 'ko'),
-        ['감사', '합니다'])
+    assert tokenize('감사합니다', 'ko') == ['감사', '합니다']
 
 
 def test_combination():
     gamsa_freq = word_frequency('감사', 'ko')
     habnida_freq = word_frequency('합니다', 'ko')
 
-    assert_almost_equal(
-        word_frequency('감사감사', 'ko'),
-        gamsa_freq / 2
-    )
-    assert_almost_equal(
-        1.0 / word_frequency('감사합니다', 'ko'),
-        1.0 / gamsa_freq + 1.0 / habnida_freq
+    assert word_frequency('감사감사', 'ko') == pytest.approx(gamsa_freq / 2)
+    assert (
+        1.0 / word_frequency('감사합니다', 'ko') ==
+        pytest.approx(1.0 / gamsa_freq + 1.0 / habnida_freq)
     )
(next file; header not shown in this extract)
@@ -1,15 +1,18 @@
-from nose.tools import eq_
 from wordfreq import tokenize
 from wordfreq.preprocess import preprocess_text
 
 
 def test_transliteration():
     # "Well, there's a lot of things you do not understand."
-    # (from somewhere in OpenSubtitles)
-    eq_(tokenize("Па, има ту много ствари које не схваташ.", 'sr'),
-        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
-    eq_(tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr'),
-        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
+    # (from somewhere in OpenSubtitles
+    assert (
+        tokenize("Па, има ту много ствари које не схваташ.", 'sr') ==
+        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
+    )
+    assert (
+        tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr') ==
+        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
+    )
 
     # I don't have examples of complete sentences in Azerbaijani that are
     # naturally in Cyrillic, because it turns out everyone writes Azerbaijani
@@ -17,14 +20,14 @@ def test_transliteration():
     # So here are some individual words.
 
     # 'library' in Azerbaijani Cyrillic
-    eq_(preprocess_text('китабхана', 'az'), 'kitabxana')
-    eq_(preprocess_text('КИТАБХАНА', 'az'), 'kitabxana')
-    eq_(preprocess_text('KİTABXANA', 'az'), 'kitabxana')
+    assert preprocess_text('китабхана', 'az') == 'kitabxana'
+    assert preprocess_text('КИТАБХАНА', 'az') == 'kitabxana'
+    assert preprocess_text('KİTABXANA', 'az') == 'kitabxana'
 
     # 'scream' in Azerbaijani Cyrillic
-    eq_(preprocess_text('бағырты', 'az'), 'bağırtı')
-    eq_(preprocess_text('БАҒЫРТЫ', 'az'), 'bağırtı')
-    eq_(preprocess_text('BAĞIRTI', 'az'), 'bağırtı')
+    assert preprocess_text('бағырты', 'az') == 'bağırtı'
+    assert preprocess_text('БАҒЫРТЫ', 'az') == 'bağırtı'
+    assert preprocess_text('BAĞIRTI', 'az') == 'bağırtı'
 
 
 def test_actually_russian():
@@ -35,15 +38,13 @@ def test_actually_russian():
     # We make sure to handle this case so we don't end up with a mixed-script
     # word like "pacanы".
 
-    eq_(tokenize("сто из ста, пацаны!", 'sr'),
-        ['sto', 'iz', 'sta', 'pacany'])
-
-    eq_(tokenize("культуры", 'sr'), ["kul'tury"])
+    assert tokenize("сто из ста, пацаны!", 'sr') == ['sto', 'iz', 'sta', 'pacany']
+    assert tokenize("культуры", 'sr') == ["kul'tury"]
 
 
 def test_alternate_codes():
     # Try language codes for Serbo-Croatian that have been split, and now
     # are canonically mapped to Serbian
-    eq_(tokenize("культуры", 'sh'), ["kul'tury"])
-    eq_(tokenize("культуры", 'hbs'), ["kul'tury"])
+    assert tokenize("культуры", 'sh') == ["kul'tury"]
+    assert tokenize("культуры", 'hbs') == ["kul'tury"]
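A brief usage note on the preprocessing exercised above (a sketch of the behavior these tests assert, not additional API): wordfreq.preprocess.preprocess_text transliterates and casefolds text before lookup, so Cyrillic Serbian and Azerbaijani map onto the Latin-alphabet wordlists.

from wordfreq import tokenize
from wordfreq.preprocess import preprocess_text

# Russian text tagged as Serbian is transliterated rather than left in Cyrillic,
# avoiding mixed-script tokens like "pacanы".
print(tokenize("культуры", 'sr'))          # ["kul'tury"]

# Azerbaijani Cyrillic is transliterated and casefolded the same way.
print(preprocess_text('КИТАБХАНА', 'az'))  # 'kitabxana'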
(File diff suppressed because it is too large.)
BIN  wordfreq/data/large_cs.msgpack.gz (new file; binary file not shown)
BIN  wordfreq/data/small_lv.msgpack.gz (new file; binary file not shown)
(The remaining changed binary data files are not shown.)