mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-24 09:51:38 +00:00
import new wordlists from Exquisite Corpus
This commit is contained in:
parent
847b85c5b8
commit
f671a1db7f
@ -22,16 +22,19 @@ def test_freq_examples():
|
||||
LAUGHTER_WORDS = {
|
||||
'en': 'lol',
|
||||
'hi': 'lol',
|
||||
'cs': 'lol',
|
||||
'ru': 'лол',
|
||||
'zh': '笑',
|
||||
'ja': '笑',
|
||||
'ar': 'ﻪﻬﻬﻬﻫ',
|
||||
'fa': 'خخخخ',
|
||||
'ca': 'jaja',
|
||||
'es': 'jaja',
|
||||
'fr': 'ptdr',
|
||||
'pt': 'kkkk',
|
||||
'he': 'חחח',
|
||||
'bg': 'xaxa',
|
||||
'bg': 'ахаха',
|
||||
'uk': 'хаха',
|
||||
}
|
||||
|
||||
|
||||
@ -77,7 +80,7 @@ def test_most_common_words():
|
||||
"""
|
||||
return top_n_list(lang, 1)[0]
|
||||
|
||||
eq_(get_most_common('ar'), 'من')
|
||||
eq_(get_most_common('ar'), 'في')
|
||||
eq_(get_most_common('de'), 'die')
|
||||
eq_(get_most_common('en'), 'the')
|
||||
eq_(get_most_common('es'), 'de')
|
||||
@ -85,6 +88,7 @@ def test_most_common_words():
|
||||
eq_(get_most_common('it'), 'di')
|
||||
eq_(get_most_common('ja'), 'の')
|
||||
eq_(get_most_common('nl'), 'de')
|
||||
eq_(get_most_common('pl'), 'w')
|
||||
eq_(get_most_common('pt'), 'de')
|
||||
eq_(get_most_common('ru'), 'в')
|
||||
eq_(get_most_common('tr'), 'bir')
|
||||
@ -141,6 +145,12 @@ def test_casefolding():
|
||||
eq_(tokenize('SIKISINCA', 'tr'), ['sıkısınca'])
|
||||
|
||||
|
||||
def test_number_smashing():
|
||||
eq_(tokenize('1', 'en'), ['1'])
|
||||
eq_(tokenize('3.14', 'en'), ['0.00'])
|
||||
eq_(tokenize('24601', 'en'), ['00000'])
|
||||
|
||||
|
||||
def test_phrase_freq():
|
||||
ff = word_frequency("flip-flop", 'en')
|
||||
assert_greater(ff, 0)
|
||||
@ -159,7 +169,7 @@ def test_not_really_random():
|
||||
# This not only tests random_ascii_words, it makes sure we didn't end
|
||||
# up with 'eos' as a very common Japanese word
|
||||
eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),
|
||||
'1 1 1 1')
|
||||
'00 00 00 00')
|
||||
|
||||
|
||||
@raises(ValueError)
|
||||
|
@ -25,8 +25,8 @@ def test_tokens():
|
||||
eq_(
|
||||
tokenize(fact_simplified, 'zh'),
|
||||
[
|
||||
# he / is / in history / #6 / counter for people
|
||||
'他', '是', '历史上', '第六', '位',
|
||||
# he / is / history / in / #6 / counter for people
|
||||
'他', '是', '历史', '上', '第六', '位',
|
||||
# during / term of office / in / die
|
||||
'在', '任期', '内', '去世',
|
||||
# of / U.S. / deputy / president
|
||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/combined_cs.msgpack.gz
Normal file
BIN
wordfreq/data/combined_cs.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/combined_fa.msgpack.gz
Normal file
BIN
wordfreq/data/combined_fa.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/combined_sh.msgpack.gz
Normal file
BIN
wordfreq/data/combined_sh.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/combined_uk.msgpack.gz
Normal file
BIN
wordfreq/data/combined_uk.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/large_fi.msgpack.gz
Normal file
BIN
wordfreq/data/large_fi.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/large_ja.msgpack.gz
Normal file
BIN
wordfreq/data/large_ja.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/large_pl.msgpack.gz
Normal file
BIN
wordfreq/data/large_pl.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/large_zh.msgpack.gz
Normal file
BIN
wordfreq/data/large_zh.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/twitter_bg.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_bg.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/twitter_cs.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_cs.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/twitter_da.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_da.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/twitter_fa.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_fa.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/twitter_fi.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_fi.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/twitter_hu.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_hu.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/twitter_nb.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_nb.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/twitter_ro.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_ro.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/twitter_sh.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_sh.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/twitter_uk.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_uk.msgpack.gz
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user