mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
import new wordlists from Exquisite Corpus
This commit is contained in:
parent
de32a15b4f
commit
7dc3f03ebd
@ -22,16 +22,19 @@ def test_freq_examples():
|
|||||||
LAUGHTER_WORDS = {
|
LAUGHTER_WORDS = {
|
||||||
'en': 'lol',
|
'en': 'lol',
|
||||||
'hi': 'lol',
|
'hi': 'lol',
|
||||||
|
'cs': 'lol',
|
||||||
'ru': 'лол',
|
'ru': 'лол',
|
||||||
'zh': '笑',
|
'zh': '笑',
|
||||||
'ja': '笑',
|
'ja': '笑',
|
||||||
'ar': 'ﻪﻬﻬﻬﻫ',
|
'ar': 'ﻪﻬﻬﻬﻫ',
|
||||||
|
'fa': 'خخخخ',
|
||||||
'ca': 'jaja',
|
'ca': 'jaja',
|
||||||
'es': 'jaja',
|
'es': 'jaja',
|
||||||
'fr': 'ptdr',
|
'fr': 'ptdr',
|
||||||
'pt': 'kkkk',
|
'pt': 'kkkk',
|
||||||
'he': 'חחח',
|
'he': 'חחח',
|
||||||
'bg': 'xaxa',
|
'bg': 'ахаха',
|
||||||
|
'uk': 'хаха',
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -77,7 +80,7 @@ def test_most_common_words():
|
|||||||
"""
|
"""
|
||||||
return top_n_list(lang, 1)[0]
|
return top_n_list(lang, 1)[0]
|
||||||
|
|
||||||
eq_(get_most_common('ar'), 'من')
|
eq_(get_most_common('ar'), 'في')
|
||||||
eq_(get_most_common('de'), 'die')
|
eq_(get_most_common('de'), 'die')
|
||||||
eq_(get_most_common('en'), 'the')
|
eq_(get_most_common('en'), 'the')
|
||||||
eq_(get_most_common('es'), 'de')
|
eq_(get_most_common('es'), 'de')
|
||||||
@ -85,6 +88,7 @@ def test_most_common_words():
|
|||||||
eq_(get_most_common('it'), 'di')
|
eq_(get_most_common('it'), 'di')
|
||||||
eq_(get_most_common('ja'), 'の')
|
eq_(get_most_common('ja'), 'の')
|
||||||
eq_(get_most_common('nl'), 'de')
|
eq_(get_most_common('nl'), 'de')
|
||||||
|
eq_(get_most_common('pl'), 'w')
|
||||||
eq_(get_most_common('pt'), 'de')
|
eq_(get_most_common('pt'), 'de')
|
||||||
eq_(get_most_common('ru'), 'в')
|
eq_(get_most_common('ru'), 'в')
|
||||||
eq_(get_most_common('tr'), 'bir')
|
eq_(get_most_common('tr'), 'bir')
|
||||||
@ -141,6 +145,12 @@ def test_casefolding():
|
|||||||
eq_(tokenize('SIKISINCA', 'tr'), ['sıkısınca'])
|
eq_(tokenize('SIKISINCA', 'tr'), ['sıkısınca'])
|
||||||
|
|
||||||
|
|
||||||
|
def test_number_smashing():
|
||||||
|
eq_(tokenize('1', 'en'), ['1'])
|
||||||
|
eq_(tokenize('3.14', 'en'), ['0.00'])
|
||||||
|
eq_(tokenize('24601', 'en'), ['00000'])
|
||||||
|
|
||||||
|
|
||||||
def test_phrase_freq():
|
def test_phrase_freq():
|
||||||
ff = word_frequency("flip-flop", 'en')
|
ff = word_frequency("flip-flop", 'en')
|
||||||
assert_greater(ff, 0)
|
assert_greater(ff, 0)
|
||||||
@ -159,7 +169,7 @@ def test_not_really_random():
|
|||||||
# This not only tests random_ascii_words, it makes sure we didn't end
|
# This not only tests random_ascii_words, it makes sure we didn't end
|
||||||
# up with 'eos' as a very common Japanese word
|
# up with 'eos' as a very common Japanese word
|
||||||
eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),
|
eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),
|
||||||
'1 1 1 1')
|
'00 00 00 00')
|
||||||
|
|
||||||
|
|
||||||
@raises(ValueError)
|
@raises(ValueError)
|
||||||
|
@ -25,8 +25,8 @@ def test_tokens():
|
|||||||
eq_(
|
eq_(
|
||||||
tokenize(fact_simplified, 'zh'),
|
tokenize(fact_simplified, 'zh'),
|
||||||
[
|
[
|
||||||
# he / is / in history / #6 / counter for people
|
# he / is / history / in / #6 / counter for people
|
||||||
'他', '是', '历史上', '第六', '位',
|
'他', '是', '历史', '上', '第六', '位',
|
||||||
# during / term of office / in / die
|
# during / term of office / in / die
|
||||||
'在', '任期', '内', '去世',
|
'在', '任期', '内', '去世',
|
||||||
# of / U.S. / deputy / president
|
# of / U.S. / deputy / president
|
||||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/combined_cs.msgpack.gz
Normal file
BIN
wordfreq/data/combined_cs.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/combined_fa.msgpack.gz
Normal file
BIN
wordfreq/data/combined_fa.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/combined_sh.msgpack.gz
Normal file
BIN
wordfreq/data/combined_sh.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/combined_uk.msgpack.gz
Normal file
BIN
wordfreq/data/combined_uk.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/large_fi.msgpack.gz
Normal file
BIN
wordfreq/data/large_fi.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/large_ja.msgpack.gz
Normal file
BIN
wordfreq/data/large_ja.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/large_pl.msgpack.gz
Normal file
BIN
wordfreq/data/large_pl.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/large_zh.msgpack.gz
Normal file
BIN
wordfreq/data/large_zh.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/twitter_bg.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_bg.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/twitter_cs.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_cs.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/twitter_da.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_da.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/twitter_fa.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_fa.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/twitter_fi.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_fi.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/twitter_hu.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_hu.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/twitter_nb.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_nb.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/twitter_ro.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_ro.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/twitter_sh.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_sh.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/twitter_uk.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_uk.msgpack.gz
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user