import new wordlists from Exquisite Corpus

This commit is contained in:
Rob Speer 2017-01-05 17:59:26 -05:00
parent 847b85c5b8
commit f671a1db7f
76 changed files with 25520 additions and 24548 deletions

View File

@ -22,16 +22,19 @@ def test_freq_examples():
LAUGHTER_WORDS = {
'en': 'lol',
'hi': 'lol',
'cs': 'lol',
'ru': 'лол',
'zh': '笑',
'ja': '笑',
'ar': 'ﻪﻬﻬﻬﻫ',
'fa': 'خخخخ',
'ca': 'jaja',
'es': 'jaja',
'fr': 'ptdr',
'pt': 'kkkk',
'he': 'חחח',
'bg': 'xaxa',
'bg': 'ахаха',
'uk': 'хаха',
}
@ -77,7 +80,7 @@ def test_most_common_words():
"""
return top_n_list(lang, 1)[0]
eq_(get_most_common('ar'), 'من')
eq_(get_most_common('ar'), 'في')
eq_(get_most_common('de'), 'die')
eq_(get_most_common('en'), 'the')
eq_(get_most_common('es'), 'de')
@ -85,6 +88,7 @@ def test_most_common_words():
eq_(get_most_common('it'), 'di')
eq_(get_most_common('ja'), 'の')
eq_(get_most_common('nl'), 'de')
eq_(get_most_common('pl'), 'w')
eq_(get_most_common('pt'), 'de')
eq_(get_most_common('ru'), 'в')
eq_(get_most_common('tr'), 'bir')
@ -141,6 +145,12 @@ def test_casefolding():
eq_(tokenize('SIKISINCA', 'tr'), ['sıkısınca'])
def test_number_smashing():
eq_(tokenize('1', 'en'), ['1'])
eq_(tokenize('3.14', 'en'), ['0.00'])
eq_(tokenize('24601', 'en'), ['00000'])
def test_phrase_freq():
ff = word_frequency("flip-flop", 'en')
assert_greater(ff, 0)
@ -159,7 +169,7 @@ def test_not_really_random():
# This not only tests random_ascii_words, it makes sure we didn't end
# up with 'eos' as a very common Japanese word
eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),
'1 1 1 1')
'00 00 00 00')
@raises(ValueError)

View File

@ -25,8 +25,8 @@ def test_tokens():
eq_(
tokenize(fact_simplified, 'zh'),
[
# he / is / in history / #6 / counter for people
'他', '是', '历史上', '第六', '位',
# he / is / history / in / #6 / counter for people
'他', '是', '历史', '上', '第六', '位',
# during / term of office / in / die
'在', '任期', '内', '去世',
# of / U.S. / deputy / president

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.