import new wordlists from Exquisite Corpus

This commit is contained in:
Robyn Speer 2017-01-05 17:59:26 -05:00
parent de32a15b4f
commit 7dc3f03ebd
76 changed files with 25520 additions and 24548 deletions

View File

@ -22,16 +22,19 @@ def test_freq_examples():
LAUGHTER_WORDS = { LAUGHTER_WORDS = {
'en': 'lol', 'en': 'lol',
'hi': 'lol', 'hi': 'lol',
'cs': 'lol',
'ru': 'лол', 'ru': 'лол',
'zh': '', 'zh': '',
'ja': '', 'ja': '',
'ar': '', 'ar': '',
'fa': 'خخخخ',
'ca': 'jaja', 'ca': 'jaja',
'es': 'jaja', 'es': 'jaja',
'fr': 'ptdr', 'fr': 'ptdr',
'pt': 'kkkk', 'pt': 'kkkk',
'he': 'חחח', 'he': 'חחח',
'bg': 'xaxa', 'bg': 'ахаха',
'uk': 'хаха',
} }
@ -77,7 +80,7 @@ def test_most_common_words():
""" """
return top_n_list(lang, 1)[0] return top_n_list(lang, 1)[0]
eq_(get_most_common('ar'), 'من') eq_(get_most_common('ar'), 'في')
eq_(get_most_common('de'), 'die') eq_(get_most_common('de'), 'die')
eq_(get_most_common('en'), 'the') eq_(get_most_common('en'), 'the')
eq_(get_most_common('es'), 'de') eq_(get_most_common('es'), 'de')
@ -85,6 +88,7 @@ def test_most_common_words():
eq_(get_most_common('it'), 'di') eq_(get_most_common('it'), 'di')
eq_(get_most_common('ja'), '') eq_(get_most_common('ja'), '')
eq_(get_most_common('nl'), 'de') eq_(get_most_common('nl'), 'de')
eq_(get_most_common('pl'), 'w')
eq_(get_most_common('pt'), 'de') eq_(get_most_common('pt'), 'de')
eq_(get_most_common('ru'), 'в') eq_(get_most_common('ru'), 'в')
eq_(get_most_common('tr'), 'bir') eq_(get_most_common('tr'), 'bir')
@ -141,6 +145,12 @@ def test_casefolding():
eq_(tokenize('SIKISINCA', 'tr'), ['sıkısınca']) eq_(tokenize('SIKISINCA', 'tr'), ['sıkısınca'])
def test_number_smashing():
eq_(tokenize('1', 'en'), ['1'])
eq_(tokenize('3.14', 'en'), ['0.00'])
eq_(tokenize('24601', 'en'), ['00000'])
def test_phrase_freq(): def test_phrase_freq():
ff = word_frequency("flip-flop", 'en') ff = word_frequency("flip-flop", 'en')
assert_greater(ff, 0) assert_greater(ff, 0)
@ -159,7 +169,7 @@ def test_not_really_random():
# This not only tests random_ascii_words, it makes sure we didn't end # This not only tests random_ascii_words, it makes sure we didn't end
# up with 'eos' as a very common Japanese word # up with 'eos' as a very common Japanese word
eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0), eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),
'1 1 1 1') '00 00 00 00')
@raises(ValueError) @raises(ValueError)

View File

@ -25,8 +25,8 @@ def test_tokens():
eq_( eq_(
tokenize(fact_simplified, 'zh'), tokenize(fact_simplified, 'zh'),
[ [
# he / is / in history / #6 / counter for people # he / is / history / in / #6 / counter for people
'', '', '历史', '第六', '', '', '', '历史', '', '第六', '',
# during / term of office / in / die # during / term of office / in / die
'', '任期', '', '去世', '', '任期', '', '去世',
# of / U.S. / deputy / president # of / U.S. / deputy / president

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.