mirror of
https://github.com/rspeer/wordfreq.git
synced 2025-01-13 20:56:00 +00:00
update data and tests for 2.5
This commit is contained in:
parent
ed23bf3ebe
commit
ec48c0a123
README.mdsetup.py
tests
wordfreq/data
jieba_zh.txtlarge_ar.msgpack.gzlarge_bn.msgpack.gzlarge_ca.msgpack.gzlarge_cs.msgpack.gzlarge_de.msgpack.gzlarge_en.msgpack.gzlarge_es.msgpack.gzlarge_fi.msgpack.gzlarge_fr.msgpack.gzlarge_he.msgpack.gzlarge_it.msgpack.gzlarge_ja.msgpack.gzlarge_mk.msgpack.gzlarge_nb.msgpack.gzlarge_nl.msgpack.gzlarge_pl.msgpack.gzlarge_pt.msgpack.gzlarge_ru.msgpack.gzlarge_sv.msgpack.gzlarge_uk.msgpack.gzlarge_zh.msgpack.gzsmall_ar.msgpack.gzsmall_bg.msgpack.gzsmall_bn.msgpack.gzsmall_ca.msgpack.gzsmall_cs.msgpack.gzsmall_da.msgpack.gzsmall_de.msgpack.gzsmall_el.msgpack.gzsmall_en.msgpack.gzsmall_es.msgpack.gzsmall_fa.msgpack.gzsmall_fi.msgpack.gzsmall_fil.msgpack.gzsmall_fr.msgpack.gzsmall_gl.msgpack.gzsmall_he.msgpack.gzsmall_hi.msgpack.gzsmall_hu.msgpack.gzsmall_id.msgpack.gzsmall_is.msgpack.gzsmall_it.msgpack.gzsmall_ja.msgpack.gzsmall_ko.msgpack.gzsmall_lt.msgpack.gzsmall_lv.msgpack.gzsmall_mk.msgpack.gzsmall_ml.msgpack.gzsmall_ms.msgpack.gzsmall_nb.msgpack.gzsmall_nl.msgpack.gzsmall_pl.msgpack.gzsmall_pt.msgpack.gzsmall_ro.msgpack.gzsmall_ru.msgpack.gzsmall_sh.msgpack.gzsmall_sk.msgpack.gzsmall_sl.msgpack.gzsmall_sv.msgpack.gzsmall_sw.msgpack.gzsmall_ta.msgpack.gzsmall_tr.msgpack.gzsmall_uk.msgpack.gzsmall_ur.msgpack.gzsmall_vi.msgpack.gzsmall_zh.msgpack.gz
22
README.md
22
README.md
@ -45,16 +45,16 @@ frequency as a decimal between 0 and 1.
|
|||||||
|
|
||||||
>>> from wordfreq import word_frequency
|
>>> from wordfreq import word_frequency
|
||||||
>>> word_frequency('cafe', 'en')
|
>>> word_frequency('cafe', 'en')
|
||||||
1.05e-05
|
1.23e-05
|
||||||
|
|
||||||
>>> word_frequency('café', 'en')
|
>>> word_frequency('café', 'en')
|
||||||
5.62e-06
|
5.62e-06
|
||||||
|
|
||||||
>>> word_frequency('cafe', 'fr')
|
>>> word_frequency('cafe', 'fr')
|
||||||
1.55e-06
|
1.51e-06
|
||||||
|
|
||||||
>>> word_frequency('café', 'fr')
|
>>> word_frequency('café', 'fr')
|
||||||
6.61e-05
|
5.75e-05
|
||||||
|
|
||||||
|
|
||||||
`zipf_frequency` is a variation on `word_frequency` that aims to return the
|
`zipf_frequency` is a variation on `word_frequency` that aims to return the
|
||||||
@ -72,16 +72,16 @@ one occurrence per billion words.
|
|||||||
|
|
||||||
>>> from wordfreq import zipf_frequency
|
>>> from wordfreq import zipf_frequency
|
||||||
>>> zipf_frequency('the', 'en')
|
>>> zipf_frequency('the', 'en')
|
||||||
7.76
|
7.73
|
||||||
|
|
||||||
>>> zipf_frequency('word', 'en')
|
>>> zipf_frequency('word', 'en')
|
||||||
5.26
|
5.26
|
||||||
|
|
||||||
>>> zipf_frequency('frequency', 'en')
|
>>> zipf_frequency('frequency', 'en')
|
||||||
4.48
|
4.36
|
||||||
|
|
||||||
>>> zipf_frequency('zipf', 'en')
|
>>> zipf_frequency('zipf', 'en')
|
||||||
1.62
|
1.49
|
||||||
|
|
||||||
>>> zipf_frequency('zipf', 'en', wordlist='small')
|
>>> zipf_frequency('zipf', 'en', wordlist='small')
|
||||||
0.0
|
0.0
|
||||||
@ -232,7 +232,7 @@ the list, in descending frequency order.
|
|||||||
|
|
||||||
>>> from wordfreq import top_n_list
|
>>> from wordfreq import top_n_list
|
||||||
>>> top_n_list('en', 10)
|
>>> top_n_list('en', 10)
|
||||||
['the', 'of', 'to', 'and', 'a', 'in', 'i', 'is', 'for', 'that']
|
['the', 'to', 'and', 'of', 'a', 'in', 'i', 'is', 'for', 'that']
|
||||||
|
|
||||||
>>> top_n_list('es', 10)
|
>>> top_n_list('es', 10)
|
||||||
['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'no', 'un']
|
['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'no', 'un']
|
||||||
@ -302,16 +302,16 @@ tokenized according to this function.
|
|||||||
>>> tokenize('l@s niñ@s', 'es')
|
>>> tokenize('l@s niñ@s', 'es')
|
||||||
['l@s', 'niñ@s']
|
['l@s', 'niñ@s']
|
||||||
>>> zipf_frequency('l@s', 'es')
|
>>> zipf_frequency('l@s', 'es')
|
||||||
2.82
|
3.03
|
||||||
|
|
||||||
Because tokenization in the real world is far from consistent, wordfreq will
|
Because tokenization in the real world is far from consistent, wordfreq will
|
||||||
also try to deal gracefully when you query it with texts that actually break
|
also try to deal gracefully when you query it with texts that actually break
|
||||||
into multiple tokens:
|
into multiple tokens:
|
||||||
|
|
||||||
>>> zipf_frequency('New York', 'en')
|
>>> zipf_frequency('New York', 'en')
|
||||||
5.3
|
5.32
|
||||||
>>> zipf_frequency('北京地铁', 'zh') # "Beijing Subway"
|
>>> zipf_frequency('北京地铁', 'zh') # "Beijing Subway"
|
||||||
3.23
|
3.29
|
||||||
|
|
||||||
The word frequencies are combined with the half-harmonic-mean function in order
|
The word frequencies are combined with the half-harmonic-mean function in order
|
||||||
to provide an estimate of what their combined frequency would be. In Chinese,
|
to provide an estimate of what their combined frequency would be. In Chinese,
|
||||||
@ -326,7 +326,7 @@ you give it an uncommon combination of tokens, it will hugely over-estimate
|
|||||||
their frequency:
|
their frequency:
|
||||||
|
|
||||||
>>> zipf_frequency('owl-flavored', 'en')
|
>>> zipf_frequency('owl-flavored', 'en')
|
||||||
3.29
|
3.3
|
||||||
|
|
||||||
|
|
||||||
## Multi-script languages
|
## Multi-script languages
|
||||||
|
2
setup.py
2
setup.py
@ -33,7 +33,7 @@ dependencies = [
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="wordfreq",
|
name="wordfreq",
|
||||||
version='2.4.1',
|
version='2.5.0',
|
||||||
maintainer='Robyn Speer',
|
maintainer='Robyn Speer',
|
||||||
maintainer_email='rspeer@luminoso.com',
|
maintainer_email='rspeer@luminoso.com',
|
||||||
url='http://github.com/LuminosoInsight/wordfreq/',
|
url='http://github.com/LuminosoInsight/wordfreq/',
|
||||||
|
@ -60,18 +60,48 @@ def test_most_common_words():
|
|||||||
return top_n_list(lang, 1)[0]
|
return top_n_list(lang, 1)[0]
|
||||||
|
|
||||||
assert get_most_common('ar') == 'في'
|
assert get_most_common('ar') == 'في'
|
||||||
|
assert get_most_common('bg') == 'на'
|
||||||
|
assert get_most_common('bn') == 'না'
|
||||||
|
assert get_most_common('ca') == 'de'
|
||||||
assert get_most_common('cs') == 'a'
|
assert get_most_common('cs') == 'a'
|
||||||
|
assert get_most_common('da') == 'i'
|
||||||
|
assert get_most_common('el') == 'και'
|
||||||
assert get_most_common('de') == 'die'
|
assert get_most_common('de') == 'die'
|
||||||
assert get_most_common('en') == 'the'
|
assert get_most_common('en') == 'the'
|
||||||
assert get_most_common('es') == 'de'
|
assert get_most_common('es') == 'de'
|
||||||
|
assert get_most_common('fi') == 'ja'
|
||||||
|
assert get_most_common('fil') == 'sa'
|
||||||
assert get_most_common('fr') == 'de'
|
assert get_most_common('fr') == 'de'
|
||||||
|
assert get_most_common('gl') == 'de'
|
||||||
|
assert get_most_common('he') == 'את'
|
||||||
|
assert get_most_common('hi') == 'के'
|
||||||
|
assert get_most_common('hu') == 'a'
|
||||||
|
assert get_most_common('id') == 'yang'
|
||||||
|
assert get_most_common('is') == 'og'
|
||||||
assert get_most_common('it') == 'di'
|
assert get_most_common('it') == 'di'
|
||||||
assert get_most_common('ja') == 'の'
|
assert get_most_common('ja') == 'の'
|
||||||
|
assert get_most_common('ko') == '이'
|
||||||
|
assert get_most_common('lt') == 'ir'
|
||||||
|
assert get_most_common('lv') == 'un'
|
||||||
|
assert get_most_common('mk') == 'на'
|
||||||
|
assert get_most_common('ml') == 'ഒരു'
|
||||||
|
assert get_most_common('ms') == 'yang'
|
||||||
|
assert get_most_common('nb') == 'i'
|
||||||
assert get_most_common('nl') == 'de'
|
assert get_most_common('nl') == 'de'
|
||||||
assert get_most_common('pl') == 'w'
|
assert get_most_common('pl') == 'w'
|
||||||
assert get_most_common('pt') == 'de'
|
assert get_most_common('pt') == 'de'
|
||||||
|
assert get_most_common('ro') == 'de'
|
||||||
assert get_most_common('ru') == 'в'
|
assert get_most_common('ru') == 'в'
|
||||||
assert get_most_common('tr') == 'bir'
|
assert get_most_common('sh') == 'je'
|
||||||
|
assert get_most_common('sk') == 'a'
|
||||||
|
assert get_most_common('sl') == 'je'
|
||||||
|
assert get_most_common('sv') == 'är'
|
||||||
|
assert get_most_common('sw') == 'ya'
|
||||||
|
assert get_most_common('ta') == 'ஒரு'
|
||||||
|
assert get_most_common('tr') == 've'
|
||||||
|
assert get_most_common('uk') == 'в'
|
||||||
|
assert get_most_common('ur') == 'کے'
|
||||||
|
assert get_most_common('vi') == 'là'
|
||||||
assert get_most_common('zh') == '的'
|
assert get_most_common('zh') == '的'
|
||||||
|
|
||||||
|
|
||||||
|
File diff suppressed because it is too large
Load Diff
Binary file not shown.
BIN
wordfreq/data/large_bn.msgpack.gz
Normal file
BIN
wordfreq/data/large_bn.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/large_ca.msgpack.gz
Normal file
BIN
wordfreq/data/large_ca.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/large_he.msgpack.gz
Normal file
BIN
wordfreq/data/large_he.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/large_mk.msgpack.gz
Normal file
BIN
wordfreq/data/large_mk.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/large_nb.msgpack.gz
Normal file
BIN
wordfreq/data/large_nb.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/large_sv.msgpack.gz
Normal file
BIN
wordfreq/data/large_sv.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/large_uk.msgpack.gz
Normal file
BIN
wordfreq/data/large_uk.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/small_fil.msgpack.gz
Normal file
BIN
wordfreq/data/small_fil.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/small_gl.msgpack.gz
Normal file
BIN
wordfreq/data/small_gl.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/small_is.msgpack.gz
Normal file
BIN
wordfreq/data/small_is.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/small_lt.msgpack.gz
Normal file
BIN
wordfreq/data/small_lt.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/small_ml.msgpack.gz
Normal file
BIN
wordfreq/data/small_ml.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/small_sk.msgpack.gz
Normal file
BIN
wordfreq/data/small_sk.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/small_sl.msgpack.gz
Normal file
BIN
wordfreq/data/small_sl.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/small_sw.msgpack.gz
Normal file
BIN
wordfreq/data/small_sw.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/small_ta.msgpack.gz
Normal file
BIN
wordfreq/data/small_ta.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/small_ur.msgpack.gz
Normal file
BIN
wordfreq/data/small_ur.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/small_vi.msgpack.gz
Normal file
BIN
wordfreq/data/small_vi.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue
Block a user