mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 01:11:37 +00:00
update data and tests for 2.5
This commit is contained in:
parent
ed23bf3ebe
commit
ec48c0a123
22
README.md
22
README.md
@ -45,16 +45,16 @@ frequency as a decimal between 0 and 1.
|
||||
|
||||
>>> from wordfreq import word_frequency
|
||||
>>> word_frequency('cafe', 'en')
|
||||
1.05e-05
|
||||
1.23e-05
|
||||
|
||||
>>> word_frequency('café', 'en')
|
||||
5.62e-06
|
||||
|
||||
>>> word_frequency('cafe', 'fr')
|
||||
1.55e-06
|
||||
1.51e-06
|
||||
|
||||
>>> word_frequency('café', 'fr')
|
||||
6.61e-05
|
||||
5.75e-05
|
||||
|
||||
|
||||
`zipf_frequency` is a variation on `word_frequency` that aims to return the
|
||||
@ -72,16 +72,16 @@ one occurrence per billion words.
|
||||
|
||||
>>> from wordfreq import zipf_frequency
|
||||
>>> zipf_frequency('the', 'en')
|
||||
7.76
|
||||
7.73
|
||||
|
||||
>>> zipf_frequency('word', 'en')
|
||||
5.26
|
||||
|
||||
>>> zipf_frequency('frequency', 'en')
|
||||
4.48
|
||||
4.36
|
||||
|
||||
>>> zipf_frequency('zipf', 'en')
|
||||
1.62
|
||||
1.49
|
||||
|
||||
>>> zipf_frequency('zipf', 'en', wordlist='small')
|
||||
0.0
|
||||
@ -232,7 +232,7 @@ the list, in descending frequency order.
|
||||
|
||||
>>> from wordfreq import top_n_list
|
||||
>>> top_n_list('en', 10)
|
||||
['the', 'of', 'to', 'and', 'a', 'in', 'i', 'is', 'for', 'that']
|
||||
['the', 'to', 'and', 'of', 'a', 'in', 'i', 'is', 'for', 'that']
|
||||
|
||||
>>> top_n_list('es', 10)
|
||||
['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'no', 'un']
|
||||
@ -302,16 +302,16 @@ tokenized according to this function.
|
||||
>>> tokenize('l@s niñ@s', 'es')
|
||||
['l@s', 'niñ@s']
|
||||
>>> zipf_frequency('l@s', 'es')
|
||||
2.82
|
||||
3.03
|
||||
|
||||
Because tokenization in the real world is far from consistent, wordfreq will
|
||||
also try to deal gracefully when you query it with texts that actually break
|
||||
into multiple tokens:
|
||||
|
||||
>>> zipf_frequency('New York', 'en')
|
||||
5.3
|
||||
5.32
|
||||
>>> zipf_frequency('北京地铁', 'zh') # "Beijing Subway"
|
||||
3.23
|
||||
3.29
|
||||
|
||||
The word frequencies are combined with the half-harmonic-mean function in order
|
||||
to provide an estimate of what their combined frequency would be. In Chinese,
|
||||
@ -326,7 +326,7 @@ you give it an uncommon combination of tokens, it will hugely over-estimate
|
||||
their frequency:
|
||||
|
||||
>>> zipf_frequency('owl-flavored', 'en')
|
||||
3.29
|
||||
3.3
|
||||
|
||||
|
||||
## Multi-script languages
|
||||
|
2
setup.py
2
setup.py
@ -33,7 +33,7 @@ dependencies = [
|
||||
|
||||
setup(
|
||||
name="wordfreq",
|
||||
version='2.4.1',
|
||||
version='2.5.0',
|
||||
maintainer='Robyn Speer',
|
||||
maintainer_email='rspeer@luminoso.com',
|
||||
url='http://github.com/LuminosoInsight/wordfreq/',
|
||||
|
@ -60,18 +60,48 @@ def test_most_common_words():
|
||||
return top_n_list(lang, 1)[0]
|
||||
|
||||
assert get_most_common('ar') == 'في'
|
||||
assert get_most_common('bg') == 'на'
|
||||
assert get_most_common('bn') == 'না'
|
||||
assert get_most_common('ca') == 'de'
|
||||
assert get_most_common('cs') == 'a'
|
||||
assert get_most_common('da') == 'i'
|
||||
assert get_most_common('el') == 'και'
|
||||
assert get_most_common('de') == 'die'
|
||||
assert get_most_common('en') == 'the'
|
||||
assert get_most_common('es') == 'de'
|
||||
assert get_most_common('fi') == 'ja'
|
||||
assert get_most_common('fil') == 'sa'
|
||||
assert get_most_common('fr') == 'de'
|
||||
assert get_most_common('gl') == 'de'
|
||||
assert get_most_common('he') == 'את'
|
||||
assert get_most_common('hi') == 'के'
|
||||
assert get_most_common('hu') == 'a'
|
||||
assert get_most_common('id') == 'yang'
|
||||
assert get_most_common('is') == 'og'
|
||||
assert get_most_common('it') == 'di'
|
||||
assert get_most_common('ja') == 'の'
|
||||
assert get_most_common('ko') == '이'
|
||||
assert get_most_common('lt') == 'ir'
|
||||
assert get_most_common('lv') == 'un'
|
||||
assert get_most_common('mk') == 'на'
|
||||
assert get_most_common('ml') == 'ഒരു'
|
||||
assert get_most_common('ms') == 'yang'
|
||||
assert get_most_common('nb') == 'i'
|
||||
assert get_most_common('nl') == 'de'
|
||||
assert get_most_common('pl') == 'w'
|
||||
assert get_most_common('pt') == 'de'
|
||||
assert get_most_common('ro') == 'de'
|
||||
assert get_most_common('ru') == 'в'
|
||||
assert get_most_common('tr') == 'bir'
|
||||
assert get_most_common('sh') == 'je'
|
||||
assert get_most_common('sk') == 'a'
|
||||
assert get_most_common('sl') == 'je'
|
||||
assert get_most_common('sv') == 'är'
|
||||
assert get_most_common('sw') == 'ya'
|
||||
assert get_most_common('ta') == 'ஒரு'
|
||||
assert get_most_common('tr') == 've'
|
||||
assert get_most_common('uk') == 'в'
|
||||
assert get_most_common('ur') == 'کے'
|
||||
assert get_most_common('vi') == 'là'
|
||||
assert get_most_common('zh') == '的'
|
||||
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
Binary file not shown.
BIN
wordfreq/data/large_bn.msgpack.gz
Normal file
BIN
wordfreq/data/large_bn.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/large_ca.msgpack.gz
Normal file
BIN
wordfreq/data/large_ca.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/large_he.msgpack.gz
Normal file
BIN
wordfreq/data/large_he.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/large_mk.msgpack.gz
Normal file
BIN
wordfreq/data/large_mk.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/large_nb.msgpack.gz
Normal file
BIN
wordfreq/data/large_nb.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/large_sv.msgpack.gz
Normal file
BIN
wordfreq/data/large_sv.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/large_uk.msgpack.gz
Normal file
BIN
wordfreq/data/large_uk.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/small_fil.msgpack.gz
Normal file
BIN
wordfreq/data/small_fil.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/small_gl.msgpack.gz
Normal file
BIN
wordfreq/data/small_gl.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/small_is.msgpack.gz
Normal file
BIN
wordfreq/data/small_is.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/small_lt.msgpack.gz
Normal file
BIN
wordfreq/data/small_lt.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/small_ml.msgpack.gz
Normal file
BIN
wordfreq/data/small_ml.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/small_sk.msgpack.gz
Normal file
BIN
wordfreq/data/small_sk.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/small_sl.msgpack.gz
Normal file
BIN
wordfreq/data/small_sl.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/small_sw.msgpack.gz
Normal file
BIN
wordfreq/data/small_sw.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/small_ta.msgpack.gz
Normal file
BIN
wordfreq/data/small_ta.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/small_ur.msgpack.gz
Normal file
BIN
wordfreq/data/small_ur.msgpack.gz
Normal file
Binary file not shown.
BIN
wordfreq/data/small_vi.msgpack.gz
Normal file
BIN
wordfreq/data/small_vi.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue
Block a user