1
0
mirror of https://github.com/rspeer/wordfreq.git synced 2025-01-13 20:56:00 +00:00

update data and tests for 2.5

This commit is contained in:
Robyn Speer 2021-03-29 16:18:08 -04:00
parent ed23bf3ebe
commit ec48c0a123
70 changed files with 38834 additions and 37458 deletions

View File

@ -45,16 +45,16 @@ frequency as a decimal between 0 and 1.
>>> from wordfreq import word_frequency >>> from wordfreq import word_frequency
>>> word_frequency('cafe', 'en') >>> word_frequency('cafe', 'en')
1.05e-05 1.23e-05
>>> word_frequency('café', 'en') >>> word_frequency('café', 'en')
5.62e-06 5.62e-06
>>> word_frequency('cafe', 'fr') >>> word_frequency('cafe', 'fr')
1.55e-06 1.51e-06
>>> word_frequency('café', 'fr') >>> word_frequency('café', 'fr')
6.61e-05 5.75e-05
`zipf_frequency` is a variation on `word_frequency` that aims to return the `zipf_frequency` is a variation on `word_frequency` that aims to return the
@ -72,16 +72,16 @@ one occurrence per billion words.
>>> from wordfreq import zipf_frequency >>> from wordfreq import zipf_frequency
>>> zipf_frequency('the', 'en') >>> zipf_frequency('the', 'en')
7.76 7.73
>>> zipf_frequency('word', 'en') >>> zipf_frequency('word', 'en')
5.26 5.26
>>> zipf_frequency('frequency', 'en') >>> zipf_frequency('frequency', 'en')
4.48 4.36
>>> zipf_frequency('zipf', 'en') >>> zipf_frequency('zipf', 'en')
1.62 1.49
>>> zipf_frequency('zipf', 'en', wordlist='small') >>> zipf_frequency('zipf', 'en', wordlist='small')
0.0 0.0
@ -232,7 +232,7 @@ the list, in descending frequency order.
>>> from wordfreq import top_n_list >>> from wordfreq import top_n_list
>>> top_n_list('en', 10) >>> top_n_list('en', 10)
['the', 'of', 'to', 'and', 'a', 'in', 'i', 'is', 'for', 'that'] ['the', 'to', 'and', 'of', 'a', 'in', 'i', 'is', 'for', 'that']
>>> top_n_list('es', 10) >>> top_n_list('es', 10)
['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'no', 'un'] ['de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'no', 'un']
@ -302,16 +302,16 @@ tokenized according to this function.
>>> tokenize('l@s niñ@s', 'es') >>> tokenize('l@s niñ@s', 'es')
['l@s', 'niñ@s'] ['l@s', 'niñ@s']
>>> zipf_frequency('l@s', 'es') >>> zipf_frequency('l@s', 'es')
2.82 3.03
Because tokenization in the real world is far from consistent, wordfreq will Because tokenization in the real world is far from consistent, wordfreq will
also try to deal gracefully when you query it with texts that actually break also try to deal gracefully when you query it with texts that actually break
into multiple tokens: into multiple tokens:
>>> zipf_frequency('New York', 'en') >>> zipf_frequency('New York', 'en')
5.3 5.32
>>> zipf_frequency('北京地铁', 'zh') # "Beijing Subway" >>> zipf_frequency('北京地铁', 'zh') # "Beijing Subway"
3.23 3.29
The word frequencies are combined with the half-harmonic-mean function in order The word frequencies are combined with the half-harmonic-mean function in order
to provide an estimate of what their combined frequency would be. In Chinese, to provide an estimate of what their combined frequency would be. In Chinese,
@ -326,7 +326,7 @@ you give it an uncommon combination of tokens, it will hugely over-estimate
their frequency: their frequency:
>>> zipf_frequency('owl-flavored', 'en') >>> zipf_frequency('owl-flavored', 'en')
3.29 3.3
## Multi-script languages ## Multi-script languages

View File

@ -33,7 +33,7 @@ dependencies = [
setup( setup(
name="wordfreq", name="wordfreq",
version='2.4.1', version='2.5.0',
maintainer='Robyn Speer', maintainer='Robyn Speer',
maintainer_email='rspeer@luminoso.com', maintainer_email='rspeer@luminoso.com',
url='http://github.com/LuminosoInsight/wordfreq/', url='http://github.com/LuminosoInsight/wordfreq/',

View File

@ -60,18 +60,48 @@ def test_most_common_words():
return top_n_list(lang, 1)[0] return top_n_list(lang, 1)[0]
assert get_most_common('ar') == 'في' assert get_most_common('ar') == 'في'
assert get_most_common('bg') == 'на'
assert get_most_common('bn') == 'না'
assert get_most_common('ca') == 'de'
assert get_most_common('cs') == 'a' assert get_most_common('cs') == 'a'
assert get_most_common('da') == 'i'
assert get_most_common('el') == 'και'
assert get_most_common('de') == 'die' assert get_most_common('de') == 'die'
assert get_most_common('en') == 'the' assert get_most_common('en') == 'the'
assert get_most_common('es') == 'de' assert get_most_common('es') == 'de'
assert get_most_common('fi') == 'ja'
assert get_most_common('fil') == 'sa'
assert get_most_common('fr') == 'de' assert get_most_common('fr') == 'de'
assert get_most_common('gl') == 'de'
assert get_most_common('he') == 'את'
assert get_most_common('hi') == 'के'
assert get_most_common('hu') == 'a'
assert get_most_common('id') == 'yang'
assert get_most_common('is') == 'og'
assert get_most_common('it') == 'di' assert get_most_common('it') == 'di'
assert get_most_common('ja') == 'の' assert get_most_common('ja') == 'の'
assert get_most_common('ko') == '이'
assert get_most_common('lt') == 'ir'
assert get_most_common('lv') == 'un'
assert get_most_common('mk') == 'на'
assert get_most_common('ml') == 'ഒരു'
assert get_most_common('ms') == 'yang'
assert get_most_common('nb') == 'i'
assert get_most_common('nl') == 'de' assert get_most_common('nl') == 'de'
assert get_most_common('pl') == 'w' assert get_most_common('pl') == 'w'
assert get_most_common('pt') == 'de' assert get_most_common('pt') == 'de'
assert get_most_common('ro') == 'de'
assert get_most_common('ru') == 'в' assert get_most_common('ru') == 'в'
assert get_most_common('tr') == 'bir' assert get_most_common('sh') == 'je'
assert get_most_common('sk') == 'a'
assert get_most_common('sl') == 'je'
assert get_most_common('sv') == 'är'
assert get_most_common('sw') == 'ya'
assert get_most_common('ta') == 'ஒரு'
assert get_most_common('tr') == 've'
assert get_most_common('uk') == 'в'
assert get_most_common('ur') == 'کے'
assert get_most_common('vi') == ''
assert get_most_common('zh') == '的' assert get_most_common('zh') == '的'

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.