mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
include data from xc rebuild
This commit is contained in:
parent
b2d242e8bf
commit
d06a6a48c5
10
README.md
10
README.md
@ -48,13 +48,13 @@ frequency as a decimal between 0 and 1.
|
|||||||
1.07e-05
|
1.07e-05
|
||||||
|
|
||||||
>>> word_frequency('café', 'en')
|
>>> word_frequency('café', 'en')
|
||||||
5.89e-06
|
5.75e-06
|
||||||
|
|
||||||
>>> word_frequency('cafe', 'fr')
|
>>> word_frequency('cafe', 'fr')
|
||||||
1.51e-06
|
1.51e-06
|
||||||
|
|
||||||
>>> word_frequency('café', 'fr')
|
>>> word_frequency('café', 'fr')
|
||||||
5.25e-05
|
5.13e-05
|
||||||
|
|
||||||
|
|
||||||
`zipf_frequency` is a variation on `word_frequency` that aims to return the
|
`zipf_frequency` is a variation on `word_frequency` that aims to return the
|
||||||
@ -78,10 +78,10 @@ one occurrence per billion words.
|
|||||||
5.29
|
5.29
|
||||||
|
|
||||||
>>> zipf_frequency('frequency', 'en')
|
>>> zipf_frequency('frequency', 'en')
|
||||||
4.42
|
4.43
|
||||||
|
|
||||||
>>> zipf_frequency('zipf', 'en')
|
>>> zipf_frequency('zipf', 'en')
|
||||||
1.55
|
1.57
|
||||||
|
|
||||||
>>> zipf_frequency('zipf', 'en', wordlist='small')
|
>>> zipf_frequency('zipf', 'en', wordlist='small')
|
||||||
0.0
|
0.0
|
||||||
@ -300,7 +300,7 @@ into multiple tokens:
|
|||||||
>>> zipf_frequency('New York', 'en')
|
>>> zipf_frequency('New York', 'en')
|
||||||
5.28
|
5.28
|
||||||
>>> zipf_frequency('北京地铁', 'zh') # "Beijing Subway"
|
>>> zipf_frequency('北京地铁', 'zh') # "Beijing Subway"
|
||||||
3.57
|
3.61
|
||||||
|
|
||||||
The word frequencies are combined with the half-harmonic-mean function in order
|
The word frequencies are combined with the half-harmonic-mean function in order
|
||||||
to provide an estimate of what their combined frequency would be. In Chinese,
|
to provide an estimate of what their combined frequency would be. In Chinese,
|
||||||
|
@ -41,6 +41,14 @@ def test_gender_neutral_at():
|
|||||||
assert tokenize(text, "en") == ["@s", "membr@s", "da", "comunidade", "virtual"]
|
assert tokenize(text, "en") == ["@s", "membr@s", "da", "comunidade", "virtual"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_at_in_corpus():
|
||||||
|
# We have a word frequency for "l@s"
|
||||||
|
assert word_frequency('l@s', 'es') > 0
|
||||||
|
|
||||||
|
# It's not just treated as a word break
|
||||||
|
assert word_frequency('l@s', 'es') < word_frequency('l s', 'es')
|
||||||
|
|
||||||
|
|
||||||
def test_punctuation_at():
|
def test_punctuation_at():
|
||||||
# If the @ appears alone in a word, we consider it to be punctuation
|
# If the @ appears alone in a word, we consider it to be punctuation
|
||||||
text = "operadores de canal, que são aqueles que têm um @ ao lado do nick"
|
text = "operadores de canal, que são aqueles que têm um @ ao lado do nick"
|
||||||
|
@ -59,7 +59,7 @@ def test_tokens():
|
|||||||
|
|
||||||
def test_combination():
|
def test_combination():
|
||||||
xiexie_freq = word_frequency('谢谢', 'zh') # "Thanks"
|
xiexie_freq = word_frequency('谢谢', 'zh') # "Thanks"
|
||||||
assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20)
|
assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20, rel=0.01)
|
||||||
|
|
||||||
|
|
||||||
def test_alternate_codes():
|
def test_alternate_codes():
|
||||||
|
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue
Block a user