mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
include data from xc rebuild
This commit is contained in:
parent
65692c3d81
commit
86b928f967
10
README.md
10
README.md
@ -48,13 +48,13 @@ frequency as a decimal between 0 and 1.
|
||||
1.07e-05
|
||||
|
||||
>>> word_frequency('café', 'en')
|
||||
5.89e-06
|
||||
5.75e-06
|
||||
|
||||
>>> word_frequency('cafe', 'fr')
|
||||
1.51e-06
|
||||
|
||||
>>> word_frequency('café', 'fr')
|
||||
5.25e-05
|
||||
5.13e-05
|
||||
|
||||
|
||||
`zipf_frequency` is a variation on `word_frequency` that aims to return the
|
||||
@ -78,10 +78,10 @@ one occurrence per billion words.
|
||||
5.29
|
||||
|
||||
>>> zipf_frequency('frequency', 'en')
|
||||
4.42
|
||||
4.43
|
||||
|
||||
>>> zipf_frequency('zipf', 'en')
|
||||
1.55
|
||||
1.57
|
||||
|
||||
>>> zipf_frequency('zipf', 'en', wordlist='small')
|
||||
0.0
|
||||
@ -300,7 +300,7 @@ into multiple tokens:
|
||||
>>> zipf_frequency('New York', 'en')
|
||||
5.28
|
||||
>>> zipf_frequency('北京地铁', 'zh') # "Beijing Subway"
|
||||
3.57
|
||||
3.61
|
||||
|
||||
The word frequencies are combined with the half-harmonic-mean function in order
|
||||
to provide an estimate of what their combined frequency would be. In Chinese,
|
||||
|
@ -41,6 +41,14 @@ def test_gender_neutral_at():
|
||||
assert tokenize(text, "en") == ["@s", "membr@s", "da", "comunidade", "virtual"]
|
||||
|
||||
|
||||
def test_at_in_corpus():
|
||||
# We have a word frequency for "l@s"
|
||||
assert word_frequency('l@s', 'es') > 0
|
||||
|
||||
# It's not just treated as a word break
|
||||
assert word_frequency('l@s', 'es') < word_frequency('l s', 'es')
|
||||
|
||||
|
||||
def test_punctuation_at():
|
||||
# If the @ appears alone in a word, we consider it to be punctuation
|
||||
text = "operadores de canal, que são aqueles que têm um @ ao lado do nick"
|
||||
|
@ -59,7 +59,7 @@ def test_tokens():
|
||||
|
||||
def test_combination():
|
||||
xiexie_freq = word_frequency('谢谢', 'zh') # "Thanks"
|
||||
assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20)
|
||||
assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20, rel=0.01)
|
||||
|
||||
|
||||
def test_alternate_codes():
|
||||
|
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue
Block a user