include data from xc rebuild

This commit is contained in:
Rob Speer 2018-07-15 01:01:35 -04:00
parent b2d242e8bf
commit d06a6a48c5
52 changed files with 36488 additions and 35932 deletions

View File

@ -48,13 +48,13 @@ frequency as a decimal between 0 and 1.
1.07e-05 1.07e-05
>>> word_frequency('café', 'en') >>> word_frequency('café', 'en')
5.89e-06 5.75e-06
>>> word_frequency('cafe', 'fr') >>> word_frequency('cafe', 'fr')
1.51e-06 1.51e-06
>>> word_frequency('café', 'fr') >>> word_frequency('café', 'fr')
5.25e-05 5.13e-05
`zipf_frequency` is a variation on `word_frequency` that aims to return the
@ -78,10 +78,10 @@ one occurrence per billion words.
5.29 5.29
>>> zipf_frequency('frequency', 'en') >>> zipf_frequency('frequency', 'en')
4.42 4.43
>>> zipf_frequency('zipf', 'en') >>> zipf_frequency('zipf', 'en')
1.55 1.57
>>> zipf_frequency('zipf', 'en', wordlist='small') >>> zipf_frequency('zipf', 'en', wordlist='small')
0.0 0.0
@ -300,7 +300,7 @@ into multiple tokens:
>>> zipf_frequency('New York', 'en') >>> zipf_frequency('New York', 'en')
5.28 5.28
>>> zipf_frequency('北京地铁', 'zh') # "Beijing Subway" >>> zipf_frequency('北京地铁', 'zh') # "Beijing Subway"
3.57 3.61
The word frequencies are combined with the half-harmonic-mean function in order
to provide an estimate of what their combined frequency would be. In Chinese,

View File

@ -41,6 +41,14 @@ def test_gender_neutral_at():
assert tokenize(text, "en") == ["@s", "membr@s", "da", "comunidade", "virtual"] assert tokenize(text, "en") == ["@s", "membr@s", "da", "comunidade", "virtual"]
def test_at_in_corpus():
# We have a word frequency for "l@s"
assert word_frequency('l@s', 'es') > 0
# It's not just treated as a word break
assert word_frequency('l@s', 'es') < word_frequency('l s', 'es')
def test_punctuation_at(): def test_punctuation_at():
# If the @ appears alone in a word, we consider it to be punctuation # If the @ appears alone in a word, we consider it to be punctuation
text = "operadores de canal, que são aqueles que têm um @ ao lado do nick" text = "operadores de canal, que são aqueles que têm um @ ao lado do nick"

View File

@ -59,7 +59,7 @@ def test_tokens():
def test_combination(): def test_combination():
xiexie_freq = word_frequency('谢谢', 'zh') # "Thanks" xiexie_freq = word_frequency('谢谢', 'zh') # "Thanks"
assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20) assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20, rel=0.01)
def test_alternate_codes(): def test_alternate_codes():

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.