include data from xc rebuild

2024-12-23 09:21:37 +00:00 · 2018-07-15 01:01:35 -04:00 · 2018-07-15 01:01:35 -04:00 · 86b928f967
commit 86b928f967
parent 65692c3d81
52 changed files with 36488 additions and 35932 deletions
--- a/README.md
+++ b/README.md
@ -48,13 +48,13 @@ frequency as a decimal between 0 and 1.
    1.07e-05

    >>> word_frequency('café', 'en')
-    5.89e-06
+    5.75e-06

    >>> word_frequency('cafe', 'fr')
    1.51e-06

    >>> word_frequency('café', 'fr')
-    5.25e-05
+    5.13e-05


 `zipf_frequency` is a variation on `word_frequency` that aims to return the
@ -78,10 +78,10 @@ one occurrence per billion words.
    5.29

    >>> zipf_frequency('frequency', 'en')
-    4.42
+    4.43

    >>> zipf_frequency('zipf', 'en')
-    1.55
+    1.57

    >>> zipf_frequency('zipf', 'en', wordlist='small')
    0.0
@ -300,7 +300,7 @@ into multiple tokens:
    >>> zipf_frequency('New York', 'en')
    5.28
    >>> zipf_frequency('北京地铁', 'zh')  # "Beijing Subway"
-    3.57
+    3.61

 The word frequencies are combined with the half-harmonic-mean function in order
 to provide an estimate of what their combined frequency would be. In Chinese,
--- a/tests/test_at_sign.py
+++ b/tests/test_at_sign.py
@ -41,6 +41,14 @@ def test_gender_neutral_at():
    assert tokenize(text, "en") == ["@s", "membr@s", "da", "comunidade", "virtual"]


+def test_at_in_corpus():
+    # We have a word frequency for "l@s"
+    assert word_frequency('l@s', 'es') > 0
+
+    # It's not just treated as a word break
+    assert word_frequency('l@s', 'es') < word_frequency('l s', 'es')
+
+
 def test_punctuation_at():
    # If the @ appears alone in a word, we consider it to be punctuation
    text = "operadores de canal, que são aqueles que têm um @ ao lado do nick"
--- a/tests/test_chinese.py
+++ b/tests/test_chinese.py
@ -59,7 +59,7 @@ def test_tokens():

 def test_combination():
    xiexie_freq = word_frequency('谢谢', 'zh')   # "Thanks"
-    assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20)
+    assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20, rel=0.01)


 def test_alternate_codes():
--- a/wordfreq/data/jieba_zh.txt
+++ b/wordfreq/data/jieba_zh.txt
--- a/wordfreq/data/large_ar.msgpack.gz
+++ b/wordfreq/data/large_ar.msgpack.gz
--- a/wordfreq/data/large_cs.msgpack.gz
+++ b/wordfreq/data/large_cs.msgpack.gz
--- a/wordfreq/data/large_de.msgpack.gz
+++ b/wordfreq/data/large_de.msgpack.gz
--- a/wordfreq/data/large_en.msgpack.gz
+++ b/wordfreq/data/large_en.msgpack.gz
--- a/wordfreq/data/large_es.msgpack.gz
+++ b/wordfreq/data/large_es.msgpack.gz
--- a/wordfreq/data/large_fi.msgpack.gz
+++ b/wordfreq/data/large_fi.msgpack.gz
--- a/wordfreq/data/large_fr.msgpack.gz
+++ b/wordfreq/data/large_fr.msgpack.gz
--- a/wordfreq/data/large_it.msgpack.gz
+++ b/wordfreq/data/large_it.msgpack.gz
--- a/wordfreq/data/large_ja.msgpack.gz
+++ b/wordfreq/data/large_ja.msgpack.gz
--- a/wordfreq/data/large_nl.msgpack.gz
+++ b/wordfreq/data/large_nl.msgpack.gz
--- a/wordfreq/data/large_pl.msgpack.gz
+++ b/wordfreq/data/large_pl.msgpack.gz
--- a/wordfreq/data/large_pt.msgpack.gz
+++ b/wordfreq/data/large_pt.msgpack.gz
--- a/wordfreq/data/large_ru.msgpack.gz
+++ b/wordfreq/data/large_ru.msgpack.gz
--- a/wordfreq/data/large_zh.msgpack.gz
+++ b/wordfreq/data/large_zh.msgpack.gz
--- a/wordfreq/data/small_ar.msgpack.gz
+++ b/wordfreq/data/small_ar.msgpack.gz
--- a/wordfreq/data/small_bg.msgpack.gz
+++ b/wordfreq/data/small_bg.msgpack.gz
--- a/wordfreq/data/small_bn.msgpack.gz
+++ b/wordfreq/data/small_bn.msgpack.gz
--- a/wordfreq/data/small_ca.msgpack.gz
+++ b/wordfreq/data/small_ca.msgpack.gz
--- a/wordfreq/data/small_cs.msgpack.gz
+++ b/wordfreq/data/small_cs.msgpack.gz
--- a/wordfreq/data/small_da.msgpack.gz
+++ b/wordfreq/data/small_da.msgpack.gz
--- a/wordfreq/data/small_de.msgpack.gz
+++ b/wordfreq/data/small_de.msgpack.gz
--- a/wordfreq/data/small_el.msgpack.gz
+++ b/wordfreq/data/small_el.msgpack.gz
--- a/wordfreq/data/small_en.msgpack.gz
+++ b/wordfreq/data/small_en.msgpack.gz
--- a/wordfreq/data/small_es.msgpack.gz
+++ b/wordfreq/data/small_es.msgpack.gz
--- a/wordfreq/data/small_fa.msgpack.gz
+++ b/wordfreq/data/small_fa.msgpack.gz
--- a/wordfreq/data/small_fi.msgpack.gz
+++ b/wordfreq/data/small_fi.msgpack.gz
--- a/wordfreq/data/small_fr.msgpack.gz
+++ b/wordfreq/data/small_fr.msgpack.gz
--- a/wordfreq/data/small_he.msgpack.gz
+++ b/wordfreq/data/small_he.msgpack.gz
--- a/wordfreq/data/small_hi.msgpack.gz
+++ b/wordfreq/data/small_hi.msgpack.gz
--- a/wordfreq/data/small_hu.msgpack.gz
+++ b/wordfreq/data/small_hu.msgpack.gz
--- a/wordfreq/data/small_id.msgpack.gz
+++ b/wordfreq/data/small_id.msgpack.gz
--- a/wordfreq/data/small_it.msgpack.gz
+++ b/wordfreq/data/small_it.msgpack.gz
--- a/wordfreq/data/small_ja.msgpack.gz
+++ b/wordfreq/data/small_ja.msgpack.gz
--- a/wordfreq/data/small_ko.msgpack.gz
+++ b/wordfreq/data/small_ko.msgpack.gz
--- a/wordfreq/data/small_lv.msgpack.gz
+++ b/wordfreq/data/small_lv.msgpack.gz
--- a/wordfreq/data/small_mk.msgpack.gz
+++ b/wordfreq/data/small_mk.msgpack.gz
--- a/wordfreq/data/small_ms.msgpack.gz
+++ b/wordfreq/data/small_ms.msgpack.gz
--- a/wordfreq/data/small_nb.msgpack.gz
+++ b/wordfreq/data/small_nb.msgpack.gz
--- a/wordfreq/data/small_nl.msgpack.gz
+++ b/wordfreq/data/small_nl.msgpack.gz
--- a/wordfreq/data/small_pl.msgpack.gz
+++ b/wordfreq/data/small_pl.msgpack.gz
--- a/wordfreq/data/small_pt.msgpack.gz
+++ b/wordfreq/data/small_pt.msgpack.gz
--- a/wordfreq/data/small_ro.msgpack.gz
+++ b/wordfreq/data/small_ro.msgpack.gz
--- a/wordfreq/data/small_ru.msgpack.gz
+++ b/wordfreq/data/small_ru.msgpack.gz
--- a/wordfreq/data/small_sh.msgpack.gz
+++ b/wordfreq/data/small_sh.msgpack.gz
--- a/wordfreq/data/small_sv.msgpack.gz
+++ b/wordfreq/data/small_sv.msgpack.gz
--- a/wordfreq/data/small_tr.msgpack.gz
+++ b/wordfreq/data/small_tr.msgpack.gz
--- a/wordfreq/data/small_uk.msgpack.gz
+++ b/wordfreq/data/small_uk.msgpack.gz
--- a/wordfreq/data/small_zh.msgpack.gz
+++ b/wordfreq/data/small_zh.msgpack.gz