diff --git a/README.md b/README.md index 431a881..0f0ec9f 100644 --- a/README.md +++ b/README.md @@ -167,41 +167,49 @@ least 3 different sources of word frequencies: Language Code # Large? WP Subs News Books Web Twit. Redd. Misc. ──────────────────────────────┼──────────────────────────────────────────────── Arabic ar 5 Yes │ Yes Yes Yes - Yes Yes - - - Bengali bn 3 - │ Yes - Yes - - Yes - - + Bangla bn 5 Yes │ Yes Yes Yes - Yes Yes - - Bosnian bs [1] 3 - │ Yes Yes - - - Yes - - - Bulgarian bg 3 - │ Yes Yes - - - Yes - - - Catalan ca 4 - │ Yes Yes Yes - - Yes - - + Bulgarian bg 4 - │ Yes Yes - - Yes Yes - - + Catalan ca 5 Yes │ Yes Yes Yes - Yes Yes - - Chinese zh [3] 7 Yes │ Yes Yes Yes Yes Yes Yes - Jieba Croatian hr [1] 3 │ Yes Yes - - - Yes - - Czech cs 5 Yes │ Yes Yes Yes - Yes Yes - - - Danish da 3 - │ Yes Yes - - - Yes - - + Danish da 4 - │ Yes Yes - - Yes Yes - - Dutch nl 5 Yes │ Yes Yes Yes - Yes Yes - - English en 7 Yes │ Yes Yes Yes Yes Yes Yes Yes - Finnish fi 6 Yes │ Yes Yes Yes - Yes Yes Yes - French fr 7 Yes │ Yes Yes Yes Yes Yes Yes Yes - German de 7 Yes │ Yes Yes Yes Yes Yes Yes Yes - - Greek el 3 - │ Yes Yes - - Yes - - - - Hebrew he 4 - │ Yes Yes - Yes - Yes - - - Hindi hi 3 - │ Yes - - - - Yes Yes - - Hungarian hu 3 - │ Yes Yes - - Yes - - - + Greek el 4 - │ Yes Yes - - Yes Yes - - + Hebrew he 5 Yes │ Yes Yes - Yes Yes Yes - - + Hindi hi 4 Yes │ Yes - - - Yes Yes Yes - + Hungarian hu 4 - │ Yes Yes - - Yes Yes - - + Icelandic is 3 - │ Yes Yes - - Yes - - - Indonesian id 3 - │ Yes Yes - - - Yes - - Italian it 7 Yes │ Yes Yes Yes Yes Yes Yes Yes - Japanese ja 5 Yes │ Yes Yes - - Yes Yes Yes - Korean ko 4 - │ Yes Yes - - - Yes Yes - Latvian lv 4 - │ Yes Yes - - Yes Yes - - - Macedonian mk 3 - │ Yes Yes Yes - - - - - + Lithuanian lt 3 - │ Yes Yes - - Yes - - - + Macedonian mk 5 Yes │ Yes Yes Yes - Yes Yes - - Malay ms 3 - │ Yes Yes - - - Yes - - - Norwegian nb [2] 4 - │ Yes Yes - - - Yes Yes - - Persian fa 3 - │ Yes Yes - - - Yes - - + Norwegian nb 
[2] 5 Yes │ Yes Yes - - Yes Yes Yes - + Persian fa 4 - │ Yes Yes - - Yes Yes - - Polish pl 6 Yes │ Yes Yes Yes - Yes Yes Yes - Portuguese pt 5 Yes │ Yes Yes Yes - Yes Yes - - - Romanian ro 4 - │ Yes Yes - - Yes Yes - - + Romanian ro 3 - │ Yes Yes - - Yes - - - Russian ru 6 Yes │ Yes Yes Yes Yes Yes Yes - - + Slovak sk 3 - │ Yes Yes - - Yes - - - + Slovenian sl 3 - │ Yes Yes - - Yes - - - Serbian sr [1] 3 - │ Yes Yes - - - Yes - - Spanish es 7 Yes │ Yes Yes Yes Yes Yes Yes Yes - - Swedish sv 4 - │ Yes Yes - - - Yes Yes - - Turkish tr 3 - │ Yes Yes - - - Yes - - - Ukrainian uk 4 - │ Yes Yes - - - Yes Yes - + Swedish sv 5 Yes │ Yes Yes - - Yes Yes Yes - + Tagalog fil 3 - │ Yes Yes - - Yes - - - + Tamil ta 3 - │ Yes - - - Yes Yes - - + Turkish tr 4 - │ Yes Yes - - Yes Yes - - + Ukrainian uk 5 Yes │ Yes Yes - - Yes Yes Yes - + Urdu ur 3 - │ Yes - - - Yes Yes - - + Vietnamese vi 3 - │ Yes Yes - - Yes - - - [1] Bosnian, Croatian, and Serbian use the same underlying word list, because they share most of their vocabulary and grammar, they were once considered the @@ -523,6 +531,12 @@ The same citation in BibTex format: International Conference on Language Resources and Evaluation (LREC 2016). http://stp.lingfil.uu.se/~joerg/paper/opensubs2016.pdf +- Ortiz Suárez, P. J., Sagot, B., and Romary, L. (2019). Asynchronous pipelines + for processing huge corpora on medium to low resource infrastructures. In + Proceedings of the Workshop on Challenges in the Management of Large Corpora + (CMLC-7) 2019. + https://oscar-corpus.com/publication/2019/clmc7/asynchronous/ + - ParaCrawl (2018). Provision of Web-Scale Parallel Corpora for Official European Languages. 
https://paracrawl.eu/ diff --git a/tests/test_general.py b/tests/test_general.py index 920acbb..badbb73 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -72,7 +72,6 @@ def test_most_common_words(): assert get_most_common('fi') == 'ja' assert get_most_common('fil') == 'sa' assert get_most_common('fr') == 'de' - assert get_most_common('gl') == 'de' assert get_most_common('he') == 'את' assert get_most_common('hi') == 'के' assert get_most_common('hu') == 'a' @@ -84,7 +83,6 @@ def test_most_common_words(): assert get_most_common('lt') == 'ir' assert get_most_common('lv') == 'un' assert get_most_common('mk') == 'на' - assert get_most_common('ml') == 'ഒരു' assert get_most_common('ms') == 'yang' assert get_most_common('nb') == 'i' assert get_most_common('nl') == 'de' @@ -96,7 +94,6 @@ def test_most_common_words(): assert get_most_common('sk') == 'a' assert get_most_common('sl') == 'je' assert get_most_common('sv') == 'är' - assert get_most_common('sw') == 'ya' assert get_most_common('ta') == 'ஒரு' assert get_most_common('tr') == 've' assert get_most_common('uk') == 'в' diff --git a/wordfreq/data/small_gl.msgpack.gz b/wordfreq/data/small_gl.msgpack.gz deleted file mode 100644 index bbb89ee..0000000 Binary files a/wordfreq/data/small_gl.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/small_ml.msgpack.gz b/wordfreq/data/small_ml.msgpack.gz deleted file mode 100644 index 9f0dc19..0000000 Binary files a/wordfreq/data/small_ml.msgpack.gz and /dev/null differ diff --git a/wordfreq/data/small_sw.msgpack.gz b/wordfreq/data/small_sw.msgpack.gz deleted file mode 100644 index 3e6ec84..0000000 Binary files a/wordfreq/data/small_sw.msgpack.gz and /dev/null differ