mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
Merge branch 'data-update-2.5' of github.com:LuminosoInsight/wordfreq into data-update-2.5
This commit is contained in:
commit
b6614c1a33
44
README.md
44
README.md
@ -167,41 +167,49 @@ least 3 different sources of word frequencies:
|
|||||||
Language Code # Large? WP Subs News Books Web Twit. Redd. Misc.
|
Language Code # Large? WP Subs News Books Web Twit. Redd. Misc.
|
||||||
──────────────────────────────┼────────────────────────────────────────────────
|
──────────────────────────────┼────────────────────────────────────────────────
|
||||||
Arabic ar 5 Yes │ Yes Yes Yes - Yes Yes - -
|
Arabic ar 5 Yes │ Yes Yes Yes - Yes Yes - -
|
||||||
Bengali bn 3 - │ Yes - Yes - - Yes - -
|
Bangla bn 5 Yes │ Yes Yes Yes - Yes Yes - -
|
||||||
Bosnian bs [1] 3 - │ Yes Yes - - - Yes - -
|
Bosnian bs [1] 3 - │ Yes Yes - - - Yes - -
|
||||||
Bulgarian bg 3 - │ Yes Yes - - - Yes - -
|
Bulgarian bg 4 - │ Yes Yes - - Yes Yes - -
|
||||||
Catalan ca 4 - │ Yes Yes Yes - - Yes - -
|
Catalan ca 5 Yes │ Yes Yes Yes - Yes Yes - -
|
||||||
Chinese zh [3] 7 Yes │ Yes Yes Yes Yes Yes Yes - Jieba
|
Chinese zh [3] 7 Yes │ Yes Yes Yes Yes Yes Yes - Jieba
|
||||||
Croatian hr [1] 3 - │ Yes Yes - - - Yes - -
|
Croatian hr [1] 3 - │ Yes Yes - - - Yes - -
|
||||||
Czech cs 5 Yes │ Yes Yes Yes - Yes Yes - -
|
Czech cs 5 Yes │ Yes Yes Yes - Yes Yes - -
|
||||||
Danish da 3 - │ Yes Yes - - - Yes - -
|
Danish da 4 - │ Yes Yes - - Yes Yes - -
|
||||||
Dutch nl 5 Yes │ Yes Yes Yes - Yes Yes - -
|
Dutch nl 5 Yes │ Yes Yes Yes - Yes Yes - -
|
||||||
English en 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
|
English en 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
|
||||||
Finnish fi 6 Yes │ Yes Yes Yes - Yes Yes Yes -
|
Finnish fi 6 Yes │ Yes Yes Yes - Yes Yes Yes -
|
||||||
French fr 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
|
French fr 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
|
||||||
German de 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
|
German de 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
|
||||||
Greek el 3 - │ Yes Yes - - Yes - - -
|
Greek el 4 - │ Yes Yes - - Yes Yes - -
|
||||||
Hebrew he 4 - │ Yes Yes - Yes - Yes - -
|
Hebrew he 5 Yes │ Yes Yes - Yes Yes Yes - -
|
||||||
Hindi hi 3 - │ Yes - - - - Yes Yes -
|
Hindi hi 4 Yes │ Yes - - - Yes Yes Yes -
|
||||||
Hungarian hu 3 - │ Yes Yes - - Yes - - -
|
Hungarian hu 4 - │ Yes Yes - - Yes Yes - -
|
||||||
|
Icelandic is 3 - │ Yes Yes - - Yes - - -
|
||||||
Indonesian id 3 - │ Yes Yes - - - Yes - -
|
Indonesian id 3 - │ Yes Yes - - - Yes - -
|
||||||
Italian it 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
|
Italian it 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
|
||||||
Japanese ja 5 Yes │ Yes Yes - - Yes Yes Yes -
|
Japanese ja 5 Yes │ Yes Yes - - Yes Yes Yes -
|
||||||
Korean ko 4 - │ Yes Yes - - - Yes Yes -
|
Korean ko 4 - │ Yes Yes - - - Yes Yes -
|
||||||
Latvian lv 4 - │ Yes Yes - - Yes Yes - -
|
Latvian lv 4 - │ Yes Yes - - Yes Yes - -
|
||||||
Macedonian mk 3 - │ Yes Yes Yes - - - - -
|
Lithuanian lt 3 - │ Yes Yes - - Yes - - -
|
||||||
|
Macedonian mk 5 Yes │ Yes Yes Yes - Yes Yes - -
|
||||||
Malay ms 3 - │ Yes Yes - - - Yes - -
|
Malay ms 3 - │ Yes Yes - - - Yes - -
|
||||||
Norwegian nb [2] 4 - │ Yes Yes - - - Yes Yes -
|
Norwegian nb [2] 5 Yes │ Yes Yes - - Yes Yes Yes -
|
||||||
Persian fa 3 - │ Yes Yes - - - Yes - -
|
Persian fa 4 - │ Yes Yes - - Yes Yes - -
|
||||||
Polish pl 6 Yes │ Yes Yes Yes - Yes Yes Yes -
|
Polish pl 6 Yes │ Yes Yes Yes - Yes Yes Yes -
|
||||||
Portuguese pt 5 Yes │ Yes Yes Yes - Yes Yes - -
|
Portuguese pt 5 Yes │ Yes Yes Yes - Yes Yes - -
|
||||||
Romanian ro 4 - │ Yes Yes - - Yes Yes - -
|
Romanian ro 3 - │ Yes Yes - - Yes - - -
|
||||||
Russian ru 6 Yes │ Yes Yes Yes Yes Yes Yes - -
|
Russian ru 6 Yes │ Yes Yes Yes Yes Yes Yes - -
|
||||||
|
Slovak sk 3 - │ Yes Yes - - Yes - - -
|
||||||
|
Slovenian sl 3 - │ Yes Yes - - Yes - - -
|
||||||
Serbian sr [1] 3 - │ Yes Yes - - - Yes - -
|
Serbian sr [1] 3 - │ Yes Yes - - - Yes - -
|
||||||
Spanish es 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
|
Spanish es 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
|
||||||
Swedish sv 4 - │ Yes Yes - - - Yes Yes -
|
Swedish sv 5 Yes │ Yes Yes - - Yes Yes Yes -
|
||||||
Turkish tr 3 - │ Yes Yes - - - Yes - -
|
Tagalog fil 3 - │ Yes Yes - - Yes - - -
|
||||||
Ukrainian uk 4 - │ Yes Yes - - - Yes Yes -
|
Tamil ta 3 - │ Yes - - - Yes Yes - -
|
||||||
|
Turkish tr 4 - │ Yes Yes - - Yes Yes - -
|
||||||
|
Ukrainian uk 5 Yes │ Yes Yes - - Yes Yes Yes -
|
||||||
|
Urdu ur 3 - │ Yes - - - Yes Yes - -
|
||||||
|
Vietnamese vi 3 - │ Yes Yes - - Yes - - -
|
||||||
|
|
||||||
[1] Bosnian, Croatian, and Serbian use the same underlying word list, because
|
[1] Bosnian, Croatian, and Serbian use the same underlying word list, because
|
||||||
they share most of their vocabulary and grammar, they were once considered the
|
they share most of their vocabulary and grammar, they were once considered the
|
||||||
@ -523,6 +531,12 @@ The same citation in BibTex format:
|
|||||||
International Conference on Language Resources and Evaluation (LREC 2016).
|
International Conference on Language Resources and Evaluation (LREC 2016).
|
||||||
http://stp.lingfil.uu.se/~joerg/paper/opensubs2016.pdf
|
http://stp.lingfil.uu.se/~joerg/paper/opensubs2016.pdf
|
||||||
|
|
||||||
|
- Ortiz Suárez, P. J., Sagot, B., and Romary, L. (2019). Asynchronous pipelines
|
||||||
|
for processing huge corpora on medium to low resource infrastructures. In
|
||||||
|
Proceedings of the Workshop on Challenges in the Management of Large Corpora
|
||||||
|
(CMLC-7) 2019.
|
||||||
|
https://oscar-corpus.com/publication/2019/clmc7/asynchronous/
|
||||||
|
|
||||||
- ParaCrawl (2018). Provision of Web-Scale Parallel Corpora for Official
|
- ParaCrawl (2018). Provision of Web-Scale Parallel Corpora for Official
|
||||||
European Languages. https://paracrawl.eu/
|
European Languages. https://paracrawl.eu/
|
||||||
|
|
||||||
|
@ -72,7 +72,6 @@ def test_most_common_words():
|
|||||||
assert get_most_common('fi') == 'ja'
|
assert get_most_common('fi') == 'ja'
|
||||||
assert get_most_common('fil') == 'sa'
|
assert get_most_common('fil') == 'sa'
|
||||||
assert get_most_common('fr') == 'de'
|
assert get_most_common('fr') == 'de'
|
||||||
assert get_most_common('gl') == 'de'
|
|
||||||
assert get_most_common('he') == 'את'
|
assert get_most_common('he') == 'את'
|
||||||
assert get_most_common('hi') == 'के'
|
assert get_most_common('hi') == 'के'
|
||||||
assert get_most_common('hu') == 'a'
|
assert get_most_common('hu') == 'a'
|
||||||
@ -84,7 +83,6 @@ def test_most_common_words():
|
|||||||
assert get_most_common('lt') == 'ir'
|
assert get_most_common('lt') == 'ir'
|
||||||
assert get_most_common('lv') == 'un'
|
assert get_most_common('lv') == 'un'
|
||||||
assert get_most_common('mk') == 'на'
|
assert get_most_common('mk') == 'на'
|
||||||
assert get_most_common('ml') == 'ഒരു'
|
|
||||||
assert get_most_common('ms') == 'yang'
|
assert get_most_common('ms') == 'yang'
|
||||||
assert get_most_common('nb') == 'i'
|
assert get_most_common('nb') == 'i'
|
||||||
assert get_most_common('nl') == 'de'
|
assert get_most_common('nl') == 'de'
|
||||||
@ -96,7 +94,6 @@ def test_most_common_words():
|
|||||||
assert get_most_common('sk') == 'a'
|
assert get_most_common('sk') == 'a'
|
||||||
assert get_most_common('sl') == 'je'
|
assert get_most_common('sl') == 'je'
|
||||||
assert get_most_common('sv') == 'är'
|
assert get_most_common('sv') == 'är'
|
||||||
assert get_most_common('sw') == 'ya'
|
|
||||||
assert get_most_common('ta') == 'ஒரு'
|
assert get_most_common('ta') == 'ஒரு'
|
||||||
assert get_most_common('tr') == 've'
|
assert get_most_common('tr') == 've'
|
||||||
assert get_most_common('uk') == 'в'
|
assert get_most_common('uk') == 'в'
|
||||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue
Block a user