mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 01:11:37 +00:00
Merge branch 'data-update-2.5' of github.com:LuminosoInsight/wordfreq into data-update-2.5
This commit is contained in:
commit
b6614c1a33
44
README.md
44
README.md
@ -167,41 +167,49 @@ least 3 different sources of word frequencies:
|
||||
Language Code # Large? WP Subs News Books Web Twit. Redd. Misc.
|
||||
──────────────────────────────┼────────────────────────────────────────────────
|
||||
Arabic ar 5 Yes │ Yes Yes Yes - Yes Yes - -
|
||||
Bengali bn 3 - │ Yes - Yes - - Yes - -
|
||||
Bangla bn 5 Yes │ Yes Yes Yes - Yes Yes - -
|
||||
Bosnian bs [1] 3 - │ Yes Yes - - - Yes - -
|
||||
Bulgarian bg 3 - │ Yes Yes - - - Yes - -
|
||||
Catalan ca 4 - │ Yes Yes Yes - - Yes - -
|
||||
Bulgarian bg 4 - │ Yes Yes - - Yes Yes - -
|
||||
Catalan ca 5 Yes │ Yes Yes Yes - Yes Yes - -
|
||||
Chinese zh [3] 7 Yes │ Yes Yes Yes Yes Yes Yes - Jieba
|
||||
Croatian hr [1] 3 - │ Yes Yes - - - Yes - -
|
||||
Czech cs 5 Yes │ Yes Yes Yes - Yes Yes - -
|
||||
Danish da 3 - │ Yes Yes - - - Yes - -
|
||||
Danish da 4 - │ Yes Yes - - Yes Yes - -
|
||||
Dutch nl 5 Yes │ Yes Yes Yes - Yes Yes - -
|
||||
English en 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
|
||||
Finnish fi 6 Yes │ Yes Yes Yes - Yes Yes Yes -
|
||||
French fr 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
|
||||
German de 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
|
||||
Greek el 3 - │ Yes Yes - - Yes - - -
|
||||
Hebrew he 4 - │ Yes Yes - Yes - Yes - -
|
||||
Hindi hi 3 - │ Yes - - - - Yes Yes -
|
||||
Hungarian hu 3 - │ Yes Yes - - Yes - - -
|
||||
Greek el 4 - │ Yes Yes - - Yes Yes - -
|
||||
Hebrew he 5 Yes │ Yes Yes - Yes Yes Yes - -
|
||||
Hindi hi 4 Yes │ Yes - - - Yes Yes Yes -
|
||||
Hungarian hu 4 - │ Yes Yes - - Yes Yes - -
|
||||
Icelandic is 3 - │ Yes Yes - - Yes - - -
|
||||
Indonesian id 3 - │ Yes Yes - - - Yes - -
|
||||
Italian it 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
|
||||
Japanese ja 5 Yes │ Yes Yes - - Yes Yes Yes -
|
||||
Korean ko 4 - │ Yes Yes - - - Yes Yes -
|
||||
Latvian lv 4 - │ Yes Yes - - Yes Yes - -
|
||||
Macedonian mk 3 - │ Yes Yes Yes - - - - -
|
||||
Lithuanian lt 3 - │ Yes Yes - - Yes - - -
|
||||
Macedonian mk 5 Yes │ Yes Yes Yes - Yes Yes - -
|
||||
Malay ms 3 - │ Yes Yes - - - Yes - -
|
||||
Norwegian nb [2] 4 - │ Yes Yes - - - Yes Yes -
|
||||
Persian fa 3 - │ Yes Yes - - - Yes - -
|
||||
Norwegian nb [2] 5 Yes │ Yes Yes - - Yes Yes Yes -
|
||||
Persian fa 4 - │ Yes Yes - - Yes Yes - -
|
||||
Polish pl 6 Yes │ Yes Yes Yes - Yes Yes Yes -
|
||||
Portuguese pt 5 Yes │ Yes Yes Yes - Yes Yes - -
|
||||
Romanian ro 4 - │ Yes Yes - - Yes Yes - -
|
||||
Romanian ro 3 - │ Yes Yes - - Yes - - -
|
||||
Russian ru 6 Yes │ Yes Yes Yes Yes Yes Yes - -
|
||||
Slovak sk 3 - │ Yes Yes - - Yes - - -
|
||||
Slovenian sl 3 - │ Yes Yes - - Yes - - -
|
||||
Serbian sr [1] 3 - │ Yes Yes - - - Yes - -
|
||||
Spanish es 7 Yes │ Yes Yes Yes Yes Yes Yes Yes -
|
||||
Swedish sv 4 - │ Yes Yes - - - Yes Yes -
|
||||
Turkish tr 3 - │ Yes Yes - - - Yes - -
|
||||
Ukrainian uk 4 - │ Yes Yes - - - Yes Yes -
|
||||
Swedish sv 5 Yes │ Yes Yes - - Yes Yes Yes -
|
||||
Tagalog fil 3 - │ Yes Yes - - Yes - - -
|
||||
Tamil ta 3 - │ Yes - - - Yes Yes - -
|
||||
Turkish tr 4 - │ Yes Yes - - Yes Yes - -
|
||||
Ukrainian uk 5 Yes │ Yes Yes - - Yes Yes Yes -
|
||||
Urdu ur 3 - │ Yes - - - Yes Yes - -
|
||||
Vietnamese vi 3 - │ Yes Yes - - Yes - - -
|
||||
|
||||
[1] Bosnian, Croatian, and Serbian use the same underlying word list, because
|
||||
they share most of their vocabulary and grammar, they were once considered the
|
||||
@ -523,6 +531,12 @@ The same citation in BibTex format:
|
||||
International Conference on Language Resources and Evaluation (LREC 2016).
|
||||
http://stp.lingfil.uu.se/~joerg/paper/opensubs2016.pdf
|
||||
|
||||
- Ortiz Suárez, P. J., Sagot, B., and Romary, L. (2019). Asynchronous pipelines
|
||||
for processing huge corpora on medium to low resource infrastructures. In
|
||||
Proceedings of the Workshop on Challenges in the Management of Large Corpora
|
||||
(CMLC-7) 2019.
|
||||
https://oscar-corpus.com/publication/2019/clmc7/asynchronous/
|
||||
|
||||
- ParaCrawl (2018). Provision of Web-Scale Parallel Corpora for Official
|
||||
European Languages. https://paracrawl.eu/
|
||||
|
||||
|
@ -72,7 +72,6 @@ def test_most_common_words():
|
||||
assert get_most_common('fi') == 'ja'
|
||||
assert get_most_common('fil') == 'sa'
|
||||
assert get_most_common('fr') == 'de'
|
||||
assert get_most_common('gl') == 'de'
|
||||
assert get_most_common('he') == 'את'
|
||||
assert get_most_common('hi') == 'के'
|
||||
assert get_most_common('hu') == 'a'
|
||||
@ -84,7 +83,6 @@ def test_most_common_words():
|
||||
assert get_most_common('lt') == 'ir'
|
||||
assert get_most_common('lv') == 'un'
|
||||
assert get_most_common('mk') == 'на'
|
||||
assert get_most_common('ml') == 'ഒരു'
|
||||
assert get_most_common('ms') == 'yang'
|
||||
assert get_most_common('nb') == 'i'
|
||||
assert get_most_common('nl') == 'de'
|
||||
@ -96,7 +94,6 @@ def test_most_common_words():
|
||||
assert get_most_common('sk') == 'a'
|
||||
assert get_most_common('sl') == 'je'
|
||||
assert get_most_common('sv') == 'är'
|
||||
assert get_most_common('sw') == 'ya'
|
||||
assert get_most_common('ta') == 'ஒரு'
|
||||
assert get_most_common('tr') == 've'
|
||||
assert get_most_common('uk') == 'в'
|
||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue
Block a user