v1.7: update tokenization, update data, add bn and mk

This commit is contained in:
Robyn Speer 2017-08-25 17:37:48 -04:00
parent 9dac967ca3
commit 46e32fbd36
80 changed files with 25643 additions and 25456 deletions

View File

@ -1,3 +1,23 @@
## Version 1.7.0 (2017-08-25)
- Tokenization will always keep Unicode graphemes together, including
complex emoji introduced in Unicode 10
- Update the Wikipedia source data to April 2017
- Remove some non-words, such as the Unicode replacement character and the
pilcrow sign, from frequency lists
- Support Bengali and Macedonian, which passed the threshold of having enough
source data to be included
## Version 1.6.1 (2017-05-10)
- Depend on langcodes 1.4, with a new language-matching system that does not
depend on SQLite.
This prevents silly conflicts where langcodes' SQLite connection was
preventing langcodes from being used in threads.
## Version 1.6.0 (2017-01-05)
- Support Czech, Persian, Ukrainian, and Croatian/Bosnian/Serbian

14
scripts/top_n.py Normal file
View File

@ -0,0 +1,14 @@
"""
A quick script to output the top N words (1000 for now) in each language.
You can send the output to a file and diff it to see changes between wordfreq
versions.
"""
import wordfreq
# Number of top-ranked words to print for each language. Change this one
# value to adjust the size of the dump.
N = 1000

# Emit one "<lang>\t<word>" line per entry, with languages in sorted order,
# so that the output produced by two wordfreq versions can be compared with
# a plain text diff.
for lang in sorted(wordfreq.available_languages()):
    for word in wordfreq.top_n_list(lang, N):
        print('{}\t{}'.format(lang, word))

View File

@ -35,6 +35,8 @@ LAUGHTER_WORDS = {
'he': 'חחח',
'bg': 'ахаха',
'uk': 'хаха',
'bn': 'হা হা',
'mk': 'хаха'
}
@ -190,7 +192,7 @@ def test_not_really_random():
# This not only tests random_ascii_words, it makes sure we didn't end
# up with 'eos' as a very common Japanese word
eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),
'00 00 00 00')
'1 1 1 1')
@raises(ValueError)

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.