mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
v1.7: update tokenization, update data, add bn
and mk
This commit is contained in:
parent
9dac967ca3
commit
46e32fbd36
20
CHANGELOG.md
20
CHANGELOG.md
@ -1,3 +1,23 @@
|
||||
## Version 1.7.0 (2017-08-25)
|
||||
|
||||
- Tokenization will always keep Unicode graphemes together, including
|
||||
complex emoji introduced in Unicode 10
|
||||
- Update the Wikipedia source data to April 2017
|
||||
- Remove some non-words, such as the Unicode replacement character and the
|
||||
pilcrow sign, from frequency lists
|
||||
- Support Bengali and Macedonian, which passed the threshold of having enough
|
||||
source data to be included
|
||||
|
||||
|
||||
## Version 1.6.1 (2017-05-10)
|
||||
|
||||
- Depend on langcodes 1.4, with a new language-matching system that does not
|
||||
depend on SQLite.
|
||||
|
||||
This prevents silly conflicts where langcodes' SQLite connection was
|
||||
preventing langcodes from being used in threads.
|
||||
|
||||
|
||||
## Version 1.6.0 (2017-01-05)
|
||||
|
||||
- Support Czech, Persian, Ukrainian, and Croatian/Bosnian/Serbian
|
||||
|
14
scripts/top_n.py
Normal file
14
scripts/top_n.py
Normal file
@ -0,0 +1,14 @@
|
||||
"""
A quick script to output the top N words (1000 for now) in each language.
You can send the output to a file and diff it to see changes between wordfreq
versions.
"""
import wordfreq


# How many of the most frequent words to print for each language.
N = 1000


# Languages are visited in sorted order so that two runs list them in the
# same sequence, which keeps a diff of the outputs readable.
for lang in sorted(wordfreq.available_languages()):
    # Use N here (not a repeated literal) so that changing the constant
    # above actually changes how many words are printed.
    for word in wordfreq.top_n_list(lang, N):
        # One line per word: language code, a tab, then the word.
        print('{}\t{}'.format(lang, word))
|
@ -35,6 +35,8 @@ LAUGHTER_WORDS = {
|
||||
'he': 'חחח',
|
||||
'bg': 'ахаха',
|
||||
'uk': 'хаха',
|
||||
'bn': 'হা হা',
|
||||
'mk': 'хаха'
|
||||
}
|
||||
|
||||
|
||||
@ -190,7 +192,7 @@ def test_not_really_random():
|
||||
# This not only tests random_ascii_words, it makes sure we didn't end
|
||||
# up with 'eos' as a very common Japanese word
|
||||
eq_(random_ascii_words(nwords=4, lang='ja', bits_per_word=0),
|
||||
'00 00 00 00')
|
||||
'1 1 1 1')
|
||||
|
||||
|
||||
@raises(ValueError)
|
||||
|
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/combined_bn.msgpack.gz
Normal file
BIN
wordfreq/data/combined_bn.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/combined_mk.msgpack.gz
Normal file
BIN
wordfreq/data/combined_mk.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/twitter_bn.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_bn.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue
Block a user