mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
sort Jieba wordlists consistently; update data files
This commit is contained in:
parent
bc8ebd23e9
commit
0ab23f8a28
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/combined_pl.msgpack.gz
Normal file
BIN
wordfreq/data/combined_pl.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/combined_sv.msgpack.gz
Normal file
BIN
wordfreq/data/combined_sv.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/twitter_pl.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_pl.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq/data/twitter_sv.msgpack.gz
Normal file
BIN
wordfreq/data/twitter_sv.msgpack.gz
Normal file
Binary file not shown.
Binary file not shown.
@ -171,7 +171,7 @@ def write_jieba(freqs, filename):
|
||||
tokenization of Chinese.
|
||||
"""
|
||||
with open(filename, 'w', encoding='utf-8', newline='\n') as outfile:
|
||||
items = sorted(freqs.items(), key=itemgetter(1), reverse=True)
|
||||
items = sorted(freqs.items(), key=lambda item: (-item[1], item[0]))
|
||||
for word, freq in items:
|
||||
if HAN_RE.search(word):
|
||||
# Only store this word as a token if it contains at least one
|
||||
|
Loading…
Reference in New Issue
Block a user