Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 17:31:41 +00:00)
sort Jieba wordlists consistently; update data files
Former-commit-id: 0ab23f8a28
parent 4aef1dc338
commit 48f9d4520c
Binary files not shown.
BIN  wordfreq/data/combined_pl.msgpack.gz  Normal file
Binary files not shown.
BIN  wordfreq/data/combined_sv.msgpack.gz  Normal file
Binary files not shown.
File diff suppressed because it is too large.
Binary files not shown.
BIN  wordfreq/data/twitter_pl.msgpack.gz  Normal file
Binary files not shown.
BIN  wordfreq/data/twitter_sv.msgpack.gz  Normal file
Binary files not shown.
@@ -171,7 +171,7 @@ def write_jieba(freqs, filename):
     tokenization of Chinese.
     """
     with open(filename, 'w', encoding='utf-8', newline='\n') as outfile:
-        items = sorted(freqs.items(), key=itemgetter(1), reverse=True)
+        items = sorted(freqs.items(), key=lambda item: (-item[1], item[0]))
         for word, freq in items:
             if HAN_RE.search(word):
                 # Only store this word as a token if it contains at least one
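The functional change is the sort key in write_jieba: instead of ordering entries by descending frequency alone, they are ordered by descending frequency and then by the word itself, so ties no longer depend on the input dict's iteration order (which is what made the generated Jieba wordlists inconsistent between builds). Below is a minimal sketch of the difference; the frequencies are made up for illustration and are not taken from the real builder.

# Sketch (not part of the commit): why the new key gives a reproducible order.
# The frequencies below are hypothetical.
from operator import itemgetter

freqs = {'北京': 0.001, '中国': 0.002, '上海': 0.001}

# Old key: descending frequency only. The two tied words keep whatever
# relative order they had in the input dict, which can differ between builds.
old_order = sorted(freqs.items(), key=itemgetter(1), reverse=True)

# New key: descending frequency, then the word as a tie-breaker, so equal
# frequencies always come out in the same (code point) order.
new_order = sorted(freqs.items(), key=lambda item: (-item[1], item[0]))

print(old_order)  # tied words in input order: [('中国', 0.002), ('北京', 0.001), ('上海', 0.001)]
print(new_order)  # always: [('中国', 0.002), ('上海', 0.001), ('北京', 0.001)]

With the old key, the same frequency counts could yield differently ordered wordlist files depending on how the dict was assembled; the tuple key makes the output deterministic, which matches the commit title "sort Jieba wordlists consistently".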