sort Jieba wordlists consistently; update data files

Former-commit-id: 0ab23f8a28
This commit is contained in:
Robyn Speer 2015-09-08 16:09:15 -04:00
parent 4aef1dc338
commit 48f9d4520c
37 changed files with 2404 additions and 2404 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -171,7 +171,7 @@ def write_jieba(freqs, filename):
tokenization of Chinese. tokenization of Chinese.
""" """
with open(filename, 'w', encoding='utf-8', newline='\n') as outfile: with open(filename, 'w', encoding='utf-8', newline='\n') as outfile:
items = sorted(freqs.items(), key=itemgetter(1), reverse=True) items = sorted(freqs.items(), key=lambda item: (-item[1], item[0]))
for word, freq in items: for word, freq in items:
if HAN_RE.search(word): if HAN_RE.search(word):
# Only store this word as a token if it contains at least one # Only store this word as a token if it contains at least one