mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
parent
6802a4f89d
commit
fe8a6b51e7
@ -1,3 +1,16 @@
|
|||||||
|
"""
|
||||||
|
Generate a Python file, _chinese_mapping.py, that maps Traditional Chinese
|
||||||
|
characters to their Simplified Chinese equivalents.
|
||||||
|
|
||||||
|
This is meant to be a normalization of text, somewhat like case-folding -- not
|
||||||
|
an actual translator, a task for which this method would be unsuitable. We
|
||||||
|
store word frequencies using Simplified Chinese characters so that, in the large
|
||||||
|
number of cases where a Traditional Chinese word has an obvious Simplified Chinese
|
||||||
|
mapping, we can get a frequency for it that's the same in Simplified and Traditional
|
||||||
|
Chinese.
|
||||||
|
|
||||||
|
Generating this mapping requires the external Chinese conversion tool OpenCC.
|
||||||
|
"""
|
||||||
import unicodedata
|
import unicodedata
|
||||||
import itertools
|
import itertools
|
||||||
import os
|
import os
|
||||||
|
Loading…
Reference in New Issue
Block a user