From 06f8b299712a5952c5847ece8b13c1a18fcc2ed0 Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Tue, 22 Sep 2015 15:31:27 -0400 Subject: [PATCH] document what this file is for --- scripts/make_chinese_mapping.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/scripts/make_chinese_mapping.py b/scripts/make_chinese_mapping.py index 19b7826..9855e18 100644 --- a/scripts/make_chinese_mapping.py +++ b/scripts/make_chinese_mapping.py @@ -1,3 +1,16 @@ +""" +Generate a Python file, _chinese_mapping.py, that maps Traditional Chinese +characters to their Simplified Chinese equivalents. + +This is meant to be a normalization of text, somewhat like case-folding -- not +an actual translator, a task for which this method would be unsuitable. We +store word frequencies using Simplified Chinese characters so that, in the large +number of cases where a Traditional Chinese word has an obvious Simplified Chinese +mapping, we can get a frequency for it that's the same in Simplified and Traditional +Chinese. + +Generating this mapping requires the external Chinese conversion tool OpenCC. +""" import unicodedata import itertools import os