document what this file is for

2024-12-23 09:21:37 +00:00 · 2015-09-22 15:31:27 -04:00 · 2015-09-22 15:31:27 -04:00 · 06f8b29971
commit 06f8b29971
parent 5b918e7bb0
1 changed file with 13 additions and 0 deletions
--- a/scripts/make_chinese_mapping.py
+++ b/scripts/make_chinese_mapping.py
@@ -1,3 +1,16 @@
+"""
+Generate a Python file, _chinese_mapping.py, that maps Traditional Chinese
+characters to their Simplified Chinese equivalents.
+
+This is meant to be a normalization of text, somewhat like case-folding -- not
+actual translation, a task for which this method would be unsuitable. We
+store word frequencies using Simplified Chinese characters so that, in the large
+number of cases where a Traditional Chinese word has an obvious Simplified Chinese
+mapping, we can get a frequency for it that's the same in Simplified and Traditional
+Chinese.
+
+Generating this mapping requires the external Chinese conversion tool OpenCC.
+"""
 import unicodedata
 import itertools
 import os