add util.py, which provides standardize_word

2024-12-23 09:21:37 +00:00 · 2013-10-30 18:14:43 -04:00 · 2013-10-30 18:14:43 -04:00 · 52bcb99c48
commit 52bcb99c48
parent 5b31bd415f
1 changed files with 24 additions and 0 deletions
--- a/wordfreq/util.py
+++ b/wordfreq/util.py
@@ -0,0 +1,24 @@
+# coding: utf-8
+from unicodedata import normalize
+from ftfy.fixes import remove_unsafe_private_use
+
+
+def standardize_word(word):
+    u"""
+    Normalize a word: run it through ftfy's `remove_unsafe_private_use`,
+    apply Unicode NFKC normalization, and lowercase the result. Where
+    relevant, this yields lowercase letters with pre-composed diacritics.
+
+    Known language-specific caveats:
+
+    - Greek: a word-final capital "Σ" lowercases to "ς" on Python 3 but to
+      "σ" on Python 2 (Python 3 is orthographically correct), so frequencies
+      and total word counts for such words differ between the two.
+
+    - Turkish: a capital "I" is lowercased to "i", incorrectly, instead of
+      "ı", so capitalized forms do not share a word count with their
+      lowercase forms.
+    """
+    cleaned = remove_unsafe_private_use(word)
+    composed = normalize('NFKC', cleaned)
+    return composed.lower()