From 52bcb99c48fbcdff825d215ce7610538019bd60c Mon Sep 17 00:00:00 2001
From: Robyn Speer <rspeer@luminoso.com>
Date: Wed, 30 Oct 2013 18:14:43 -0400
Subject: [PATCH] add util.py, which provides standardize_word

---
 wordfreq/util.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 wordfreq/util.py

diff --git a/wordfreq/util.py b/wordfreq/util.py
new file mode 100644
index 0000000..6072281
--- /dev/null
+++ b/wordfreq/util.py
@@ -0,0 +1,24 @@
+# coding: utf-8
+from unicodedata import normalize
+from ftfy.fixes import remove_unsafe_private_use
+
+
+def standardize_word(word):
+    u"""
+    Standardize a word: run it through ftfy's remove_unsafe_private_use,
+    apply NFKC normalization, and lowercase the result. Where relevant,
+    this yields all-lowercase letters with pre-composed diacritics.
+
+    Known language-specific caveats:
+
+    - Greek: a word-final capital "Σ" lowercases to "ς" on Python 3 but to
+      "σ" on Python 2, so frequencies and total word counts for such words
+      differ between the two. (Python 3 is orthographically correct.)
+
+    - Turkish: a capital "I" lowercases to "i" rather than the correct "ı",
+      so capitalized forms do not share a word count with their lowercase
+      forms.
+    """
+    cleaned = remove_unsafe_private_use(word)
+    composed = normalize('NFKC', cleaned)
+    return composed.lower()