# coding: utf-8
#
# wordfreq/util.py
#
# Provenance (recovered from the git patch that introduced this file):
#   commit  52bcb99c48fbcdff825d215ce7610538019bd60c
#   author  Robyn Speer
#   date    Wed, 30 Oct 2013 18:14:43 -0400
#   subject add util.py, which provides standardize_word
from unicodedata import normalize

from ftfy.fixes import remove_unsafe_private_use


def standardize_word(word):
    u"""
    Return the standardized form of `word`.

    The word is processed in three steps, in this order:

    1. it is passed through ftfy's `remove_unsafe_private_use` (presumably
       this drops problematic private-use codepoints -- see ftfy for the
       exact behavior);
    2. the result is put into Unicode normalization form NFKC, so that
       diacritics end up pre-composed where a composed form exists;
    3. the normalized text is lowercased with `.lower()`.

    In languages where case is relevant, the output is therefore entirely
    lowercase.

    Known language-specific pitfalls:

    - Greek: a word ending in capital "Σ" lowercases to a word ending in
      "ς" on Python 3 but in "σ" on Python 2. (Python 3 is the
      orthographically correct one.) Frequencies of such words, and the
      total number of distinct words, will differ between the two.

    - Turkish: a capital "I" is lowercased to "i" rather than the correct
      "ı". In effect, the capitalized spelling of such a word does not
      share a word count with its lowercase spelling.
    """
    cleaned = remove_unsafe_private_use(word)
    composed = normalize('NFKC', cleaned)
    return composed.lower()