add util.py, which provides standardize_word

2024-12-23 17:31:41 +00:00 · 2013-10-30 18:14:43 -04:00 · 2013-10-30 18:14:43 -04:00 · 52bcb99c48
commit 52bcb99c48
parent 5b31bd415f
1 changed files with 24 additions and 0 deletions
--- a/wordfreq/util.py
+++ b/wordfreq/util.py
@@ -0,0 +1,24 @@
 # coding: utf-8
 from unicodedata import normalize
 from ftfy.fixes import remove_unsafe_private_use
 def standardize_word(word):
    u"""
    Normalize a word to a standard form.

    The word goes through three steps, in this order:

    1. ftfy's `remove_unsafe_private_use` is applied to it.
    2. The result is Unicode-normalized to NFKC, so diacritics come out
       pre-composed.
    3. The normalized text is lowercased (a no-op for scripts without
       letter case).

    Known language-specific limitations:

    - Greek: a word ending in capital "Σ" lowercases to a final "ς" on
      Python 3 but to "σ" on Python 2. Python 3 is the orthographically
      correct one. As a consequence, the frequencies of such words, and
      the total number of distinct words, differ between the two versions.
    - Turkish: a capital "I" is lowercased to "i" rather than the correct
      "ı", so capitalized forms of these words do not share a word count
      with their lowercase forms.
    """
    # Step 1: let ftfy process the raw word before any normalization.
    cleaned = remove_unsafe_private_use(word)
    # Step 2: compatibility-compose so equivalent spellings collapse together.
    composed = normalize('NFKC', cleaned)
    # Step 3: case-fold via plain lowercasing (see the limitations above).
    return composed.lower()