mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
add util.py, which provides standardize_word
This commit is contained in:
parent
5b31bd415f
commit
52bcb99c48
24
wordfreq/util.py
Normal file
24
wordfreq/util.py
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
# coding: utf-8
|
||||||
|
from unicodedata import normalize
|
||||||
|
from ftfy.fixes import remove_unsafe_private_use
|
||||||
|
|
||||||
|
|
||||||
|
def standardize_word(word):
|
||||||
|
u"""
|
||||||
|
Apply various normalizations to the text. In languages where this is
|
||||||
|
relevant, it will end up in all lowercase letters, with pre-composed
|
||||||
|
diacritics.
|
||||||
|
|
||||||
|
Some language-specific gotchas:
|
||||||
|
|
||||||
|
- Words ending with a capital "Σ" in Greek have a lowercase version that
|
||||||
|
ends with "ς" on Python 3, but "σ" on Python 2. (Python 3 is
|
||||||
|
orthographically correct.) This will lead to different frequencies on
|
||||||
|
such Greek words, and different numbers of words in total.
|
||||||
|
|
||||||
|
- Words containing a capital "I" in Turkish will be normalized to a
|
||||||
|
lowercase "i", incorrectly, instead of "ı". The effective result is
|
||||||
|
that the capitalized versions will not share a word count with the
|
||||||
|
lowercase versions.
|
||||||
|
"""
|
||||||
|
return normalize('NFKC', remove_unsafe_private_use(word)).lower()
|
Loading…
Reference in New Issue
Block a user