diff --git a/CHANGELOG.md b/CHANGELOG.md
index 619e4fc..2add68f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,7 +14,7 @@
 - Add automatic transliteration of Serbian text
 - Adjust tokenization of apostrophes next to vowel sounds: the French word
   "l'heure" is now tokenized similarly to "l'arc"
-- Numbers longer than a single digit are smashed into the same word frequency,
+- Multi-digit numbers of each length are smashed into the same word frequency,
   to remove meaningless differences and increase compatibility with word2vec.
   (Internally, their digits are replaced by zeroes.)
 - Another new frequency-merging strategy (drop the highest and lowest,
diff --git a/wordfreq/transliterate.py b/wordfreq/transliterate.py
index 0a059dc..1a0e26b 100644
--- a/wordfreq/transliterate.py
+++ b/wordfreq/transliterate.py
@@ -39,7 +39,7 @@ SR_CYRL_TO_LATN_DICT = {
     # letters surrounded by Latin.
 
     # Russian letters
-    ord('Ё'): 'Jo', ord('ё'): 'Jo',
+    ord('Ё'): 'Jo', ord('ё'): 'jo',
     ord('Й'): 'J', ord('й'): 'j',
     ord('Щ'): 'Šč', ord('щ'): 'šč',
     ord('Ъ'): '', ord('ъ'): '',