From 99eac54b310309702abc9d91f001ce75b70f4377 Mon Sep 17 00:00:00 2001
From: Rob Speer
Date: Wed, 4 Jan 2017 18:51:00 -0500
Subject: [PATCH] transliterate: Handle unexpected Russian invasions

Text tagged as Serbian sometimes contains Cyrillic letters that are not
part of the Serbian alphabet (Russian, Ukrainian, Macedonian). Extend the
Serbian Cyrillic-to-Latin table with approximate Latin equivalents for
those letters, so that transliteration does not produce mixed-script
words such as "pacanы".

Each pair maps uppercase to uppercase and lowercase to lowercase; in
particular, lowercase 'ё' maps to 'jo', not 'Jo'.
---
 tests/test_serbian.py     | 13 +++++++++++++
 wordfreq/transliterate.py | 23 +++++++++++++++++++++++
 2 files changed, 36 insertions(+)

diff --git a/tests/test_serbian.py b/tests/test_serbian.py
index bc64389..7d33367 100644
--- a/tests/test_serbian.py
+++ b/tests/test_serbian.py
@@ -10,3 +10,16 @@ def test_transliteration():
     eq_(tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr'),
         ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
 
+
+def test_actually_russian():
+    # This looks mostly like Serbian, but was probably actually Russian.
+    # In Russian, Google Translate says it means:
+    # "a hundred out of a hundred, boys!"
+    #
+    # We make sure to handle this case so we don't end up with a mixed-script
+    # word like "pacanы".
+
+    eq_(tokenize("сто из ста, пацаны!", 'sr'),
+        ['sto', 'iz', 'sta', 'pacany'])
+
+    eq_(tokenize("культуры", 'sr'), ["kul'tury"])
diff --git a/wordfreq/transliterate.py b/wordfreq/transliterate.py
index 6c5224c..ba676c1 100644
--- a/wordfreq/transliterate.py
+++ b/wordfreq/transliterate.py
@@ -31,6 +31,29 @@ SR_CYRL_TO_LATN_DICT = {
     ord('Ч'): 'Č', ord('ч'): 'č',
     ord('Џ'): 'Dž', ord('џ'): 'dž',
     ord('Ш'): 'Š', ord('ш'): 'š',
+
+    # Handle borrowed letters from Russian
+    ord('Ё'): 'Jo', ord('ё'): 'jo',
+    ord('Й'): 'J', ord('й'): 'j',
+    ord('Ў'): 'U', ord('ў'): 'u',
+    ord('Щ'): 'Šč', ord('щ'): 'šč',
+    ord('Ъ'): '', ord('ъ'): '',
+    ord('Ы'): 'Y', ord('ы'): 'y',
+    ord('Ь'): "'", ord('ь'): "'",
+    ord('Э'): 'E', ord('э'): 'e',
+    ord('Ю'): 'Ju', ord('ю'): 'ju',
+    ord('Я'): 'Ja', ord('я'): 'ja',
+
+    # Handle borrowed letters from Ukrainian
+    ord('Є'): 'Je', ord('є'): 'je',
+    ord('І'): 'I', ord('і'): 'i',
+    ord('Ї'): 'Ji', ord('ї'): 'ji',
+    ord('Ґ'): 'G', ord('ґ'): 'g',
+
+    # Handle borrowed letters from Macedonian
+    ord('Ѕ'): 'Dz', ord('ѕ'): 'dz',
+    ord('Ѓ'): 'Ǵ', ord('ѓ'): 'ǵ',
+    ord('Ќ'): 'Ḱ', ord('ќ'): 'ḱ',
 }
 
 