diff --git a/tests/test_serbian.py b/tests/test_serbian.py
index bc64389..7d33367 100644
--- a/tests/test_serbian.py
+++ b/tests/test_serbian.py
@@ -10,3 +10,16 @@ def test_transliteration():
 
     eq_(tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr'),
         ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
+
+def test_actually_russian():
+    # This input resembles Serbian but is most likely Russian. According to
+    # Google Translate, the Russian reading is:
+    #   "a hundred out of a hundred, boys!"
+    #
+    # Russian-only letters such as "ы" must be transliterated too, so that we
+    # never produce a mixed-script word like "pacanы".
+
+    eq_(tokenize("сто из ста, пацаны!", 'sr'),
+        ['sto', 'iz', 'sta', 'pacany'])
+
+    eq_(tokenize("культуры", 'sr'), ["kul'tury"])
diff --git a/wordfreq/transliterate.py b/wordfreq/transliterate.py
index 6c5224c..57046d8 100644
--- a/wordfreq/transliterate.py
+++ b/wordfreq/transliterate.py
@@ -31,6 +31,37 @@ SR_CYRL_TO_LATN_DICT = {
     ord('Ч'): 'Č', ord('ч'): 'č',
     ord('Џ'): 'Dž', ord('џ'): 'dž',
     ord('Ш'): 'Š', ord('ш'): 'š',
+
+    # Cyrillic letters that belong to other languages' alphabets. Ideally they
+    # are rare in text we transliterate as Serbian, but loan-words and
+    # code-switching do occur. Mapping these letters approximately is better
+    # than leaving stray Cyrillic characters embedded in otherwise Latin
+    # output.
+
+    # Russian letters (a lowercase letter always maps to a lowercase result)
+    ord('Ё'): 'Jo', ord('ё'): 'jo',
+    ord('Й'): 'J', ord('й'): 'j',
+    ord('Щ'): 'Šč', ord('щ'): 'šč',
+    ord('Ъ'): '', ord('ъ'): '',  # hard sign: dropped from the output
+    ord('Ы'): 'Y', ord('ы'): 'y',
+    ord('Ь'): "'", ord('ь'): "'",  # soft sign: rendered as an apostrophe
+    ord('Э'): 'E', ord('э'): 'e',
+    ord('Ю'): 'Ju', ord('ю'): 'ju',
+    ord('Я'): 'Ja', ord('я'): 'ja',
+
+    # Belarusian letter
+    ord('Ў'): 'Ŭ', ord('ў'): 'ŭ',
+
+    # Ukrainian letters
+    ord('Є'): 'Je', ord('є'): 'je',
+    ord('І'): 'I', ord('і'): 'i',
+    ord('Ї'): 'Ji', ord('ї'): 'ji',
+    ord('Ґ'): 'G', ord('ґ'): 'g',
+
+    # Macedonian letters
+    ord('Ѕ'): 'Dz', ord('ѕ'): 'dz',
+    ord('Ѓ'): 'Ǵ', ord('ѓ'): 'ǵ',
+    ord('Ќ'): 'Ḱ', ord('ќ'): 'ḱ',
 }
 
 