mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
transliterate: Handle unexpected Russian invasions
This commit is contained in:
parent
6211b35fb3
commit
87b03325db
@ -10,3 +10,16 @@ def test_transliteration():
|
||||
eq_(tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr'),
|
||||
['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
|
||||
|
||||
|
||||
def test_actually_russian():
    # Text tagged as Serbian is sometimes really Russian. The first phrase
    # below looks mostly like Serbian but was probably Russian; according to
    # Google Translate it means "a hundred out of a hundred, boys!".
    #
    # Both phrases contain Cyrillic letters that Serbian does not use. They
    # must still be transliterated, otherwise we would end up with a
    # mixed-script word like "pacanы".
    cases = [
        ("сто из ста, пацаны!", ['sto', 'iz', 'sta', 'pacany']),
        ("культуры", ["kul'tury"]),
    ]
    for text, expected in cases:
        eq_(tokenize(text, 'sr'), expected)
|
||||
|
@ -31,6 +31,29 @@ SR_CYRL_TO_LATN_DICT = {
|
||||
ord('Ч'): 'Č', ord('ч'): 'č',
|
||||
ord('Џ'): 'Dž', ord('џ'): 'dž',
|
||||
ord('Ш'): 'Š', ord('ш'): 'š',
|
||||
|
||||
# Handle borrowed letters from Russian
|
||||
ord('Ё'): 'Jo', ord('ё'): 'jo',
|
||||
ord('Й'): 'J', ord('й'): 'j',
|
||||
ord('Ў'): 'U', ord('ў'): 'u',
|
||||
ord('Щ'): 'Šč', ord('щ'): 'šč',
|
||||
ord('Ъ'): '', ord('ъ'): '',
|
||||
ord('Ы'): 'Y', ord('ы'): 'y',
|
||||
ord('Ь'): "'", ord('ь'): "'",
|
||||
ord('Э'): 'E', ord('э'): 'e',
|
||||
ord('Ю'): 'Ju', ord('ю'): 'ju',
|
||||
ord('Я'): 'Ja', ord('я'): 'ja',
|
||||
|
||||
# Handle borrowed letters from Ukrainian
|
||||
ord('Є'): 'Je', ord('є'): 'je',
|
||||
ord('І'): 'I', ord('і'): 'i',
|
||||
ord('Ї'): 'Ji', ord('ї'): 'ji',
|
||||
ord('Ґ'): 'G', ord('ґ'): 'g',
|
||||
|
||||
# Handle borrowed letters from Macedonian
|
||||
ord('Ѕ'): 'Dz', ord('ѕ'): 'dz',
|
||||
ord('Ѓ'): 'Ǵ', ord('ѓ'): 'ǵ',
|
||||
ord('Ќ'): 'Ḱ', ord('ќ'): 'ḱ',
|
||||
}
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user