wordfreq/wordfreq/transliterate.py
2018-03-08 16:47:36 -05:00

109 lines
3.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# This table comes from https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/cyrtranslit/mapping.py,
# from the 'cyrtranslit' module. We originally had to reimplement it because
# 'cyrtranslit' didn't work in Python 3; now it does, but we've made the table
# more robust than the one in cyrtranslit.
SR_LATN_TABLE = {
ord('А'): 'A', ord('а'): 'a',
ord('Б'): 'B', ord('б'): 'b',
ord('В'): 'V', ord('в'): 'v',
ord('Г'): 'G', ord('г'): 'g',
ord('Д'): 'D', ord('д'): 'd',
ord('Ђ'): 'Đ', ord('ђ'): 'đ',
ord('Е'): 'E', ord('е'): 'e',
ord('Ж'): 'Ž', ord('ж'): 'ž',
ord('З'): 'Z', ord('з'): 'z',
ord('И'): 'I', ord('и'): 'i',
ord('Ј'): 'J', ord('ј'): 'j',
ord('К'): 'K', ord('к'): 'k',
ord('Л'): 'L', ord('л'): 'l',
ord('Љ'): 'Lj', ord('љ'): 'lj',
ord('М'): 'M', ord('м'): 'm',
ord('Н'): 'N', ord('н'): 'n',
ord('Њ'): 'Nj', ord('њ'): 'nj',
ord('О'): 'O', ord('о'): 'o',
ord('П'): 'P', ord('п'): 'p',
ord('Р'): 'R', ord('р'): 'r',
ord('С'): 'S', ord('с'): 's',
ord('Т'): 'T', ord('т'): 't',
ord('Ћ'): 'Ć', ord('ћ'): 'ć',
ord('У'): 'U', ord('у'): 'u',
ord('Ф'): 'F', ord('ф'): 'f',
ord('Х'): 'H', ord('х'): 'h',
ord('Ц'): 'C', ord('ц'): 'c',
ord('Ч'): 'Č', ord('ч'): 'č',
ord('Џ'): '', ord('џ'): '',
ord('Ш'): 'Š', ord('ш'): 'š',
# Handle Cyrillic letters from other languages. We hope these cases don't
# come up often when we're trying to transliterate Serbian, but if these
# letters show up in loan-words or code-switching text, we can at least
# transliterate them approximately instead of leaving them as Cyrillic
# letters surrounded by Latin.
# Russian letters
ord('Ё'): 'Jo', ord('ё'): 'jo',
ord('Й'): 'J', ord('й'): 'j',
ord('Щ'): 'Šč', ord('щ'): 'šč',
ord('Ъ'): '', ord('ъ'): '',
ord('Ы'): 'Y', ord('ы'): 'y',
ord('Ь'): "'", ord('ь'): "'",
ord('Э'): 'E', ord('э'): 'e',
ord('Ю'): 'Ju', ord('ю'): 'ju',
ord('Я'): 'Ja', ord('я'): 'ja',
# Belarusian letter
ord('Ў'): 'Ŭ', ord('ў'): 'ŭ',
# Ukrainian letters
ord('Є'): 'Je', ord('є'): 'je',
ord('І'): 'I', ord('і'): 'i',
ord('Ї'): 'Ï', ord('ї'): 'ï',
ord('Ґ'): 'G', ord('ґ'): 'g',
# Macedonian letters
ord('Ѕ'): 'Dz', ord('ѕ'): 'dz',
ord('Ѓ'): 'Ǵ', ord('ѓ'): 'ǵ',
ord('Ќ'): '', ord('ќ'): '',
}
AZ_LATN_TABLE = SR_LATN_TABLE.copy()
AZ_LATN_TABLE.update({
# Distinct Azerbaijani letters
ord('Ҹ'): 'C', ord('ҹ'): 'c',
ord('Ә'): 'Ə', ord('ә'): 'ə',
ord('Ғ'): 'Ğ', ord('ғ'): 'ğ',
ord('Һ'): 'H', ord('һ'): 'h',
ord('Ө'): 'Ö', ord('ө'): 'ö',
ord('Ҝ'): 'G', ord('ҝ'): 'g',
ord('Ү'): 'Ü', ord('ү'): 'ü',
# Azerbaijani letters with different transliterations
ord('Ч'): 'Ç', ord('ч'): 'ç',
ord('Х'): 'X', ord('х'): 'x',
ord('Ы'): 'I', ord('ы'): 'ı',
ord('И'): 'İ', ord('и'): 'i',
ord('Ж'): 'J', ord('ж'): 'j',
ord('Ј'): 'Y', ord('ј'): 'y',
ord('Г'): 'Q', ord('г'): 'q',
ord('Ш'): 'Ş', ord('ш'): 'ş',
})
def transliterate(table, text):
"""
Transliterate text according to one of the tables above.
`table` chooses the table. It looks like a language code but comes from a
very restricted set:
- 'sr-Latn' means to convert Serbian, which may be in Cyrillic, into the
Latin alphabet.
- 'az-Latn' means the same for Azerbaijani Cyrillic to Latn.
"""
if table == 'sr-Latn':
return text.translate(SR_LATN_TABLE)
elif table == 'az-Latn':
return text.translate(AZ_LATN_TABLE)
else:
raise ValueError("Unknown transliteration table: {!r}".format(table))