Fix az-Latn transliteration of Cyrillic 'и' (now 'i' instead of 'ı'), and add tests

This commit is contained in:
Robyn Speer 2018-03-08 16:47:36 -05:00
parent a4d9614e39
commit fe85b4e124
3 changed files with 20 additions and 1 deletions

View File

@ -1,5 +1,6 @@
from nose.tools import eq_
from wordfreq import tokenize
from wordfreq.preprocess import preprocess_text
def test_transliteration():
@ -10,6 +11,21 @@ def test_transliteration():
eq_(tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr'),
['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
# I don't have examples of complete sentences in Azerbaijani that are
# naturally in Cyrillic, because it turns out everyone writes Azerbaijani
# in Latin letters on the Internet, _except_ sometimes for Wiktionary.
# So here are some individual words.
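# 'library' in Azerbaijani: lowercase Cyrillic, uppercase Cyrillic, and
# uppercase Latin (with dotted capital İ) should all give the same result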
eq_(preprocess_text('китабхана', 'az'), 'kitabxana')
eq_(preprocess_text('КИТАБХАНА', 'az'), 'kitabxana')
eq_(preprocess_text('KİTABXANA', 'az'), 'kitabxana')
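# 'scream' in Azerbaijani: lowercase Cyrillic, uppercase Cyrillic, and
# uppercase Latin (with dotless capital I) should all give the same result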
eq_(preprocess_text('бағырты', 'az'), 'bağırtı')
eq_(preprocess_text('БАҒЫРТЫ', 'az'), 'bağırtı')
eq_(preprocess_text('BAĞIRTI', 'az'), 'bağırtı')
def test_actually_russian():
# This looks mostly like Serbian, but was probably actually Russian.

View File

@ -162,6 +162,9 @@ def preprocess_text(text, language):
Azerbaijani (Azeri) has a similar transliteration step to Serbian,
and then the Latin-alphabet text is handled similarly to Turkish.
>>> preprocess_text('бағырты', 'az')
'bağırtı'
We don't transliterate Traditional to Simplified Chinese in this step.
There are some steps where we unify them internally: see chinese.py
for more information.

View File

@ -81,7 +81,7 @@ AZ_LATN_TABLE.update({
ord('Ч'): 'Ç', ord('ч'): 'ç',
ord('Х'): 'X', ord('х'): 'x',
ord('Ы'): 'I', ord('ы'): 'ı',
ord('И'): 'İ', ord('и'): 'ı',
ord('И'): 'İ', ord('и'): 'i',
ord('Ж'): 'J', ord('ж'): 'j',
ord('Ј'): 'Y', ord('ј'): 'y',
ord('Г'): 'Q', ord('г'): 'q',