diff --git a/tests/test_serbian.py b/tests/test_transliteration.py similarity index 61% rename from tests/test_serbian.py rename to tests/test_transliteration.py index 3f8c93b..d7e4455 100644 --- a/tests/test_serbian.py +++ b/tests/test_transliteration.py @@ -1,5 +1,6 @@ from nose.tools import eq_ from wordfreq import tokenize +from wordfreq.preprocess import preprocess_text def test_transliteration(): @@ -10,6 +11,21 @@ def test_transliteration(): eq_(tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr'), ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']) + # I don't have examples of complete sentences in Azerbaijani that are + # naturally in Cyrillic, because it turns out everyone writes Azerbaijani + # in Latin letters on the Internet, _except_ sometimes for Wiktionary. + # So here are some individual words. + + # 'library' in Azerbaijani Cyrillic + eq_(preprocess_text('китабхана', 'az'), 'kitabxana') + eq_(preprocess_text('КИТАБХАНА', 'az'), 'kitabxana') + eq_(preprocess_text('KİTABXANA', 'az'), 'kitabxana') + + # 'scream' in Azerbaijani Cyrillic + eq_(preprocess_text('бағырты', 'az'), 'bağırtı') + eq_(preprocess_text('БАҒЫРТЫ', 'az'), 'bağırtı') + eq_(preprocess_text('BAĞIRTI', 'az'), 'bağırtı') + def test_actually_russian(): # This looks mostly like Serbian, but was probably actually Russian. diff --git a/wordfreq/preprocess.py b/wordfreq/preprocess.py index 2a18d53..45a4be7 100644 --- a/wordfreq/preprocess.py +++ b/wordfreq/preprocess.py @@ -162,6 +162,9 @@ def preprocess_text(text, language): Azerbaijani (Azeri) has a similar transliteration step to Serbian, and then the Latin-alphabet text is handled similarly to Turkish. + >>> preprocess_text('бағырты', 'az') + 'bağırtı' + We don't transliterate Traditional to Simplified Chinese in this step. There are some steps where we unify them internally: see chinese.py for more information. diff --git a/wordfreq/transliterate.py b/wordfreq/transliterate.py index e451991..788f14d 100644 --- a/wordfreq/transliterate.py +++ b/wordfreq/transliterate.py @@ -81,7 +81,7 @@ AZ_LATN_TABLE.update({ ord('Ч'): 'Ç', ord('ч'): 'ç', ord('Х'): 'X', ord('х'): 'x', ord('Ы'): 'I', ord('ы'): 'ı', - ord('И'): 'İ', ord('и'): 'ı', + ord('И'): 'İ', ord('и'): 'i', ord('Ж'): 'J', ord('ж'): 'j', ord('Ј'): 'Y', ord('ј'): 'y', ord('Г'): 'Q', ord('г'): 'q',