fix az-Latn transliteration, and test

2024-12-23 09:21:37 +00:00 · 2018-03-08 16:47:36 -05:00 · 2018-03-08 16:47:36 -05:00 · fe85b4e124
commit fe85b4e124
parent a4d9614e39
3 changed files with 20 additions and 1 deletion
--- a/tests/test_transliteration.py
+++ b/tests/test_transliteration.py
@@ -1,5 +1,6 @@
 from nose.tools import eq_
 from wordfreq import tokenize
+from wordfreq.preprocess import preprocess_text


 def test_transliteration():
@@ -10,6 +11,21 @@ def test_transliteration():
    eq_(tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr'),
        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])

+    # I don't have examples of complete sentences in Azerbaijani that are
+    # naturally in Cyrillic, because it turns out everyone writes Azerbaijani
+    # in Latin letters on the Internet, _except_ sometimes for Wiktionary.
+    # So here are some individual words.
+
+    # 'library' in Azerbaijani Cyrillic
+    eq_(preprocess_text('китабхана', 'az'), 'kitabxana')
+    eq_(preprocess_text('КИТАБХАНА', 'az'), 'kitabxana')
+    eq_(preprocess_text('KİTABXANA', 'az'), 'kitabxana')
+
+    # 'scream' in Azerbaijani Cyrillic
+    eq_(preprocess_text('бағырты', 'az'), 'bağırtı')
+    eq_(preprocess_text('БАҒЫРТЫ', 'az'), 'bağırtı')
+    eq_(preprocess_text('BAĞIRTI', 'az'), 'bağırtı')
+

 def test_actually_russian():
    # This looks mostly like Serbian, but was probably actually Russian.
--- a/wordfreq/preprocess.py
+++ b/wordfreq/preprocess.py
@@ -162,6 +162,9 @@ def preprocess_text(text, language):
    Azerbaijani (Azeri) has a similar transliteration step to Serbian,
    and then the Latin-alphabet text is handled similarly to Turkish.

+    >>> preprocess_text('бағырты', 'az')
+    'bağırtı'
+
    We don't transliterate Traditional to Simplified Chinese in this step.
    There are some steps where we unify them internally: see chinese.py
    for more information.
--- a/wordfreq/transliterate.py
+++ b/wordfreq/transliterate.py
@@ -81,7 +81,7 @@ AZ_LATN_TABLE.update({
    ord('Ч'): 'Ç',   ord('ч'): 'ç',
    ord('Х'): 'X',   ord('х'): 'x',
    ord('Ы'): 'I',   ord('ы'): 'ı',
-    ord('И'): 'İ',   ord('и'): 'ı',
+    ord('И'): 'İ',   ord('и'): 'i',
    ord('Ж'): 'J',   ord('ж'): 'j',
    ord('Ј'): 'Y',   ord('ј'): 'y',
    ord('Г'): 'Q',   ord('г'): 'q',