Add transliteration of Cyrillic Serbian

2024-12-23 09:21:37 +00:00 · 2016-12-29 18:27:17 -05:00 · 2016-12-29 18:27:17 -05:00 · 6211b35fb3
commit 6211b35fb3
parent 0aa7ad46ae
3 changed files with 82 additions and 39 deletions
--- a/tests/test_serbian.py
+++ b/tests/test_serbian.py
@ -0,0 +1,12 @@
+from nose.tools import eq_
+from wordfreq import tokenize
+
+
+def test_transliteration():
+    # "Well, there's a lot of things you do not understand."
+    # (from somewhere in OpenSubtitles)
+    eq_(tokenize("Па, има ту много ствари које не схваташ.", 'sr'),
+        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
+    eq_(tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr'),
+        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
+
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@ -1,6 +1,6 @@
 import regex
 import unicodedata
-
+from .transliterate import serbian_cyrillic_to_latin

 mecab_tokenize = None
 jieba_tokenize = None
@ -142,42 +142,6 @@ def simple_tokenize(text, include_punctuation=False):
            for token in TOKEN_RE.findall(text)
        ]

-
-def turkish_tokenize(text, include_punctuation=False):
-    """
-    Like `simple_tokenize`, but modifies i's so that they case-fold correctly
-    in Turkish, and modifies 'comma-below' characters to use cedillas.
-    """
-    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
-    if include_punctuation:
-        return [
-            smash_numbers(commas_to_cedillas(token.casefold()))
-            for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)
-        ]
-    else:
-        return [
-            smash_numbers(commas_to_cedillas(token.strip("'").casefold()))
-            for token in TOKEN_RE.findall(text)
-        ]
-
-
-def romanian_tokenize(text, include_punctuation=False):
-    """
-    Like `simple_tokenize`, but modifies the letters ş and ţ (with cedillas)
-    to use commas-below instead.
-    """
-    if include_punctuation:
-        return [
-            smash_numbers(cedillas_to_commas(token.casefold()))
-            for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)
-        ]
-    else:
-        return [
-            smash_numbers(cedillas_to_commas(token.strip("'").casefold()))
-            for token in TOKEN_RE.findall(text)
-        ]
-
-
 def tokenize_mecab_language(text, lang, include_punctuation=False):
    """
    Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
@ -251,6 +215,30 @@ def cedillas_to_commas(text):
        '\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
    )

+def preprocess_turkish(text):
+    """
+    Modifies i's so that they case-fold correctly in Turkish, and modifies
+    'comma-below' characters to use cedillas.
+    """
+    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
+    return commas_to_cedillas(text.casefold())
+
+
+def preprocess_romanian(text):
+    """
+    Modifies the letters ş and ţ (with cedillas) to use commas-below instead.
+    """
+    return cedillas_to_commas(text.casefold())
+
+
+def preprocess_serbian(text):
+    """
+    Serbian is written in two scripts, so transliterate from Cyrillic to Latin
+    (which is the unambiguous direction).
+    """
+    return serbian_cyrillic_to_latin(text)
+
+
 def sub_zeroes(match):
    """
    Given a regex match, return what it matched with digits replaced by
@ -371,9 +359,13 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
    elif lang == 'zh':
        return chinese_tokenize(text, include_punctuation, external_wordlist)
    elif lang == 'tr':
-        return turkish_tokenize(text, include_punctuation)
+        return simple_tokenize(preprocess_turkish(text), include_punctuation)
    elif lang == 'ro':
-        return romanian_tokenize(text, include_punctuation)
+        return simple_tokenize(preprocess_romanian(text), include_punctuation)
+    elif lang == 'sr' or lang == 'sh' or lang == 'hbs':
+        # These three language codes can all cover Serbian text ('sh' and 'hbs'
+        # are codes for Serbo-Croatian), which may be written in Cyrillic.
+        return simple_tokenize(preprocess_serbian(text), include_punctuation)
    elif lang in ABJAD_LANGUAGES:
        text = remove_marks(unicodedata.normalize('NFKC', text))
        return simple_tokenize(text, include_punctuation)
--- a/wordfreq/transliterate.py
+++ b/wordfreq/transliterate.py
@ -0,0 +1,39 @@
+# This table comes from the 'cyrtranslit' module, which is too flaky for us to just import:
+# https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/cyrtranslit/mapping.py
+SR_CYRL_TO_LATN_DICT = {
+    ord('А'): 'A',   ord('а'): 'a',
+    ord('Б'): 'B',   ord('б'): 'b',
+    ord('В'): 'V',   ord('в'): 'v',
+    ord('Г'): 'G',   ord('г'): 'g',
+    ord('Д'): 'D',   ord('д'): 'd',
+    ord('Ђ'): 'Đ',   ord('ђ'): 'đ',
+    ord('Е'): 'E',   ord('е'): 'e',
+    ord('Ж'): 'Ž',   ord('ж'): 'ž',
+    ord('З'): 'Z',   ord('з'): 'z',
+    ord('И'): 'I',   ord('и'): 'i',
+    ord('Ј'): 'J',   ord('ј'): 'j',
+    ord('К'): 'K',   ord('к'): 'k',
+    ord('Л'): 'L',   ord('л'): 'l',
+    ord('Љ'): 'Lj',  ord('љ'): 'lj',
+    ord('М'): 'M',   ord('м'): 'm',
+    ord('Н'): 'N',   ord('н'): 'n',
+    ord('Њ'): 'Nj',  ord('њ'): 'nj',
+    ord('О'): 'O',   ord('о'): 'o',
+    ord('П'): 'P',   ord('п'): 'p',
+    ord('Р'): 'R',   ord('р'): 'r',
+    ord('С'): 'S',   ord('с'): 's',
+    ord('Т'): 'T',   ord('т'): 't',
+    ord('Ћ'): 'Ć',   ord('ћ'): 'ć',
+    ord('У'): 'U',   ord('у'): 'u',
+    ord('Ф'): 'F',   ord('ф'): 'f',
+    ord('Х'): 'H',   ord('х'): 'h',
+    ord('Ц'): 'C',   ord('ц'): 'c',
+    ord('Ч'): 'Č',   ord('ч'): 'č',
+    ord('Џ'): 'Dž',  ord('џ'): 'dž',
+    ord('Ш'): 'Š',   ord('ш'): 'š',
+}
+
+
+def serbian_cyrillic_to_latin(text):
+    return text.translate(SR_CYRL_TO_LATN_DICT)
+