diff --git a/tests/test_serbian.py b/tests/test_serbian.py
new file mode 100644
index 0000000..bc64389
--- /dev/null
+++ b/tests/test_serbian.py
@@ -0,0 +1,12 @@
+from nose.tools import eq_
+from wordfreq import tokenize
+
+
+def test_transliteration():
+    # "Well, there's a lot of things you do not understand."
+    # (from somewhere in OpenSubtitles)
+    eq_(tokenize("Па, има ту много ствари које не схваташ.", 'sr'),
+        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
+    eq_(tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr'),
+        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
+
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index ac0665b..534bb85 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -1,6 +1,6 @@
 import regex
 import unicodedata
-
+from .transliterate import serbian_cyrillic_to_latin
 
 mecab_tokenize = None
 jieba_tokenize = None
@@ -142,42 +142,6 @@ def simple_tokenize(text, include_punctuation=False):
         for token in TOKEN_RE.findall(text)
     ]
 
-
-def turkish_tokenize(text, include_punctuation=False):
-    """
-    Like `simple_tokenize`, but modifies i's so that they case-fold correctly
-    in Turkish, and modifies 'comma-below' characters to use cedillas.
-    """
-    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
-    if include_punctuation:
-        return [
-            smash_numbers(commas_to_cedillas(token.casefold()))
-            for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)
-        ]
-    else:
-        return [
-            smash_numbers(commas_to_cedillas(token.strip("'").casefold()))
-            for token in TOKEN_RE.findall(text)
-        ]
-
-
-def romanian_tokenize(text, include_punctuation=False):
-    """
-    Like `simple_tokenize`, but modifies the letters ş and ţ (with cedillas)
-    to use commas-below instead.
-    """
-    if include_punctuation:
-        return [
-            smash_numbers(cedillas_to_commas(token.casefold()))
-            for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)
-        ]
-    else:
-        return [
-            smash_numbers(cedillas_to_commas(token.strip("'").casefold()))
-            for token in TOKEN_RE.findall(text)
-        ]
-
-
 def tokenize_mecab_language(text, lang, include_punctuation=False):
     """
     Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
@@ -251,6 +215,30 @@ def cedillas_to_commas(text):
         '\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
     )
 
+def preprocess_turkish(text):
+    """
+    Modifies i's so that they case-fold correctly in Turkish, and modifies
+    'comma-below' characters to use cedillas.
+    """
+    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
+    return commas_to_cedillas(text.casefold())
+
+
+def preprocess_romanian(text):
+    """
+    Modifies the letters ş and ţ (with cedillas) to use commas-below instead.
+    """
+    return cedillas_to_commas(text.casefold())
+
+
+def preprocess_serbian(text):
+    """
+    Serbian is written in two scripts, so transliterate from Cyrillic to Latin
+    (which is the unambiguous direction).
+    """
+    return serbian_cyrillic_to_latin(text)
+
+
 def sub_zeroes(match):
     """
     Given a regex match, return what it matched with digits replaced by
@@ -371,9 +359,13 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     elif lang == 'zh':
         return chinese_tokenize(text, include_punctuation, external_wordlist)
     elif lang == 'tr':
-        return turkish_tokenize(text, include_punctuation)
+        return simple_tokenize(preprocess_turkish(text), include_punctuation)
    elif lang == 'ro':
-        return romanian_tokenize(text, include_punctuation)
+        return simple_tokenize(preprocess_romanian(text), include_punctuation)
+    elif lang == 'sr' or lang == 'sh' or lang == 'hbs':
+        # These are the three language codes that could include Serbian text,
+        # which could be in Cyrillic.
+        return simple_tokenize(preprocess_serbian(text), include_punctuation)
     elif lang in ABJAD_LANGUAGES:
         text = remove_marks(unicodedata.normalize('NFKC', text))
         return simple_tokenize(text, include_punctuation)
diff --git a/wordfreq/transliterate.py b/wordfreq/transliterate.py
new file mode 100644
index 0000000..6c5224c
--- /dev/null
+++ b/wordfreq/transliterate.py
@@ -0,0 +1,39 @@
+# This table comes from https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/cyrtranslit/mapping.py,
+# from the 'cyrtranslit' module, which is too flaky for us to just import.
+SR_CYRL_TO_LATN_DICT = {
+    ord('А'): 'A', ord('а'): 'a',
+    ord('Б'): 'B', ord('б'): 'b',
+    ord('В'): 'V', ord('в'): 'v',
+    ord('Г'): 'G', ord('г'): 'g',
+    ord('Д'): 'D', ord('д'): 'd',
+    ord('Ђ'): 'Đ', ord('ђ'): 'đ',
+    ord('Е'): 'E', ord('е'): 'e',
+    ord('Ж'): 'Ž', ord('ж'): 'ž',
+    ord('З'): 'Z', ord('з'): 'z',
+    ord('И'): 'I', ord('и'): 'i',
+    ord('Ј'): 'J', ord('ј'): 'j',
+    ord('К'): 'K', ord('к'): 'k',
+    ord('Л'): 'L', ord('л'): 'l',
+    ord('Љ'): 'Lj', ord('љ'): 'lj',
+    ord('М'): 'M', ord('м'): 'm',
+    ord('Н'): 'N', ord('н'): 'n',
+    ord('Њ'): 'Nj', ord('њ'): 'nj',
+    ord('О'): 'O', ord('о'): 'o',
+    ord('П'): 'P', ord('п'): 'p',
+    ord('Р'): 'R', ord('р'): 'r',
+    ord('С'): 'S', ord('с'): 's',
+    ord('Т'): 'T', ord('т'): 't',
+    ord('Ћ'): 'Ć', ord('ћ'): 'ć',
+    ord('У'): 'U', ord('у'): 'u',
+    ord('Ф'): 'F', ord('ф'): 'f',
+    ord('Х'): 'H', ord('х'): 'h',
+    ord('Ц'): 'C', ord('ц'): 'c',
+    ord('Ч'): 'Č', ord('ч'): 'č',
+    ord('Џ'): 'Dž', ord('џ'): 'dž',
+    ord('Ш'): 'Š', ord('ш'): 'š',
+}
+
+
+def serbian_cyrillic_to_latin(text):
+    return text.translate(SR_CYRL_TO_LATN_DICT)
+