Mirror of https://github.com/rspeer/wordfreq.git
Add transliteration of Cyrillic Serbian

commit 6211b35fb3 (parent 0aa7ad46ae)

tests/test_serbian.py (new file, 12 lines)

@@ -0,0 +1,12 @@
+from nose.tools import eq_
+
+from wordfreq import tokenize
+
+
+def test_transliteration():
+    # "Well, there's a lot of things you do not understand."
+    # (from somewhere in OpenSubtitles)
+    eq_(tokenize("Па, има ту много ствари које не схваташ.", 'sr'),
+        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
+    eq_(tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr'),
+        ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
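Note: the test asserts that the Cyrillic and Latin spellings of the same sentence tokenize identically. Assuming the project's usual nose-based test setup, it can presumably be run on its own with:

    nosetests tests/test_serbian.py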
wordfreq/tokens.py

@@ -1,6 +1,6 @@
 import regex
 import unicodedata
+from .transliterate import serbian_cyrillic_to_latin

 mecab_tokenize = None
 jieba_tokenize = None
@@ -142,42 +142,6 @@ def simple_tokenize(text, include_punctuation=False):
         for token in TOKEN_RE.findall(text)
     ]

-
-def turkish_tokenize(text, include_punctuation=False):
-    """
-    Like `simple_tokenize`, but modifies i's so that they case-fold correctly
-    in Turkish, and modifies 'comma-below' characters to use cedillas.
-    """
-    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
-    if include_punctuation:
-        return [
-            smash_numbers(commas_to_cedillas(token.casefold()))
-            for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)
-        ]
-    else:
-        return [
-            smash_numbers(commas_to_cedillas(token.strip("'").casefold()))
-            for token in TOKEN_RE.findall(text)
-        ]
-
-
-def romanian_tokenize(text, include_punctuation=False):
-    """
-    Like `simple_tokenize`, but modifies the letters ş and ţ (with cedillas)
-    to use commas-below instead.
-    """
-    if include_punctuation:
-        return [
-            smash_numbers(cedillas_to_commas(token.casefold()))
-            for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)
-        ]
-    else:
-        return [
-            smash_numbers(cedillas_to_commas(token.strip("'").casefold()))
-            for token in TOKEN_RE.findall(text)
-        ]
-
-
 def tokenize_mecab_language(text, lang, include_punctuation=False):
     """
     Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
@@ -251,6 +215,30 @@ def cedillas_to_commas(text):
         '\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
     )


+def preprocess_turkish(text):
+    """
+    Modifies i's so that they case-fold correctly in Turkish, and modifies
+    'comma-below' characters to use cedillas.
+    """
+    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
+    return commas_to_cedillas(text.casefold())
+
+
+def preprocess_romanian(text):
+    """
+    Modifies the letters ş and ţ (with cedillas) to use commas-below instead.
+    """
+    return cedillas_to_commas(text.casefold())
+
+
+def preprocess_serbian(text):
+    """
+    Serbian is written in two scripts, so transliterate from Cyrillic to Latin
+    (which is the unambiguous direction).
+    """
+    return serbian_cyrillic_to_latin(text)
+
+
 def sub_zeroes(match):
     """
     Given a regex match, return what it matched with digits replaced by
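Note: this hunk, together with the removal of turkish_tokenize and romanian_tokenize above, splits language-specific normalization out of tokenization: each preprocess_* function is now a plain text-to-text transform whose output can be handed to simple_tokenize. A rough sketch of the expected behavior (illustrative inputs, not taken from the test suite):

    >>> preprocess_turkish('ISTANBUL')   # İ/I replaced before case-folding, giving dotless ı
    'ıstanbul'
    >>> preprocess_romanian('ş')         # cedilla form rewritten to comma-below
    'ș'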
@@ -371,9 +359,13 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     elif lang == 'zh':
         return chinese_tokenize(text, include_punctuation, external_wordlist)
     elif lang == 'tr':
-        return turkish_tokenize(text, include_punctuation)
+        return simple_tokenize(preprocess_turkish(text), include_punctuation)
     elif lang == 'ro':
-        return romanian_tokenize(text, include_punctuation)
+        return simple_tokenize(preprocess_romanian(text), include_punctuation)
+    elif lang == 'sr' or lang == 'sh' or lang == 'hbs':
+        # These are the three language codes that could include Serbian text,
+        # which could be in Cyrillic.
+        return simple_tokenize(preprocess_serbian(text), include_punctuation)
     elif lang in ABJAD_LANGUAGES:
         text = remove_marks(unicodedata.normalize('NFKC', text))
         return simple_tokenize(text, include_punctuation)
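Note: 'sr', 'sh', and 'hbs' all route through the same preprocessing, so Cyrillic input under any of the three codes should presumably yield the same Latin-script tokens, e.g.:

    >>> tokenize('схваташ', 'sh')
    ['shvataš']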
wordfreq/transliterate.py (new file, 39 lines)

@@ -0,0 +1,39 @@
+# This table comes from https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/cyrtranslit/mapping.py,
+# from the 'cyrtranslit' module, which is too flaky for us to just import.
+SR_CYRL_TO_LATN_DICT = {
+    ord('А'): 'A', ord('а'): 'a',
+    ord('Б'): 'B', ord('б'): 'b',
+    ord('В'): 'V', ord('в'): 'v',
+    ord('Г'): 'G', ord('г'): 'g',
+    ord('Д'): 'D', ord('д'): 'd',
+    ord('Ђ'): 'Đ', ord('ђ'): 'đ',
+    ord('Е'): 'E', ord('е'): 'e',
+    ord('Ж'): 'Ž', ord('ж'): 'ž',
+    ord('З'): 'Z', ord('з'): 'z',
+    ord('И'): 'I', ord('и'): 'i',
+    ord('Ј'): 'J', ord('ј'): 'j',
+    ord('К'): 'K', ord('к'): 'k',
+    ord('Л'): 'L', ord('л'): 'l',
+    ord('Љ'): 'Lj', ord('љ'): 'lj',
+    ord('М'): 'M', ord('м'): 'm',
+    ord('Н'): 'N', ord('н'): 'n',
+    ord('Њ'): 'Nj', ord('њ'): 'nj',
+    ord('О'): 'O', ord('о'): 'o',
+    ord('П'): 'P', ord('п'): 'p',
+    ord('Р'): 'R', ord('р'): 'r',
+    ord('С'): 'S', ord('с'): 's',
+    ord('Т'): 'T', ord('т'): 't',
+    ord('Ћ'): 'Ć', ord('ћ'): 'ć',
+    ord('У'): 'U', ord('у'): 'u',
+    ord('Ф'): 'F', ord('ф'): 'f',
+    ord('Х'): 'H', ord('х'): 'h',
+    ord('Ц'): 'C', ord('ц'): 'c',
+    ord('Ч'): 'Č', ord('ч'): 'č',
+    ord('Џ'): 'Dž', ord('џ'): 'dž',
+    ord('Ш'): 'Š', ord('ш'): 'š',
+}
+
+
+def serbian_cyrillic_to_latin(text):
+    return text.translate(SR_CYRL_TO_LATN_DICT)
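Note: this works because str.translate accepts replacement strings of any length, which is what lets single Cyrillic letters like Љ, Њ, and Џ map to Latin digraphs. An illustrative example (not from the commit):

    >>> 'Љубљана'.translate(SR_CYRL_TO_LATN_DICT)
    'Ljubljana'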