# wordfreq/tests/test_transliteration.py

from nose.tools import eq_
from wordfreq import tokenize
from wordfreq.preprocess import preprocess_text


def test_transliteration():
# "Well, there's a lot of things you do not understand."
# (from somewhere in OpenSubtitles)
eq_(tokenize("Па, има ту много ствари које не схваташ.", 'sr'),
['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
eq_(tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr'),
['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])

    # I don't have examples of complete sentences in Azerbaijani that are
    # naturally in Cyrillic, because it turns out everyone writes Azerbaijani
    # in Latin letters on the Internet, _except_ sometimes for Wiktionary.
    # So here are some individual words.

    # 'library' in Azerbaijani Cyrillic
    eq_(preprocess_text('китабхана', 'az'), 'kitabxana')
    eq_(preprocess_text('КИТАБХАНА', 'az'), 'kitabxana')
    eq_(preprocess_text('KİTABXANA', 'az'), 'kitabxana')

    # 'scream' in Azerbaijani Cyrillic
    eq_(preprocess_text('бағырты', 'az'), 'bağırtı')
    eq_(preprocess_text('БАҒЫРТЫ', 'az'), 'bağırtı')
    eq_(preprocess_text('BAĞIRTI', 'az'), 'bağırtı')
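

# The following is a minimal, hypothetical sketch of the kind of mapping the
# test above exercises -- it is NOT wordfreq's implementation, which covers
# uppercase letters and other languages as well. Serbian is a convenient case
# because its Cyrillic and Latin orthographies correspond letter for letter,
# with Latin digraphs like "lj" and "nj" standing in for the single Cyrillic
# letters "љ" and "њ".
_SKETCH_SR_CYR_TO_LAT = {
    'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'ђ': 'đ', 'е': 'e',
    'ж': 'ž', 'з': 'z', 'и': 'i', 'ј': 'j', 'к': 'k', 'л': 'l', 'љ': 'lj',
    'м': 'm', 'н': 'n', 'њ': 'nj', 'о': 'o', 'п': 'p', 'р': 'r', 'с': 's',
    'т': 't', 'ћ': 'ć', 'у': 'u', 'ф': 'f', 'х': 'h', 'ц': 'c', 'ч': 'č',
    'џ': 'dž', 'ш': 'š',
}


def sketch_serbian_to_latin(text):
    # Replace each lowercase Serbian Cyrillic letter with its Latin
    # equivalent; leave spaces, punctuation, and Latin letters unchanged.
    return ''.join(_SKETCH_SR_CYR_TO_LAT.get(ch, ch) for ch in text)


def test_transliteration_sketch():
    # The sketch agrees with the romanizations expected above.
    eq_(sketch_serbian_to_latin('много ствари'), 'mnogo stvari')
    eq_(sketch_serbian_to_latin('схваташ'), 'shvataš')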


def test_actually_russian():
    # This looks mostly like Serbian, but was probably actually Russian.
    # In Russian, Google Translate says it means:
    # "a hundred out of a hundred, boys!"
    #
    # We make sure to handle this case so we don't end up with a mixed-script
    # word like "pacanы".
    eq_(tokenize("сто из ста, пацаны!", 'sr'),
        ['sto', 'iz', 'sta', 'pacany'])

    eq_(tokenize("культуры", 'sr'), ["kul'tury"])


def test_alternate_codes():
    # Try language codes for Serbo-Croatian that have been split, and now
    # are canonically mapped to Serbian.
    eq_(tokenize("культуры", 'sh'), ["kul'tury"])
    eq_(tokenize("культуры", 'hbs'), ["kul'tury"])