wordfreq/tests/test_serbian.py

from nose.tools import eq_
from wordfreq import tokenize


def test_transliteration():
    # The same Serbian sentence, written in Cyrillic and in Latin script,
    # should come out as identical Latin-script tokens.
    #
    # The sentence comes from somewhere in OpenSubtitles and means:
    # "Well, there's a lot of things you do not understand."
    expected_tokens = [
        'pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'
    ]
    cyrillic_sentence = "Па, има ту много ствари које не схваташ."
    latin_sentence = "Pa, ima tu mnogo stvari koje ne shvataš."

    # Cyrillic is checked first, then Latin, each against the same tokens.
    for sentence in (cyrillic_sentence, latin_sentence):
        eq_(tokenize(sentence, 'sr'), expected_tokens)


def test_actually_russian():
    # Text tagged as Serbian ('sr') is sometimes really Russian. The first
    # phrase below looks mostly like Serbian, but was probably Russian;
    # according to Google Translate, in Russian it means:
    # "a hundred out of a hundred, boys!"
    #
    # Characters such as "ы" and "ь" appear in these inputs, and the expected
    # tokens are entirely Latin. Handling this case keeps us from ending up
    # with a mixed-script word like "pacanы".
    cases = [
        ("сто из ста, пацаны!", ['sto', 'iz', 'sta', 'pacany']),
        ("культуры", ["kul'tury"]),
    ]

    for russian_text, expected_tokens in cases:
        eq_(tokenize(russian_text, 'sr'), expected_tokens)
Add transliteration of Cyrillic Serbian 2016-12-29 23:27:17 +00:00			`from nose.tools import eq_`
			`from wordfreq import tokenize`


			`def test_transliteration():`
			`# "Well, there's a lot of things you do not understand."`
			`# (from somewhere in OpenSubtitles)`
			`eq_(tokenize("Па, има ту много ствари које не схваташ.", 'sr'),`
			`['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])`
			`eq_(tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr'),`
			`['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])`

transliterate: Handle unexpected Russian invasions 2017-01-04 23:51:00 +00:00
			`def test_actually_russian():`
			`# This looks mostly like Serbian, but was probably actually Russian.`
			`# In Russian, Google Translate says it means:`
			`# "a hundred out of a hundred, boys!"`
			`#`
			`# We make sure to handle this case so we don't end up with a mixed-script`
			`# word like "pacanы".`

			`eq_(tokenize("сто из ста, пацаны!", 'sr'),`
			`['sto', 'iz', 'sta', 'pacany'])`

			`eq_(tokenize("культуры", 'sr'), ["kul'tury"])`