from wordfreq import tokenize
from wordfreq.preprocess import preprocess_text


def test_transliteration():
    """Check that Cyrillic input comes out in Latin script for 'sr' and 'az'."""
    # "Well, there's a lot of things you do not understand."
    # (from somewhere in OpenSubtitles)
    #
    # The same sentence is given once in Cyrillic and once in Latin script;
    # both spellings must produce the same Latin-script tokens.
    expected_tokens = ["pa", "ima", "tu", "mnogo", "stvari", "koje", "ne", "shvataš"]
    serbian_sentences = (
        "Па, има ту много ствари које не схваташ.",
        "Pa, ima tu mnogo stvari koje ne shvataš.",
    )
    for sentence in serbian_sentences:
        assert tokenize(sentence, "sr") == expected_tokens

    # I don't have examples of complete sentences in Azerbaijani that are
    # naturally in Cyrillic, because it turns out everyone writes Azerbaijani
    # in Latin letters on the Internet, _except_ sometimes for Wiktionary.
    # So here are some individual words.
    #
    # Each entry pairs the expected lowercase Latin form with three spellings
    # of the word: lowercase Cyrillic, uppercase Cyrillic, and uppercase Latin.
    azerbaijani_words = [
        # 'library' in Azerbaijani Cyrillic
        ("kitabxana", ("китабхана", "КИТАБХАНА", "KİTABXANA")),
        # 'scream' in Azerbaijani Cyrillic
        ("bağırtı", ("бағырты", "БАҒЫРТЫ", "BAĞIRTI")),
    ]
    for latin_form, spellings in azerbaijani_words:
        for spelling in spellings:
            assert preprocess_text(spelling, "az") == latin_form


def test_actually_russian():
    """Check that Russian-looking text tagged as 'sr' is fully transliterated."""
    # This looks mostly like Serbian, but was probably actually Russian.
    # In Russian, Google Translate says it means:
    # "a hundred out of a hundred, boys!"
    #
    # We make sure to handle this case so we don't end up with a mixed-script
    # word like "pacanы".
    cases = [
        ("сто из ста, пацаны!", ["sto", "iz", "sta", "pacany"]),
        ("культуры", ["kul'tury"]),
    ]
    for text, expected_tokens in cases:
        assert tokenize(text, "sr") == expected_tokens


def test_alternate_codes():
    """Check that alternate Serbo-Croatian codes tokenize the same way."""
    # Try language codes for Serbo-Croatian that have been split, and now
    # are canonically mapped to Serbian.
    expected_tokens = ["kul'tury"]
    for language_code in ("sh", "hbs"):
        assert tokenize("культуры", language_code) == expected_tokens