1
0
mirror of https://github.com/rspeer/wordfreq.git synced 2025-01-14 13:15:59 +00:00
wordfreq/tests/test_transliteration.py
2022-03-10 18:33:42 -05:00

62 lines
2.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from wordfreq import tokenize
from wordfreq.preprocess import preprocess_text
def test_transliteration():
# "Well, there's a lot of things you do not understand."
# (from somewhere in OpenSubtitles
assert tokenize("Па, има ту много ствари које не схваташ.", "sr") == [
"pa",
"ima",
"tu",
"mnogo",
"stvari",
"koje",
"ne",
"shvataš",
]
assert tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", "sr") == [
"pa",
"ima",
"tu",
"mnogo",
"stvari",
"koje",
"ne",
"shvataš",
]
# I don't have examples of complete sentences in Azerbaijani that are
# naturally in Cyrillic, because it turns out everyone writes Azerbaijani
# in Latin letters on the Internet, _except_ sometimes for Wiktionary.
# So here are some individual words.
# 'library' in Azerbaijani Cyrillic
assert preprocess_text("китабхана", "az") == "kitabxana"
assert preprocess_text("КИТАБХАНА", "az") == "kitabxana"
assert preprocess_text("KİTABXANA", "az") == "kitabxana"
# 'scream' in Azerbaijani Cyrillic
assert preprocess_text("бағырты", "az") == "bağırtı"
assert preprocess_text("БАҒЫРТЫ", "az") == "bağırtı"
assert preprocess_text("BAĞIRTI", "az") == "bağırtı"
def test_actually_russian():
# This looks mostly like Serbian, but was probably actually Russian.
# In Russian, Google Translate says it means:
# "a hundred out of a hundred, boys!"
#
# We make sure to handle this case so we don't end up with a mixed-script
# word like "pacanы".
assert tokenize("сто из ста, пацаны!", "sr") == ["sto", "iz", "sta", "pacany"]
assert tokenize("культуры", "sr") == ["kul'tury"]
def test_alternate_codes():
# Try language codes for Serbo-Croatian that have been split, and now
# are canonically mapped to Serbian
assert tokenize("культуры", "sh") == ["kul'tury"]
assert tokenize("культуры", "hbs") == ["kul'tury"]