2016-12-29 23:27:17 +00:00
|
|
|
|
from wordfreq import tokenize
|
2018-03-08 21:47:36 +00:00
|
|
|
|
from wordfreq.preprocess import preprocess_text
|
2016-12-29 23:27:17 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_transliteration():
    # Serbian is written in both Cyrillic and Latin; tokenize() should
    # transliterate Cyrillic Serbian into its Latin form, so the same
    # sentence tokenizes identically regardless of the input script.
    #
    # "Well, there's a lot of things you do not understand."
    # (from somewhere in OpenSubtitles)
    expected = [
        "pa",
        "ima",
        "tu",
        "mnogo",
        "stvari",
        "koje",
        "ne",
        "shvataš",
    ]
    # Cyrillic and Latin spellings of the same sentence must produce the
    # same token list.
    assert tokenize("Па, има ту много ствари које не схваташ.", "sr") == expected
    assert tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", "sr") == expected

    # I don't have examples of complete sentences in Azerbaijani that are
    # naturally in Cyrillic, because it turns out everyone writes Azerbaijani
    # in Latin letters on the Internet, _except_ sometimes for Wiktionary.
    # So here are some individual words.

    # 'library' in Azerbaijani Cyrillic
    assert preprocess_text("китабхана", "az") == "kitabxana"
    assert preprocess_text("КИТАБХАНА", "az") == "kitabxana"
    assert preprocess_text("KİTABXANA", "az") == "kitabxana"

    # 'scream' in Azerbaijani Cyrillic
    assert preprocess_text("бағырты", "az") == "bağırtı"
    assert preprocess_text("БАҒЫРТЫ", "az") == "bağırtı"
    assert preprocess_text("BAĞIRTI", "az") == "bağırtı"
|
2018-03-08 21:47:36 +00:00
|
|
|
|
|
2017-01-04 23:51:00 +00:00
|
|
|
|
|
|
|
|
|
def test_actually_russian():
    # Text that looks mostly like Serbian but was probably actually Russian.
    # In Russian, Google Translate says it means:
    # "a hundred out of a hundred, boys!"
    #
    # Handling this case keeps us from producing a mixed-script word such
    # as "pacanы".
    tokens = tokenize("сто из ста, пацаны!", "sr")
    assert tokens == ["sto", "iz", "sta", "pacany"]

    # A single Russian word should likewise come out fully romanized.
    assert tokenize("культуры", "sr") == ["kul'tury"]
|
2017-04-27 19:09:59 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_alternate_codes():
    # Language codes for Serbo-Croatian that have been split, and now
    # are canonically mapped to Serbian — each should behave like "sr".
    for deprecated_code in ("sh", "hbs"):
        assert tokenize("культуры", deprecated_code) == ["kul'tury"]
|