mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
51 lines
2.0 KiB
Python
51 lines
2.0 KiB
Python
from wordfreq import tokenize
|
||
from wordfreq.preprocess import preprocess_text
|
||
|
||
|
||
def test_transliteration():
    """Check that Cyrillic and Latin spellings come out as the same Latin text."""
    # "Well, there's a lot of things you do not understand."
    # (from somewhere in OpenSubtitles)
    expected_tokens = ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
    serbian_sentences = (
        "Па, има ту много ствари које не схваташ.",  # Cyrillic spelling
        "Pa, ima tu mnogo stvari koje ne shvataš.",  # Latin spelling
    )
    for sentence in serbian_sentences:
        assert tokenize(sentence, 'sr') == expected_tokens

    # I don't have examples of complete sentences in Azerbaijani that are
    # naturally in Cyrillic, because it turns out everyone writes Azerbaijani
    # in Latin letters on the Internet, _except_ sometimes for Wiktionary.
    # So here are some individual words.
    #
    # Each expected lowercase Latin form maps to the spellings that must
    # produce it: lowercase Cyrillic, uppercase Cyrillic, uppercase Latin.
    azerbaijani_words = {
        # 'library'
        'kitabxana': ('китабхана', 'КИТАБХАНА', 'KİTABXANA'),
        # 'scream'
        'bağırtı': ('бағырты', 'БАҒЫРТЫ', 'BAĞIRTI'),
    }
    for latin_form, spellings in azerbaijani_words.items():
        for spelling in spellings:
            assert preprocess_text(spelling, 'az') == latin_form
|
||
|
||
|
||
def test_actually_russian():
    """Check that Russian-looking text tagged 'sr' is transliterated in full."""
    # The first sentence looks mostly like Serbian, but was probably actually
    # Russian. In Russian, Google Translate says it means:
    # "a hundred out of a hundred, boys!"
    #
    # We make sure to handle this case so we don't end up with a mixed-script
    # word like "pacanы".
    cases = [
        ("сто из ста, пацаны!", ['sto', 'iz', 'sta', 'pacany']),
        ("культуры", ["kul'tury"]),
    ]
    for text, expected_tokens in cases:
        assert tokenize(text, 'sr') == expected_tokens
|
||
|
||
|
||
def test_alternate_codes():
    """Check that the 'sh' and 'hbs' language codes transliterate Cyrillic too."""
    # Try language codes for Serbo-Croatian that have been split, and now
    # are canonically mapped to Serbian
    for language_code in ('sh', 'hbs'):
        assert tokenize("культуры", language_code) == ["kul'tury"]
|
||
|