mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
fix az-Latn transliteration of 'и' (now 'i', not 'ı'), and add tests
This commit is contained in:
parent
a4d9614e39
commit
fe85b4e124
@ -1,5 +1,6 @@
|
|||||||
from nose.tools import eq_
|
from nose.tools import eq_
|
||||||
from wordfreq import tokenize
|
from wordfreq import tokenize
|
||||||
|
from wordfreq.preprocess import preprocess_text
|
||||||
|
|
||||||
|
|
||||||
def test_transliteration():
|
def test_transliteration():
|
||||||
@ -10,6 +11,21 @@ def test_transliteration():
|
|||||||
eq_(tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr'),
|
eq_(tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr'),
|
||||||
['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
|
['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš'])
|
||||||
|
|
||||||
|
# I don't have examples of complete sentences in Azerbaijani that are
|
||||||
|
# naturally in Cyrillic, because it turns out everyone writes Azerbaijani
|
||||||
|
# in Latin letters on the Internet, _except_ sometimes for Wiktionary.
|
||||||
|
# So here are some individual words.
|
||||||
|
|
||||||
|
# 'library' in Azerbaijani Cyrillic
|
||||||
|
eq_(preprocess_text('китабхана', 'az'), 'kitabxana')
|
||||||
|
eq_(preprocess_text('КИТАБХАНА', 'az'), 'kitabxana')
|
||||||
|
eq_(preprocess_text('KİTABXANA', 'az'), 'kitabxana')
|
||||||
|
|
||||||
|
# 'scream' in Azerbaijani Cyrillic
|
||||||
|
eq_(preprocess_text('бағырты', 'az'), 'bağırtı')
|
||||||
|
eq_(preprocess_text('БАҒЫРТЫ', 'az'), 'bağırtı')
|
||||||
|
eq_(preprocess_text('BAĞIRTI', 'az'), 'bağırtı')
|
||||||
|
|
||||||
|
|
||||||
def test_actually_russian():
|
def test_actually_russian():
|
||||||
# This looks mostly like Serbian, but was probably actually Russian.
|
# This looks mostly like Serbian, but was probably actually Russian.
|
@ -162,6 +162,9 @@ def preprocess_text(text, language):
|
|||||||
Azerbaijani (Azeri) has a similar transliteration step to Serbian,
|
Azerbaijani (Azeri) has a similar transliteration step to Serbian,
|
||||||
and then the Latin-alphabet text is handled similarly to Turkish.
|
and then the Latin-alphabet text is handled similarly to Turkish.
|
||||||
|
|
||||||
|
>>> preprocess_text('бағырты', 'az')
|
||||||
|
'bağırtı'
|
||||||
|
|
||||||
We don't transliterate Traditional to Simplified Chinese in this step.
|
We don't transliterate Traditional to Simplified Chinese in this step.
|
||||||
There are some steps where we unify them internally: see chinese.py
|
There are some steps where we unify them internally: see chinese.py
|
||||||
for more information.
|
for more information.
|
||||||
|
@ -81,7 +81,7 @@ AZ_LATN_TABLE.update({
|
|||||||
ord('Ч'): 'Ç', ord('ч'): 'ç',
|
ord('Ч'): 'Ç', ord('ч'): 'ç',
|
||||||
ord('Х'): 'X', ord('х'): 'x',
|
ord('Х'): 'X', ord('х'): 'x',
|
||||||
ord('Ы'): 'I', ord('ы'): 'ı',
|
ord('Ы'): 'I', ord('ы'): 'ı',
|
||||||
ord('И'): 'İ', ord('и'): 'ı',
|
ord('И'): 'İ', ord('и'): 'i',
|
||||||
ord('Ж'): 'J', ord('ж'): 'j',
|
ord('Ж'): 'J', ord('ж'): 'j',
|
||||||
ord('Ј'): 'Y', ord('ј'): 'y',
|
ord('Ј'): 'Y', ord('ј'): 'y',
|
||||||
ord('Г'): 'Q', ord('г'): 'q',
|
ord('Г'): 'Q', ord('г'): 'q',
|
||||||
|
Loading…
Reference in New Issue
Block a user