wordfreq (mirror of https://github.com/rspeer/wordfreq.git)

Commit 5ab5d2ea55 (parent 72646f16a1): Separate preprocessing from tokenization
@@ -1,6 +1,6 @@
 from wordfreq import (
     word_frequency, available_languages, cB_to_freq,
-    top_n_list, random_words, random_ascii_words, tokenize
+    top_n_list, random_words, random_ascii_words, tokenize, lossy_tokenize
 )
 from nose.tools import (
     eq_, assert_almost_equal, assert_greater, raises
@@ -164,13 +164,13 @@ def test_casefolding():
 def test_number_smashing():
     eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'),
         ['715', 'crσσks', 'by', 'bon', 'iver'])
-    eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', combine_numbers=True),
+    eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'),
         ['000', 'crσσks', 'by', 'bon', 'iver'])
-    eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', combine_numbers=True, include_punctuation=True),
+    eq_(lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True),
         ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver'])
-    eq_(tokenize('1', 'en', combine_numbers=True), ['1'])
-    eq_(tokenize('3.14', 'en', combine_numbers=True), ['0.00'])
-    eq_(tokenize('24601', 'en', combine_numbers=True), ['00000'])
+    eq_(lossy_tokenize('1', 'en'), ['1'])
+    eq_(lossy_tokenize('3.14', 'en'), ['0.00'])
+    eq_(lossy_tokenize('24601', 'en'), ['00000'])
     eq_(word_frequency('24601', 'en'), word_frequency('90210', 'en'))
 
 
@@ -231,6 +231,7 @@ def test_ideographic_fallback():
         ['ひらがな', 'カタカナ', 'romaji']
     )
 
+
 def test_other_languages():
     # Test that we leave Thai letters stuck together. If we had better Thai support,
     # we would actually split this into a three-word phrase.
@@ -83,5 +83,3 @@ def test_alternate_codes():
     # Separate codes for Mandarin and Cantonese
     eq_(tokenize('谢谢谢谢', 'cmn'), tokens)
     eq_(tokenize('谢谢谢谢', 'yue'), tokens)
-
-
wordfreq/__init__.py

@@ -1,4 +1,3 @@
-from wordfreq.tokens import tokenize, simple_tokenize
 from pkg_resources import resource_filename
 from functools import lru_cache
 import langcodes
@@ -10,6 +9,9 @@ import random
 import logging
 import math
 
+from .tokens import tokenize, simple_tokenize, lossy_tokenize
+from .language_info import get_language_info
+
 logger = logging.getLogger(__name__)
 
 
@@ -30,8 +32,9 @@ INFERRED_SPACE_LANGUAGES = {'zh'}
 # frequency.)
 INFERRED_SPACE_FACTOR = 10.0
 
-# simple_tokenize is imported so that other things can import it from here.
-# Suppress the pyflakes warning.
+# tokenize and simple_tokenize are imported so that other things can import
+# them from here. Suppress the pyflakes warning.
+tokenize = tokenize
 simple_tokenize = simple_tokenize
 
 
@@ -215,8 +218,9 @@ def iter_wordlist(lang, wordlist='combined'):
 # it takes to look up frequencies from scratch, so something faster is needed.
 _wf_cache = {}
 
+
 def _word_frequency(word, lang, wordlist, minimum):
-    tokens = tokenize(word, lang, combine_numbers=True)
+    tokens = lossy_tokenize(word, lang)
     if not tokens:
         return minimum
 
@@ -234,7 +238,10 @@ def _word_frequency(word, lang, wordlist, minimum):
 
     freq = 1.0 / one_over_result
 
-    if lang in INFERRED_SPACE_LANGUAGES:
+    if get_language_info(lang)['tokenizer'] == 'jieba':
+        # If we used the Jieba tokenizer, we could tokenize anything to match
+        # our wordlist, even nonsense. To counteract this, we multiply by a
+        # probability for each word break that was inferred.
        freq /= INFERRED_SPACE_FACTOR ** (len(tokens) - 1)
 
     return max(freq, minimum)
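In the last hunk above, the Jieba penalty divides a looked-up frequency by INFERRED_SPACE_FACTOR for every word break the tokenizer had to infer. A minimal sketch of that arithmetic, outside the diff, with a helper name invented here for illustration:

    INFERRED_SPACE_FACTOR = 10.0

    def inferred_space_penalty(freq, num_tokens):
        # One factor of 10 for each inferred word break, as in _word_frequency.
        return freq / (INFERRED_SPACE_FACTOR ** (num_tokens - 1))

    print(inferred_space_penalty(1e-4, 1))   # 0.0001: a single token is not penalized
    print(inferred_space_penalty(1e-4, 3))   # 1e-06: two inferred breaks divide the frequency by 100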
wordfreq/language_info.py (new file, 148 lines)

@@ -0,0 +1,148 @@
+from langcodes import Language, best_match
+
+
+# Text in scripts written without spaces has to be handled specially in our
+# tokenization regex (see TOKEN_RE in tokens.py). Also, when one of these is
+# the script of the language we're analyzing, then we need to either have
+# a specific tokenizer for the language or give up.
+SPACELESS_SCRIPTS = [
+    # Han ideographs are spaceless, but they don't need to appear in this list
+    # because they have their own cases in get_language_info and TOKEN_RE.
+    'Hiragana',
+    # We omit katakana because Unicode regular expressions can already
+    # tokenize sequences of katakana, and omitting it here means we can also
+    # recognize a switch between hiragana and katakana as a token boundary.
+    'Thai',  # Thai script
+    'Khmr',  # Khmer script
+    'Laoo',  # Lao script
+    'Mymr',  # Burmese script
+    'Tale',  # Tai Le script
+    'Talu',  # Tai Lü script
+    'Lana',  # Lanna script
+]
+
+
+def _language_in_list(language, targets, min_score=80):
+    """
+    A helper function to determine whether this language matches one of the
+    target languages, with a match score above a certain threshold.
+
+    The languages can be given as strings (language tags) or as Language
+    objects. `targets` can be any iterable of such languages.
+    """
+    matched = best_match(language, targets)
+    return matched[1] > 0
+
+
+def get_language_info(language):
+    """
+    Looks up the things we need to know about how to handle text in a given
+    language. This will return a dictionary with the following fields:
+
+    'script': a BCP 47 script code such as 'Latn', 'Cyrl', 'Hans'...
+
+        Indicates the script that tokens in this language should be in,
+        _after_ our preprocessing. The script for 'zh' is 'Hans', for example,
+        because even if the input is in Traditional Chinese ('Hant'), we
+        convert it to Simplified.
+
+    'tokenizer': 'regex', 'jieba', 'mecab', or None
+
+        Indicates the best way we know to separate tokens in the language.
+
+        'regex' is what will be used for most languages, meaning that we can
+        segment the text with a Unicode-aware regular expression. If a language
+        generally uses spaces to separate words, the regex will work well.
+
+        'jieba' and 'mecab' are tokenizers for specific languages written
+        without spaces.
+
+        A tokenizer of None means we don't have a good way to segment the
+        language. We'll use the regex anyway, but the results will be pretty
+        bad.
+
+    'normal_form': 'NFC' or 'NFKC'
+
+        How "should" Unicode be normalized when comparing text in this
+        language? This is not a standard, it's just based on experience.
+        Many languages need NFKC normalization for text comparisons to work
+        properly, but in many European languages, NFKC normalization is
+        excessive and loses information.
+
+    'remove_marks': True or False
+
+        Determines whether marks and decorations, such as vowel points and
+        tatweels, should be removed. True for languages in abjad scripts.
+
+    'dotless_i': True or False
+
+        Is "ı" the lowercase of "I" in this language, as in Turkish?
+
+    'diacritics_under': 'cedillas', 'commas', or None
+
+        Should we convert any diacritics that are under the letters "s" and
+        "t" in this language? 'cedillas' means we should convert commas to
+        cedillas, and 'commas' means we should convert cedillas to commas.
+
+    'transliteration': 'sr-Latn', 'az-Latn', or None
+
+        Indicates a type of transliteration that we should use for normalizing
+        a multi-script language. 'sr-Latn' means to use Serbian romanization,
+        and 'az-Latn' means to use Azerbaijani romanization.
+
+    'lookup_transliteration': 'zh-Hans' or None
+
+        Indicates a lossy transliteration that should be not be used for output,
+        but should be applied when looking up words in a list. 'zh-Hans' means
+        that we should convert Traditional Chinese characters to Simplified.
+    """
+    # The input is probably a string, so parse it into a Language. If it's
+    # already a Language, it will pass through.
+    language = Language.get(language)
+
+    # Assume additional things about the language, such as what script it's in,
+    # using the "likely subtags" table
+    language_full = language.maximize()
+
+    # Start the `info` dictionary with default values, including the 'script'
+    # value that we now know from `language_full`.
+    info = {
+        'script': language_full.script,
+        'tokenizer': 'regex',
+        'normal_form': 'NFKC',
+        'remove_marks': False,
+        'dotless_i': False,
+        'diacritics_under': None,
+        'transliteration': None,
+        'lookup_transliteration': None
+    }
+
+    if _language_in_list(language, ['ja', 'ko']):
+        info['tokenizer'] = 'mecab'
+    elif _language_in_list(language, ['zh', 'yue']):
+        info['tokenizer'] = 'jieba'
+    elif info['script'] in SPACELESS_SCRIPTS:
+        info['tokenizer'] = None
+
+    # Cased alphabetic scripts get NFC normal form
+    if info['script'] in ['Latn', 'Grek', 'Cyrl']:
+        info['normal_form'] = 'NFC'
+
+    if info['script'] in ['Arab', 'Hebr']:
+        info['remove_marks'] = True
+
+    if _language_in_list(language, ['tr', 'az', 'kk']):
+        info['dotless_i'] = True
+        info['diacritics_under'] = 'cedillas'
+    elif _language_in_list(language, ['ro']):
+        info['diacritics_under'] = 'commas'
+
+    if _language_in_list(language, ['sr']):
+        info['transliteration'] = 'sr-Latn'
+    elif _language_in_list(language, ['az']):
+        info['transliteration'] = 'az-Latn'
+
+    if language.language == 'zh' and language.script != 'Hant':
+        info['lookup_transliteration'] = 'zh-Hans'
+
+    return info
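A hedged usage sketch of the new get_language_info function defined above (not part of the commit; the expected values follow from the code and its docstring, assuming langcodes fills in the likely script as described):

    from wordfreq.language_info import get_language_info

    info = get_language_info('tr')
    print(info['tokenizer'])          # 'regex'
    print(info['normal_form'])        # 'NFC', because Turkish is written in Latin script
    print(info['dotless_i'])          # True
    print(info['diacritics_under'])   # 'cedillas'

    print(get_language_info('ja')['tokenizer'])                # 'mecab'
    print(get_language_info('zh')['lookup_transliteration'])   # 'zh-Hans'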
wordfreq/preprocess.py (new file, 265 lines)

@@ -0,0 +1,265 @@
+import regex
+import unicodedata
+
+from .language_info import get_language_info
+from .transliterate import transliterate
+
+MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
+
+DIGIT_RE = regex.compile('\d')
+MULTI_DIGIT_RE = regex.compile('\d[\d.,]+')
+
+
+def preprocess_text(text, language):
+    """
+    This function applies pre-processing steps that convert forms of words
+    considered equivalent into one standardized form.
+
+    As one straightforward step, it case-folds the text. For the purposes of
+    wordfreq and related tools, a capitalized word shouldn't have a different
+    frequency from its lowercase version.
+
+    The steps that are applied in order, only some of which apply to each
+    language, are:
+
+    - NFC or NFKC normalization, as needed for the language
+    - Transliteration of multi-script languages
+    - Abjad mark removal
+    - Case folding
+    - Fixing of diacritics
+
+    We'll describe these steps out of order, to start with the more obvious
+    steps.
+
+
+    Case folding
+    ------------
+
+    The most common effect of this function is that it case-folds alphabetic
+    text to lowercase:
+
+    >>> preprocess_text('Word', 'en')
+    'word'
+
+    This is proper Unicode-aware case-folding, so it eliminates distinctions
+    in lowercase letters that would not appear in uppercase. This accounts for
+    the German ß and the Greek final sigma:
+
+    >>> preprocess_text('groß', 'de')
+    'gross'
+    >>> preprocess_text('λέξις', 'el')
+    'λέξισ'
+
+    In Turkish (and Azerbaijani), case-folding is different, because the
+    uppercase and lowercase I come in two variants, one with a dot and one
+    without. They are matched in a way that preserves the number of dots, which
+    the usual pair of "I" and "i" do not.
+
+    >>> preprocess_text('HAKKINDA İSTANBUL', 'tr')
+    'hakkında istanbul'
+
+
+    Fixing of diacritics
+    --------------------
+
+    While we're talking about Turkish: the Turkish alphabet contains letters
+    with cedillas attached to the bottom. In the case of "ş" and "ţ", these
+    letters are very similar to two Romanian letters, "ș" and "ț", which have
+    separate _commas_ below them.
+
+    (Did you know that a cedilla is not the same as a comma under a letter? I
+    didn't until I started dealing with text normalization. My keyboard layout
+    even inputs a letter with a cedilla when you hit Compose+comma.)
+
+    Because these letters look so similar, and because some fonts only include
+    one pair of letters and not the other, there are many cases where the
+    letters are confused with each other. Our preprocessing normalizes these
+    Turkish and Romanian letters to the letters each language prefers.
+
+    >>> preprocess_text('kișinin', 'tr')   # comma to cedilla
+    'kişinin'
+    >>> preprocess_text('ACELAŞI', 'ro')   # cedilla to comma
+    'același'
+
+
+    Unicode normalization
+    ---------------------
+
+    Unicode text is NFC normalized in most languages, removing trivial
+    distinctions between strings that should be considered equivalent in all
+    cases:
+
+    >>> word = preprocess_text('natu\N{COMBINING DIAERESIS}rlich', 'de')
+    >>> word
+    'natürlich'
+    >>> '\N{LATIN SMALL LETTER U WITH DIAERESIS}' in word
+    True
+
+    NFC normalization is sufficient (and NFKC normalization is a bit too strong)
+    for many languages that are written in cased, alphabetic scripts.
+    Languages in other scripts tend to need stronger normalization to properly
+    compare text. So we use NFC normalization when the language's script is
+    Latin, Greek, or Cyrillic, and we use NFKC normalization for all other
+    languages.
+
+    Here's an example in Japanese, where preprocessing changes the width (and
+    the case) of a Latin letter that's used as part of a word:
+
+    >>> preprocess_text('Uターン', 'ja')
+    'uターン'
+
+    In Korean, NFKC normalization is important because it aligns two different
+    ways of encoding text -- as individual letters that are grouped together
+    into square characters, or as the entire syllables that those characters
+    represent:
+
+    >>> word = '\u1102\u1161\u11c0\u1106\u1161\u11af'
+    >>> word
+    '낱말'
+    >>> len(word)
+    6
+    >>> word = preprocess_text(word, 'ko')
+    >>> word
+    '낱말'
+    >>> len(word)
+    2
+
+
+    Abjad mark removal
+    ------------------
+
+    There are many abjad languages, such as Arabic, Hebrew, Persian, and Urdu,
+    where words can be marked with vowel points but rarely are. In languages
+    that use abjad scripts, we remove all modifiers that are classified by
+    Unicode as "marks". We also remove an Arabic character called the tatweel,
+    which is used to visually lengthen a word.
+
+    >>> preprocess_text("كَلِمَة", 'ar')
+    'كلمة'
+    >>> preprocess_text("الحمــــــد", 'ar')
+    'الحمد'
+
+    Transliteration of multi-script languages
+    -----------------------------------------
+
+    Some languages are written in multiple scripts, and require special care.
+    These languages include Chinese, Serbian, and Azerbaijani.
+
+    In Serbian, there is a well-established mapping from Cyrillic letters to
+    Latin letters. We apply this mapping so that Serbian is always represented
+    in Latin letters.
+
+    >>> preprocess_text('схваташ', 'sr')
+    'shvataš'
+
+    The transliteration is more complete than it needs to be to cover just
+    Serbian, so that -- for example -- borrowings from Russian can be
+    transliterated, instead of coming out in a mixed script.
+
+    >>> preprocess_text('культуры', 'sr')
+    "kul'tury"
+
+    Azerbaijani (Azeri) has a similar transliteration step to Serbian,
+    and then the Latin-alphabet text is handled similarly to Turkish.
+
+    We don't transliterate Traditional to Simplified Chinese in this step.
+    There are some steps where we unify them internally: see chinese.py
+    for more information.
+    """
+    # NFC or NFKC normalization, as needed for the language
+    info = get_language_info(language)
+    text = unicodedata.normalize(info['normal_form'], text)
+
+    # Transliteration of multi-script languages
+    if info['transliteration'] is not None:
+        text = transliterate(info['transliteration'], text)
+
+    # Abjad mark removal
+    if info['remove_marks']:
+        text = remove_marks(text)
+
+    # Case folding
+    if info['dotless_i']:
+        text = casefold_with_i_dots(text)
+    else:
+        text = text.casefold()
+
+    # Fixing of diacritics
+    if info['diacritics_under'] == 'commas':
+        text = cedillas_to_commas(text)
+    elif info['diacritics_under'] == 'cedillas':
+        text = commas_to_cedillas(text)
+
+    return text
+
+
+def remove_marks(text):
+    """
+    Remove decorations from words in abjad scripts:
+
+    - Combining marks of class Mn, which tend to represent non-essential
+      vowel markings.
+    - Tatweels, horizontal segments that are used to extend or justify an
+      Arabic word.
+    """
+    return MARK_RE.sub('', text)
+
+
+def casefold_with_i_dots(text):
+    """
+    Convert capital I's and capital dotted İ's to lowercase in the way
+    that's appropriate for Turkish and related languages, then case-fold
+    the rest of the letters.
+    """
+    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
+    return text.casefold()
+
+
+def commas_to_cedillas(text):
+    """
+    Convert s and t with commas (ș and ț) to cedillas (ş and ţ), which is
+    preferred in Turkish.
+
+    Only the lowercase versions are replaced, because this assumes the
+    text has already been case-folded.
+    """
+    return text.replace(
+        '\N{LATIN SMALL LETTER S WITH COMMA BELOW}',
+        '\N{LATIN SMALL LETTER S WITH CEDILLA}'
+    ).replace(
+        '\N{LATIN SMALL LETTER T WITH COMMA BELOW}',
+        '\N{LATIN SMALL LETTER T WITH CEDILLA}'
+    )
+
+
+def cedillas_to_commas(text):
+    """
+    Convert s and t with cedillas (ş and ţ) to commas (ș and ț), which is
+    preferred in Romanian.
+
+    Only the lowercase versions are replaced, because this assumes the
+    text has already been case-folded.
+    """
+    return text.replace(
+        '\N{LATIN SMALL LETTER S WITH CEDILLA}',
+        '\N{LATIN SMALL LETTER S WITH COMMA BELOW}'
+    ).replace(
+        '\N{LATIN SMALL LETTER T WITH CEDILLA}',
+        '\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
+    )
+
+
+def sub_zeroes(match):
+    """
+    Given a regex match, return what it matched with digits replaced by
+    zeroes.
+    """
+    return DIGIT_RE.sub('0', match.group(0))
+
+
+def smash_numbers(text):
+    """
+    Replace sequences of multiple digits with zeroes, so we don't need to
+    distinguish the frequencies of thousands of numbers.
+    """
+    return MULTI_DIGIT_RE.sub(sub_zeroes, text)
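A hedged usage sketch of the preprocessing helpers defined above (not part of the commit; expected outputs follow the doctests in this file and the updated tests):

    from wordfreq.preprocess import preprocess_text, smash_numbers

    print(preprocess_text('GROSS', 'de'))       # 'gross'
    print(preprocess_text('İSTANBUL', 'tr'))    # 'istanbul', using Turkish dotted-I case folding
    print(smash_numbers('24601'))               # '00000'
    print(smash_numbers('3.14'))                # '0.00'
    print(smash_numbers('1'))                   # '1': single digits are left alone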
wordfreq/tokens.py

@@ -1,30 +1,22 @@
 import regex
 import unicodedata
+import logging
 import langcodes
-from .transliterate import serbian_cyrillic_to_latin
 
-mecab_tokenize = None
-jieba_tokenize = None
+from .language_info import get_language_info, SPACELESS_SCRIPTS
+from .preprocess import preprocess_text, smash_numbers
 
-# See the documentation inside TOKEN_RE for why we have to handle these
-# scripts specially.
-SPACELESS_SCRIPTS = [
-    'Hiragana',
-    'Thai',  # Thai script
-    'Khmr',  # Khmer script
-    'Laoo',  # Lao script
-    'Mymr',  # Burmese script
-    'Tale',  # Tai Le script
-    'Talu',  # Tai Lü script
-    'Lana',  # Lanna script
-]
+# Placeholders for CJK functions that we'll import on demand
+_mecab_tokenize = None
+_jieba_tokenize = None
+_simplify_chinese = None
+
+logger = logging.getLogger(__name__)
 
-ABJAD_LANGUAGES = {
-    'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'
-}
 
 def _make_spaceless_expr():
-    pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in SPACELESS_SCRIPTS]
+    scripts = sorted(SPACELESS_SCRIPTS)
+    pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in scripts]
     return ''.join(pieces)
 
 
@@ -116,10 +108,9 @@ TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
     \w'
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
 
-MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
 
-DIGIT_RE = regex.compile('\d')
-MULTI_DIGIT_RE = regex.compile('\d[\d.,]+')
+# Just identify punctuation, for cases where the tokenizer is separate
+PUNCT_RE = regex.compile(r"[\p{punct}]+")
 
 
 def simple_tokenize(text, include_punctuation=False):
@@ -162,197 +153,27 @@ def simple_tokenize(text, include_punctuation=False):
         for token in TOKEN_RE.findall(text)
     ]
 
-def tokenize_mecab_language(text, lang, include_punctuation=False):
-    """
-    Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
-    """
-    global mecab_tokenize
-    if not (lang == 'ja' or lang == 'ko'):
-        raise ValueError("Only Japanese and Korean can be tokenized using MeCab")
-    if mecab_tokenize is None:
-        from wordfreq.mecab import mecab_tokenize
-    tokens = mecab_tokenize(text, lang)
-    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [token.casefold() for token in tokens if token_expr.match(token)]
 
-def chinese_tokenize(text, include_punctuation=False, external_wordlist=False):
-    """
-    Tokenize Chinese text, initializing the Jieba tokenizer if necessary.
-    """
-    global jieba_tokenize
-    if jieba_tokenize is None:
-        from wordfreq.chinese import jieba_tokenize
-    tokens = jieba_tokenize(text, external_wordlist=external_wordlist)
-    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [token.casefold() for token in tokens if token_expr.match(token)]
-
-
-def remove_marks(text):
-    """
-    Remove decorations from words in abjad scripts:
-
-    - Combining marks of class Mn, which tend to represent non-essential
-      vowel markings.
-    - Tatweels, horizontal segments that are used to extend or justify an
-      Arabic word.
-    """
-    return MARK_RE.sub('', text)
-
-
-def commas_to_cedillas(text):
-    """
-    Convert s and t with commas (ș and ț) to cedillas (ş and ţ), which is
-    preferred in Turkish.
-
-    Only the lowercase versions are replaced, because this assumes the
-    text has already been case-folded.
-    """
-    return text.replace(
-        '\N{LATIN SMALL LETTER S WITH COMMA BELOW}',
-        '\N{LATIN SMALL LETTER S WITH CEDILLA}'
-    ).replace(
-        '\N{LATIN SMALL LETTER T WITH COMMA BELOW}',
-        '\N{LATIN SMALL LETTER T WITH CEDILLA}'
-    )
-
-
-def cedillas_to_commas(text):
-    """
-    Convert s and t with cedillas (ş and ţ) to commas (ș and ț), which is
-    preferred in Romanian.
-
-    Only the lowercase versions are replaced, because this assumes the
-    text has already been case-folded.
-    """
-    return text.replace(
-        '\N{LATIN SMALL LETTER S WITH CEDILLA}',
-        '\N{LATIN SMALL LETTER S WITH COMMA BELOW}'
-    ).replace(
-        '\N{LATIN SMALL LETTER T WITH CEDILLA}',
-        '\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
-    )
-
-
-def preprocess_turkish(text):
-    """
-    Modifies i's so that they case-fold correctly in Turkish, and modifies
-    'comma-below' characters to use cedillas.
-    """
-    text = unicodedata.normalize('NFC', text).replace('İ', 'i').replace('I', 'ı')
-    return commas_to_cedillas(text.casefold())
-
-
-def preprocess_romanian(text):
-    """
-    Modifies the letters ş and ţ (with cedillas) to use commas-below instead.
-    """
-    return cedillas_to_commas(text.casefold())
-
-
-def preprocess_serbian(text):
-    """
-    Serbian is written in two scripts, so transliterate from Cyrillic to Latin
-    (which is the unambiguous direction).
-    """
-    return serbian_cyrillic_to_latin(text)
-
-
-def sub_zeroes(match):
-    """
-    Given a regex match, return what it matched with digits replaced by
-    zeroes.
-    """
-    return DIGIT_RE.sub('0', match.group(0))
-
-
-def smash_numbers(text):
-    """
-    Replace sequences of multiple digits with zeroes, so we don't need to
-    distinguish the frequencies of thousands of numbers.
-    """
-    return MULTI_DIGIT_RE.sub(sub_zeroes, text)
-
-
-def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
-             combine_numbers=False):
+def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     """
     Tokenize this text in a way that's relatively simple but appropriate for
     the language. Strings that are looked up in wordfreq will be run through
     this function first, so that they can be expected to match the data.
 
-    Some of the processing steps are specific to one language, such as Chinese,
-    but what broadly happens to the text depends on what general writing system
-    the language uses, out of these categories:
-
-    - Alphabetic scripts: English, Spanish, Russian, etc.
-    - Abjad scripts: Arabic, Hebrew, Persian, Urdu, etc.
-    - CJK scripts: Chinese, Japanese, Korean
-    - Brahmic scripts: Hindi, Tamil, Telugu, Kannada, etc.
-
-    The options `include_punctuation`, `external_wordlist`, and
-    `combine_numbers` are passed on to the appropriate tokenizer:
-
-    - `include_punctuation` preserves punctuation as tokens, instead of
-      removing it.
-
-    - `external_wordlist` uses the default Jieba wordlist to tokenize Chinese,
-      instead of wordfreq's wordlist.
-
-    - `combine_numbers` replaces multi-digit numbers with strings of zeroes.
-
-
-    Alphabetic scripts
-    ------------------
-
-    The major alphabetic scripts -- Latin, Cyrillic, and Greek -- cover most
-    European languages, which are relatively straightforward to tokenize.
-
-    Text in these scripts will be normalized to NFC form, then passed
-    through a regular expression that implements the Word Segmentation section
-    of Unicode Annex #29, and then case-folded to lowercase.
-
-    The effect is mostly to split the text on spaces and punctuation. There are
-    some subtleties involving apostrophes inside words, which the regex will
-    only split when they occur before a vowel. ("Hasn't" is one token, but
-    "l'enfant" is two.)
-
-    If the language is Turkish, the case-folding rules will take this into
-    account, so that capital I and İ map to ı and i respectively.
-
-
-    Abjad scripts
-    -------------
-
-    Languages in the Arabic or Hebrew scripts are written with optional vowel
-    marks, and sometimes other decorative markings and ligatures. In these
-    languages:
-
-    - The text will be NFKC-normalized, which is a stronger and lossier form
-      than NFC. Here its purpose is to reduce ligatures to simpler characters.
-
-    - Marks will be removed, as well as the Arabic tatweel (an extension of
-      a word that is used for justification or decoration).
-
-    After these steps, the text will go through the same process as the
-    alphabetic scripts above.
-
+    The text will be run through a number of pre-processing steps that vary
+    by language; see the docstring of `wordfreq.preprocess.preprocess_text`.
+
+    If `include_punctuation` is True, punctuation will be included as separate
+    tokens. Otherwise, punctuation will be omitted in the output.
 
     CJK scripts
     -----------
 
     In the CJK languages, word boundaries can't usually be identified by a
     regular expression. Instead, there needs to be some language-specific
-    handling.
-
-    - Chinese text first gets converted to a canonical representation we call
-      "Oversimplified Chinese", where all characters are replaced by their
-      Simplified Chinese form, no matter what, even when this misspells a word or
-      a name. This representation is then tokenized using the Jieba tokenizer,
-      trained on the list of Chinese words that can be looked up in wordfreq.
-
-    - Japanese and Korean will be NFKC-normalized, then tokenized using the
-      MeCab tokenizer, using dictionary files that are included in this
-      package.
+    handling. In Chinese, we use the Jieba tokenizer, with a custom word list
+    to match the words whose frequencies we can look up. In Japanese and
+    Korean, we use the MeCab tokenizer.
 
     The `external_wordlist` option only affects Chinese tokenization. If it's
     True, then wordfreq will not use its own Chinese wordlist for tokenization.
@@ -364,39 +185,64 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
     If you end up seeing tokens that are entire phrases or sentences glued
     together, that probably means you passed in CJK text with the wrong
     language code.
-
-
-    Brahmic scripts and other languages
-    -----------------------------------
-
-    Any kind of language not previously mentioned will just go through the same
-    tokenizer that alphabetic languages use. This includes the Brahmic scripts
-    used in Hindi, Tamil, and Telugu, for example.
-
-    Southeast Asian languages, such as Thai, Khmer, Lao, and Myanmar, are
-    written in Brahmic-derived scripts, but usually *without spaces*. wordfreq
-    does not support these languages yet. It will split on spaces and
-    punctuation, giving tokens that are far too long.
     """
-    # Reduce whatever language code was passed in to a normal form,
-    # containing just the language subtag.
-    lang = langcodes.get(lang).prefer_macrolanguage().language
-    if lang == 'ja' or lang == 'ko':
-        result = tokenize_mecab_language(text, lang, include_punctuation)
-    elif lang == 'zh' or lang == 'yue':
-        result = chinese_tokenize(text, include_punctuation, external_wordlist)
-    elif lang == 'tr':
-        result = simple_tokenize(preprocess_turkish(text), include_punctuation)
-    elif lang == 'ro':
-        result = simple_tokenize(preprocess_romanian(text), include_punctuation)
-    elif lang == 'sr':
-        result = simple_tokenize(preprocess_serbian(text), include_punctuation)
-    elif lang in ABJAD_LANGUAGES:
-        text = remove_marks(unicodedata.normalize('NFKC', text))
-        result = simple_tokenize(text, include_punctuation)
-    else:
-        result = simple_tokenize(text, include_punctuation)
+    # Use globals to load CJK tokenizers on demand, so that we can still run
+    # in environments that lack the CJK dependencies
+    global _mecab_tokenize, _jieba_tokenize
 
-    if combine_numbers:
-        result = [smash_numbers(token) for token in result]
-    return result
+    language = langcodes.get(lang)
+    info = get_language_info(language)
+    text = preprocess_text(text, language)
+
+    if info['tokenizer'] == 'mecab':
+        from wordfreq.mecab import mecab_tokenize as _mecab_tokenize
+        # Get just the language code out of the Language object, so we can
+        # use it to select a MeCab dictionary
+        tokens = _mecab_tokenize(text, language.language)
+        if not include_punctuation:
+            tokens = [token for token in tokens if not PUNCT_RE.match(token)]
+    elif info['tokenizer'] == 'jieba':
+        from wordfreq.chinese import jieba_tokenize as _jieba_tokenize
+        tokens = _jieba_tokenize(text, external_wordlist=external_wordlist)
+        if not include_punctuation:
+            tokens = [token for token in tokens if not PUNCT_RE.match(token)]
+    else:
+        # This is the default case where we use the regex tokenizer. First
+        # let's complain a bit if we ended up here because we don't have an
+        # appropriate tokenizer.
+        if info['tokenizer'] != 'regex':
+            logger.warning(
+                "The language '{}' is in the '{}' script, which we don't "
+                "have a tokenizer for. The results will be bad."
+                .format(lang, info['script'])
+            )
+        tokens = simple_tokenize(text, include_punctuation=include_punctuation)
+
+    return tokens
+
+
+def lossy_tokenize(text, lang, include_punctuation=False, external_wordlist=False):
+    """
+    Get a list of tokens for this text, with largely the same results and
+    options as `tokenize`, but aggressively normalize some text in a lossy way
+    that's good for counting word frequencies.
+
+    In particular:
+
+    - If a token has 2 adjacent digits, all its digits will be replaced with
+      the digit '0', so that frequencies for numbers don't have to be counted
+      separately. This is similar to word2vec, which replaces them with '#'.
+
+    - In Chinese, unless Traditional Chinese is specifically requested using
+      'zh-Hant', all characters will be converted to Simplified Chinese.
+    """
+    global _simplify_chinese
+
+    info = get_language_info(lang)
+    tokens = tokenize(text, lang, include_punctuation, external_wordlist)
+
+    if info['lookup_transliteration'] == 'zh-Hans':
+        from wordfreq.chinese import simplify_chinese as _simplify_chinese
+        tokens = [_simplify_chinese(token) for token in tokens]
+
+    return [smash_numbers(token) for token in tokens]
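A hedged sketch of how the tokenize / lossy_tokenize split behaves after this commit, mirroring the updated test_number_smashing assertions (not part of the diff):

    from wordfreq.tokens import tokenize, lossy_tokenize

    text = '"715 - CRΣΣKS" by Bon Iver'
    print(tokenize(text, 'en'))         # ['715', 'crσσks', 'by', 'bon', 'iver']
    print(lossy_tokenize(text, 'en'))   # ['000', 'crσσks', 'by', 'bon', 'iver']
    print(lossy_tokenize(text, 'en', include_punctuation=True))
    # ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver']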
wordfreq/transliterate.py

@@ -1,6 +1,8 @@
 # This table comes from https://github.com/opendatakosovo/cyrillic-transliteration/blob/master/cyrtranslit/mapping.py,
-# from the 'cyrtranslit' module, which can't currently be imported in Python 3.
-SR_CYRL_TO_LATN_DICT = {
+# from the 'cyrtranslit' module. We originally had to reimplement it because
+# 'cyrtranslit' didn't work in Python 3; now it does, but we've made the table
+# more robust than the one in cyrtranslit.
+SR_LATN_TABLE = {
     ord('А'): 'A', ord('а'): 'a',
     ord('Б'): 'B', ord('б'): 'b',
     ord('В'): 'V', ord('в'): 'v',
@@ -55,7 +57,7 @@ SR_CYRL_TO_LATN_DICT = {
     # Ukrainian letters
     ord('Є'): 'Je', ord('є'): 'je',
     ord('І'): 'I', ord('і'): 'i',
-    ord('Ї'): 'Ji', ord('ї'): 'ji',
+    ord('Ї'): 'Ï', ord('ї'): 'ï',
     ord('Ґ'): 'G', ord('ґ'): 'g',
 
     # Macedonian letters
@@ -64,7 +66,43 @@ SR_CYRL_TO_LATN_DICT = {
     ord('Ќ'): 'Ḱ', ord('ќ'): 'ḱ',
 }
 
+AZ_LATN_TABLE = SR_LATN_TABLE.copy()
+AZ_LATN_TABLE.update({
+    # Distinct Azerbaijani letters
+    ord('Ҹ'): 'C', ord('ҹ'): 'c',
+    ord('Ә'): 'Ə', ord('ә'): 'ə',
+    ord('Ғ'): 'Ğ', ord('ғ'): 'ğ',
+    ord('Һ'): 'H', ord('һ'): 'h',
+    ord('Ө'): 'Ö', ord('ө'): 'ö',
+    ord('Ҝ'): 'G', ord('ҝ'): 'g',
+    ord('Ү'): 'Ü', ord('ү'): 'ü',
 
-def serbian_cyrillic_to_latin(text):
-    return text.translate(SR_CYRL_TO_LATN_DICT)
+    # Azerbaijani letters with different transliterations
+    ord('Ч'): 'Ç', ord('ч'): 'ç',
+    ord('Х'): 'X', ord('х'): 'x',
+    ord('Ы'): 'I', ord('ы'): 'ı',
+    ord('И'): 'İ', ord('и'): 'ı',
+    ord('Ж'): 'J', ord('ж'): 'j',
+    ord('Ј'): 'Y', ord('ј'): 'y',
+    ord('Г'): 'Q', ord('г'): 'q',
+    ord('Ш'): 'Ş', ord('ш'): 'ş',
+})
+
+
+def transliterate(table, text):
+    """
+    Transliterate text according to one of the tables above.
+
+    `table` chooses the table. It looks like a language code but comes from a
+    very restricted set:
+
+    - 'sr-Latn' means to convert Serbian, which may be in Cyrillic, into the
+      Latin alphabet.
+    - 'az-Latn' means the same for Azerbaijani Cyrillic to Latn.
+    """
+    if table == 'sr-Latn':
+        return text.translate(SR_LATN_TABLE)
+    elif table == 'az-Latn':
+        return text.translate(AZ_LATN_TABLE)
+    else:
+        raise ValueError("Unknown transliteration table: {!r}".format(table))
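The tables above map Unicode code points to replacement strings, so str.translate does all the work. A minimal sketch of that mechanism with a made-up miniature table (not the full mapping from the commit):

    # A few entries in the style of SR_LATN_TABLE; the real table is much larger.
    mini_table = {
        ord('с'): 's', ord('х'): 'h', ord('в'): 'v',
        ord('а'): 'a', ord('т'): 't', ord('ш'): 'š',
    }
    print('схваташ'.translate(mini_table))   # 'shvataš', matching the doctest in preprocess.py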