Code review fixes: avoid repeatedly constructing sets

Former-commit-id: 1a16b0f84c
Robyn Speer 2016-07-29 12:32:26 -04:00
parent 68c6d95131
commit 15667ea023

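The change is small: the inline abjad-language set moves into a module-level ABJAD_LANGUAGES constant, and the two-element MeCab check becomes two direct comparisons, so no set has to be built each time the functions run. A minimal sketch of the before/after shape, using illustrative names and language lists rather than the real wordfreq code:

    # Illustrative sketch only; function names and language codes are placeholders.

    # Before: the membership test carries an inline set literal.
    def is_abjad_before(lang):
        return lang in {'ar', 'fa', 'he', 'ur'}

    # After: the set is defined once at import time and reused by name.
    ABJAD_LANGUAGES = {'ar', 'fa', 'he', 'ur'}

    def is_abjad_after(lang):
        return lang in ABJAD_LANGUAGES

    # For a two-value check like the MeCab guard, plain comparisons avoid
    # building any container at all.
    def is_mecab_language(lang):
        return lang == 'ja' or lang == 'ko'

The hunks below apply the same shape to the real code.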

@@ -18,6 +18,10 @@ SPACELESS_SCRIPTS = [
     'Lana',  # Lanna script
 ]
 
+ABJAD_LANGUAGES = {
+    'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'
+}
+
 def _make_spaceless_expr():
     pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in SPACELESS_SCRIPTS]
@@ -143,7 +147,7 @@ def tokenize_mecab_language(text, lang, include_punctuation=False):
     Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
     """
     global mecab_tokenize
-    if lang not in {'ja', 'ko'}:
+    if not (lang == 'ja' or lang == 'ko'):
         raise ValueError("Only Japanese and Korean can be tokenized using MeCab")
     if mecab_tokenize is None:
         from wordfreq.mecab import mecab_tokenize
@@ -180,6 +184,9 @@ def commas_to_cedillas(text):
     """
     Convert s and t with commas (ș and ț) to cedillas (ş and ţ), which is
     preferred in Turkish.
+
+    Only the lowercase versions are replaced, because this assumes the
+    text has already been case-folded.
     """
     return text.replace(
         '\N{LATIN SMALL LETTER S WITH COMMA BELOW}',
@@ -194,6 +201,9 @@ def cedillas_to_commas(text):
     """
     Convert s and t with cedillas (ş and ţ) to commas (ș and ț), which is
     preferred in Romanian.
+
+    Only the lowercase versions are replaced, because this assumes the
+    text has already been case-folded.
     """
     return text.replace(
         '\N{LATIN SMALL LETTER S WITH CEDILLA}',
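The two docstring additions above record the same assumption: only the lowercase ș/ț and ş/ţ are swapped, because the callers are expected to have case-folded the text already. A small usage sketch; the import path wordfreq.tokens is an assumption here, since the diff view does not name the file being edited:

    # Assumed import path; the helpers are defined in the file this diff edits.
    from wordfreq.tokens import commas_to_cedillas, cedillas_to_commas

    text = 'Școală și țară'.casefold()   # case-fold first, as the docstrings assume
    print(commas_to_cedillas(text))      # 'şcoală şi ţară' (comma-below -> cedilla)
    print(cedillas_to_commas('paşa'))    # 'pașa' (cedilla -> comma-below)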
@@ -308,8 +318,7 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
         return turkish_tokenize(text, include_punctuation)
     elif lang == 'ro':
         return romanian_tokenize(text, include_punctuation)
-    elif lang in {'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'}:
-        # Abjad languages
+    elif lang in ABJAD_LANGUAGES:
         text = remove_marks(unicodedata.normalize('NFKC', text))
         return simple_tokenize(text, include_punctuation)
     else:
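With this last hunk, tokenize() routes the abjad-script languages through the shared ABJAD_LANGUAGES constant instead of repeating the literal, and the now-redundant '# Abjad languages' comment goes away. A hedged usage sketch of that path, using wordfreq's public tokenize entry point; the exact tokens can vary by wordfreq version:

    from wordfreq import tokenize

    # Arabic ('ar') is in ABJAD_LANGUAGES, so the text is NFKC-normalized and
    # its vowel marks are removed before simple_tokenize splits it.
    print(tokenize('العَرَبِيَّة', 'ar'))   # e.g. ['العربية'] once the marks are stripped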