Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 17:31:41 +00:00)
Code review fixes: avoid repeatedly constructing sets

Former-commit-id: 1a16b0f84c
parent 68c6d95131
commit 15667ea023
@@ -18,6 +18,10 @@ SPACELESS_SCRIPTS = [
     'Lana', # Lanna script
 ]
 
+ABJAD_LANGUAGES = {
+    'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'
+}
+
 
 def _make_spaceless_expr():
     pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in SPACELESS_SCRIPTS]
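For context, the pattern this commit applies: a set literal written inside a function is, in general, rebuilt on every call, while a module-level constant like ABJAD_LANGUAGES is built once at import time and gives the value a descriptive name. (CPython's peephole optimizer can already fold a constant set literal in a membership test into a frozenset, so part of the win is readability.) A minimal sketch with hypothetical names:

    # Rebuilt per call, unless the optimizer folds it into a frozenset constant:
    def is_abjad_inline(lang):
        return lang in {'ar', 'fa', 'he', 'ur'}

    # Built once at import time; the name documents the intent:
    ABJAD = frozenset({'ar', 'fa', 'he', 'ur'})

    def is_abjad(lang):
        return lang in ABJAD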
@@ -143,7 +147,7 @@ def tokenize_mecab_language(text, lang, include_punctuation=False):
     Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
     """
     global mecab_tokenize
-    if lang not in {'ja', 'ko'}:
+    if not (lang == 'ja' or lang == 'ko'):
         raise ValueError("Only Japanese and Korean can be tokenized using MeCab")
     if mecab_tokenize is None:
         from wordfreq.mecab import mecab_tokenize
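A quick usage sketch of the guard this hunk rewrites, assuming the function lives in wordfreq.tokens as the hunk header suggests and that MeCab plus a Japanese dictionary are installed:

    from wordfreq.tokens import tokenize_mecab_language

    # Japanese and Korean are the only codes the guard lets through.
    print(tokenize_mecab_language('私はガラスを食べられます', 'ja'))

    try:
        tokenize_mecab_language('你好', 'zh')
    except ValueError as err:
        print(err)  # Only Japanese and Korean can be tokenized using MeCab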
@@ -180,6 +184,9 @@ def commas_to_cedillas(text):
     """
     Convert s and t with commas (ș and ț) to cedillas (ş and ţ), which is
     preferred in Turkish.
+
+    Only the lowercase versions are replaced, because this assumes the
+    text has already been case-folded.
     """
     return text.replace(
         '\N{LATIN SMALL LETTER S WITH COMMA BELOW}',
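The hunk cuts off before the rest of the replace chain; presumably it pairs each comma-below letter with its cedilla counterpart. A self-contained sketch of that behavior (the function name here is hypothetical):

    def commas_to_cedillas_sketch(text):
        # Presumed full body: s and t with comma below -> cedilla forms.
        return text.replace(
            '\N{LATIN SMALL LETTER S WITH COMMA BELOW}',
            '\N{LATIN SMALL LETTER S WITH CEDILLA}'
        ).replace(
            '\N{LATIN SMALL LETTER T WITH COMMA BELOW}',
            '\N{LATIN SMALL LETTER T WITH CEDILLA}'
        )

    print(commas_to_cedillas_sketch('știință'))  # -> 'ştiinţă'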
@@ -194,6 +201,9 @@ def cedillas_to_commas(text):
     """
     Convert s and t with cedillas (ş and ţ) to commas (ș and ț), which is
     preferred in Romanian.
+
+    Only the lowercase versions are replaced, because this assumes the
+    text has already been case-folded.
     """
     return text.replace(
         '\N{LATIN SMALL LETTER S WITH CEDILLA}',
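cedillas_to_commas is the mirror image, which is why its docstring gets the same two-line addition. The case-folding caveat is observable: only lowercase forms change. Assuming a body symmetric to the sketch above:

    def cedillas_to_commas_sketch(text):
        # Inverse mapping: cedilla forms back to comma-below forms.
        return text.replace(
            '\N{LATIN SMALL LETTER S WITH CEDILLA}',
            '\N{LATIN SMALL LETTER S WITH COMMA BELOW}'
        ).replace(
            '\N{LATIN SMALL LETTER T WITH CEDILLA}',
            '\N{LATIN SMALL LETTER T WITH COMMA BELOW}'
        )

    print(cedillas_to_commas_sketch('ştiinţă'))   # -> 'știință'
    print(cedillas_to_commas_sketch('ŞTIINŢĂ'))   # unchanged: uppercase is left alone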
@@ -308,8 +318,7 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
         return turkish_tokenize(text, include_punctuation)
     elif lang == 'ro':
         return romanian_tokenize(text, include_punctuation)
-    elif lang in {'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'}:
-        # Abjad languages
+    elif lang in ABJAD_LANGUAGES:
         text = remove_marks(unicodedata.normalize('NFKC', text))
         return simple_tokenize(text, include_punctuation)
     else:
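The abjad branch NFKC-normalizes the text and strips marks before handing it to simple_tokenize. remove_marks itself is not shown in this diff; a rough stand-in that drops nonspacing marks (Unicode category Mn), which is its visible effect on Arabic or Hebrew vowel points:

    import unicodedata

    def strip_marks_sketch(text):
        # Stand-in for remove_marks: drop nonspacing marks (category Mn),
        # e.g. Arabic harakat or Hebrew niqqud.
        normalized = unicodedata.normalize('NFKC', text)
        return ''.join(ch for ch in normalized
                       if unicodedata.category(ch) != 'Mn')

    print(strip_marks_sketch('كِتَابٌ'))  # -> 'كتاب', vowel points removed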