mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
fix tokenization of words like "l'heure"
This commit is contained in:
parent
7f26270644
commit
596368ac6e
2
setup.py
2
setup.py
@@ -34,7 +34,7 @@ if sys.version_info < (3, 4):
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="wordfreq",
|
name="wordfreq",
|
||||||
version='1.5.1',
|
version='1.6',
|
||||||
maintainer='Luminoso Technologies, Inc.',
|
maintainer='Luminoso Technologies, Inc.',
|
||||||
maintainer_email='info@luminoso.com',
|
maintainer_email='info@luminoso.com',
|
||||||
url='http://github.com/LuminosoInsight/wordfreq/',
|
url='http://github.com/LuminosoInsight/wordfreq/',
|
||||||
|
@@ -22,6 +22,22 @@ ABJAD_LANGUAGES = {
|
|||||||
'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'
|
'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Languages that can stick particles such as «l'» onto a word starting with
# a vowel sound, and where this vowel sound can follow a silent «h». These
# are French and related languages.
#
# Each entry is a language code; tokenize() checks membership in this set to
# decide whether to route text through french_tokenize().
FRENCH_ISH_LANGUAGES = {
    'fr',   # French
    'ca',   # Catalan
    'frp',  # Franco-Provençal or Arpitan
    'nrf',  # Norman French / Jèrriais / Guernésiais
    'oc',   # Occitan
    'pcd',  # Picard
    'wa',   # Walloon

    # Earlier historical stages of French
    'frm',  # Middle French
    'fro',  # Old French
}
|
||||||
|
|
||||||
|
|
||||||
def _make_spaceless_expr():
|
def _make_spaceless_expr():
|
||||||
pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in SPACELESS_SCRIPTS]
|
pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in SPACELESS_SCRIPTS]
|
||||||
@@ -113,8 +129,10 @@ def simple_tokenize(text, include_punctuation=False):
|
|||||||
would end up in its own token, which is worse.
|
would end up in its own token, which is worse.
|
||||||
"""
|
"""
|
||||||
text = unicodedata.normalize('NFC', text)
|
text = unicodedata.normalize('NFC', text)
|
||||||
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
|
if include_punctuation:
|
||||||
return [token.strip("'").casefold() for token in token_expr.findall(text)]
|
return [token.casefold() for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)]
|
||||||
|
else:
|
||||||
|
return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
|
||||||
|
|
||||||
|
|
||||||
def turkish_tokenize(text, include_punctuation=False):
|
def turkish_tokenize(text, include_punctuation=False):
|
||||||
@@ -142,6 +160,30 @@ def romanian_tokenize(text, include_punctuation=False):
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def french_tokenize(text, include_punctuation=False):
    """
    Tokenize French-like text, additionally splitting a token at an
    apostrophe that is directly followed by 'h'.

    The text is first run through simple_tokenize(). Any resulting token
    that contains "'h" is divided in two at the first such occurrence, so
    that "l'heure" yields "l'" and "heure" when `include_punctuation` is
    true, or "l" and "heure" when it is false. The part after the split
    always begins with the 'h'. Tokens without "'h" pass through unchanged.

    This also applies the same way to other languages such as Catalan.

    NOTE(review): the split is applied to every token containing "'h",
    whatever precedes the apostrophe. If simple_tokenize() leaves a word
    such as "aujourd'hui" whole, it would come out as two tokens here --
    confirm that this is intended.
    """
    result = []
    for word in simple_tokenize(text, include_punctuation):
        particle, marker, remainder = word.partition("'h")
        if not marker:
            # No apostrophe-h sequence: keep the token as it is.
            result.append(word)
            continue
        if include_punctuation:
            # The apostrophe stays attached to the particle only when
            # punctuation is being kept.
            particle += "'"
        result.append(particle)
        result.append('h' + remainder)
    return result
|
||||||
|
|
||||||
|
|
||||||
def tokenize_mecab_language(text, lang, include_punctuation=False):
|
def tokenize_mecab_language(text, lang, include_punctuation=False):
|
||||||
"""
|
"""
|
||||||
Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
|
Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
|
||||||
@@ -318,6 +360,8 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
|
|||||||
return turkish_tokenize(text, include_punctuation)
|
return turkish_tokenize(text, include_punctuation)
|
||||||
elif lang == 'ro':
|
elif lang == 'ro':
|
||||||
return romanian_tokenize(text, include_punctuation)
|
return romanian_tokenize(text, include_punctuation)
|
||||||
|
elif lang in FRENCH_ISH_LANGUAGES:
|
||||||
|
return french_tokenize(text, include_punctuation)
|
||||||
elif lang in ABJAD_LANGUAGES:
|
elif lang in ABJAD_LANGUAGES:
|
||||||
text = remove_marks(unicodedata.normalize('NFKC', text))
|
text = remove_marks(unicodedata.normalize('NFKC', text))
|
||||||
return simple_tokenize(text, include_punctuation)
|
return simple_tokenize(text, include_punctuation)
|
||||||
|
Loading…
Reference in New Issue
Block a user