From a92c805a823fea0419c7d97967def8c3b3eb5ece Mon Sep 17 00:00:00 2001
From: Rob Speer
Date: Mon, 5 Dec 2016 18:40:53 -0500
Subject: [PATCH] fix tokenization of words like "l'heure"

---
 setup.py           |  2 +-
 wordfreq/tokens.py | 48 ++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index 593c435..7f7b124 100755
--- a/setup.py
+++ b/setup.py
@@ -34,7 +34,7 @@ if sys.version_info < (3, 4):
 
 setup(
     name="wordfreq",
-    version='1.5.1',
+    version='1.6',
     maintainer='Luminoso Technologies, Inc.',
     maintainer_email='info@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index a2d67bf..5cf55f9 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -22,6 +22,22 @@ ABJAD_LANGUAGES = {
     'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'
 }
 
+# Languages that can stick particles such as «l'» onto a word starting with
+# a vowel sound, and where this vowel sound can follow a silent «h». These
+# are French and related languages.
+FRENCH_ISH_LANGUAGES = {
+    'fr',   # French
+    'ca',   # Catalan
+    'frp',  # Franco-Provençal or Arpitan
+    'nrf',  # Norman French / Jèrriais / Guernésiais
+    'oc',   # Occitan
+    'pcd',  # Picard
+    'wa',   # Walloon
+
+    'frm',  # Middle French
+    'fro',  # Old French
+}
+
 
 def _make_spaceless_expr():
     pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in SPACELESS_SCRIPTS]
@@ -113,8 +129,10 @@ def simple_tokenize(text, include_punctuation=False):
       would end up in its own token, which is worse.
     """
     text = unicodedata.normalize('NFC', text)
-    token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
-    return [token.strip("'").casefold() for token in token_expr.findall(text)]
+    if include_punctuation:
+        return [token.casefold() for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)]
+    else:
+        return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
 
 
 def turkish_tokenize(text, include_punctuation=False):
@@ -142,6 +160,30 @@ def romanian_tokenize(text, include_punctuation=False):
     ]
 
 
+def french_tokenize(text, include_punctuation=False):
+    """
+    Handle French elision apostrophes that precede an 'h', which should be
+    treated the same as apostrophes before a vowel (a case the Unicode
+    Consortium overlooked): "l'heure" should tokenize as "l'" and "heure".
+
+    This applies in the same way to related languages such as Catalan.
+    """
+    tokens = []
+    for token in simple_tokenize(text, include_punctuation):
+        if "'h" in token:
+            idx = token.find("'h")
+            if include_punctuation:
+                # Only include the apostrophe in the token if
+                # include_punctuation is True
+                tokens.append(token[:idx + 1])
+            else:
+                tokens.append(token[:idx])
+            tokens.append(token[idx + 1:])
+        else:
+            tokens.append(token)
+    return tokens
+
+
 def tokenize_mecab_language(text, lang, include_punctuation=False):
     """
     Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
@@ -318,6 +360,8 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
         return turkish_tokenize(text, include_punctuation)
     elif lang == 'ro':
         return romanian_tokenize(text, include_punctuation)
+    elif lang in FRENCH_ISH_LANGUAGES:
+        return french_tokenize(text, include_punctuation)
     elif lang in ABJAD_LANGUAGES:
        text = remove_marks(unicodedata.normalize('NFKC', text))
        return simple_tokenize(text, include_punctuation)
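
As a sanity check of the behaviour this patch describes, here is a minimal sketch of how the new code path could be exercised once the patch is applied. The expected outputs are inferred from the `french_tokenize` logic above (assuming the default token pattern keeps «l'heure» as one token before the split), not captured from a real run:

```python
from wordfreq import tokenize
from wordfreq.tokens import french_tokenize

# «l'heure»: the apostrophe-before-'h' case this patch fixes.
# Without punctuation, the elided article is kept but loses its apostrophe.
print(tokenize("l'heure", 'fr'))                             # expected: ['l', 'heure']

# With include_punctuation=True, the apostrophe stays attached to the article.
print(french_tokenize("l'heure", include_punctuation=True))  # expected: ["l'", 'heure']

# Catalan is in FRENCH_ISH_LANGUAGES, so it takes the same code path.
print(tokenize("l'home", 'ca'))                              # expected: ['l', 'home']
```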