mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-24 09:51:38 +00:00
fix tokenization of words like "l'heure"
This commit is contained in:
parent
f6f0914e81
commit
a92c805a82
2
setup.py
2
setup.py
@ -34,7 +34,7 @@ if sys.version_info < (3, 4):
|
||||
|
||||
setup(
|
||||
name="wordfreq",
|
||||
version='1.5.1',
|
||||
version='1.6',
|
||||
maintainer='Luminoso Technologies, Inc.',
|
||||
maintainer_email='info@luminoso.com',
|
||||
url='http://github.com/LuminosoInsight/wordfreq/',
|
||||
|
@ -22,6 +22,22 @@ ABJAD_LANGUAGES = {
|
||||
'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'
|
||||
}
|
||||
|
||||
# Languages whose orthography can glue a particle such as «l'» onto a
# following word that begins with a vowel sound — including a vowel sound
# spelled with a silent «h». These are French and its close relatives,
# plus the historical stages of French.
#
#   fr   French                oc   Occitan
#   ca   Catalan               pcd  Picard
#   frp  Franco-Provençal      wa   Walloon
#        (Arpitan)             frm  Middle French
#   nrf  Norman French         fro  Old French
#        (Jèrriais/Guernésiais)
FRENCH_ISH_LANGUAGES = set(
    'fr ca frp nrf oc pcd wa frm fro'.split()
)
|
||||
|
||||
|
||||
def _make_spaceless_expr():
|
||||
pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in SPACELESS_SCRIPTS]
|
||||
@ -113,8 +129,10 @@ def simple_tokenize(text, include_punctuation=False):
|
||||
would end up in its own token, which is worse.
|
||||
"""
|
||||
text = unicodedata.normalize('NFC', text)
|
||||
token_expr = TOKEN_RE_WITH_PUNCTUATION if include_punctuation else TOKEN_RE
|
||||
return [token.strip("'").casefold() for token in token_expr.findall(text)]
|
||||
if include_punctuation:
|
||||
return [token.casefold() for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)]
|
||||
else:
|
||||
return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
|
||||
|
||||
|
||||
def turkish_tokenize(text, include_punctuation=False):
|
||||
@ -142,6 +160,30 @@ def romanian_tokenize(text, include_punctuation=False):
|
||||
]
|
||||
|
||||
|
||||
def french_tokenize(text, include_punctuation=False):
    """
    Tokenize text in French and French-like languages, splitting apostrophe
    particles before a silent 'h'.

    Unicode's default word segmentation keeps "l'heure" together as one
    token, unlike "l'arbre", because the rule that splits at the apostrophe
    only fires before a vowel letter. Here we post-process the default
    tokenization so that "l'heure" becomes "l'" and "heure", matching what
    happens before a vowel.

    This applies the same way to related languages such as Catalan.
    """
    result = []
    for tok in simple_tokenize(text, include_punctuation):
        split_at = tok.find("'h")
        if split_at == -1:
            # No apostrophe-before-h sequence: pass the token through.
            result.append(tok)
            continue
        # The apostrophe stays attached to the particle ("l'") only when
        # we are keeping punctuation in the output tokens.
        head_end = split_at + 1 if include_punctuation else split_at
        result.append(tok[:head_end])
        result.append(tok[split_at + 1:])
    return result
|
||||
|
||||
|
||||
def tokenize_mecab_language(text, lang, include_punctuation=False):
|
||||
"""
|
||||
Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
|
||||
@ -318,6 +360,8 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
|
||||
return turkish_tokenize(text, include_punctuation)
|
||||
elif lang == 'ro':
|
||||
return romanian_tokenize(text, include_punctuation)
|
||||
elif lang in FRENCH_ISH_LANGUAGES:
|
||||
return french_tokenize(text, include_punctuation)
|
||||
elif lang in ABJAD_LANGUAGES:
|
||||
text = remove_marks(unicodedata.normalize('NFKC', text))
|
||||
return simple_tokenize(text, include_punctuation)
|
||||
|
Loading…
Reference in New Issue
Block a user