Bake the 'h special case into the regex

This lets me remove the French-specific code I just put in.
2024-12-23 09:21:37 +00:00 · 2016-12-06 17:37:35 -05:00 · 2016-12-06 17:37:35 -05:00 · 21a78f5eb9
commit 21a78f5eb9
parent 82eba05f2d
2 changed files with 37 additions and 60 deletions
--- a/tests/test_french_and_related.py
+++ b/tests/test_french_and_related.py
@@ -3,23 +3,27 @@ from wordfreq import tokenize, word_frequency


 def test_apostrophes():
-    for lang in ('fr', 'ca', 'oc'):
-        eq_(tokenize("langues d'oïl", lang),
+    # Test that we handle apostrophes in French reasonably.
+    eq_(tokenize("qu'un", 'fr'), ['qu', 'un'])
+    eq_(tokenize("qu'un", 'fr', include_punctuation=True),
+        ["qu'", "un"])
+    eq_(tokenize("langues d'oïl", 'fr'),
        ['langues', "d", 'oïl'])
-        eq_(tokenize("langues d'oïl", lang, include_punctuation=True),
+    eq_(tokenize("langues d'oïl", 'fr', include_punctuation=True),
        ['langues', "d'", 'oïl'])
-        eq_(tokenize("l'heure", lang),
+    eq_(tokenize("l'heure", 'fr'),
        ['l', 'heure'])
-        eq_(tokenize("l'heure", lang, include_punctuation=True),
+    eq_(tokenize("l'heure", 'fr', include_punctuation=True),
        ["l'", 'heure'])
-        eq_(tokenize("L'Hôpital", lang, include_punctuation=True),
+    eq_(tokenize("L'Hôpital", 'fr', include_punctuation=True),
        ["l'", 'hôpital'])
-        eq_(tokenize("This isn't French", lang),
+    eq_(tokenize("This isn't French", 'en'),
        ['this', "isn't", 'french'])


-def test_catalan():
-    # Catalan orthography is fiddly. Test that we get a short sentence right.
+def test_catastrophes():
+    # More apostrophes, but this time they're in Catalan, and there's other
+    # mid-word punctuation going on too.
    eq_(tokenize("M'acabo d'instal·lar.", 'ca'),
        ['m', 'acabo', 'd', 'instal·lar'])
    eq_(tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True),
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -22,23 +22,6 @@ ABJAD_LANGUAGES = {
    'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'
 }

-# Languages that can stick particles such as «l'» onto a word starting with
-# a vowel sound, and where this vowel sound can follow a silent «h». These
-# are French and related languages.
-FRENCH_ISH_LANGUAGES = {
-    'fr',   # French
-    'ca',   # Catalan
-    'frp',  # Franco-Provençal or Arpitan
-    'nrf',  # Norman French / Jèrriais / Guernésiais
-    'oc',   # Occitan
-    'pcd',  # Picard
-    'wa',   # Walloon
-
-    'frm',  # Middle French
-    'fro',  # Old French
-}
-
-
 def _make_spaceless_expr():
    pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in SPACELESS_SCRIPTS]
    return ''.join(pieces)
@@ -76,6 +59,13 @@ TOKEN_RE = regex.compile(r"""
    # Case 2: standard Unicode segmentation
    # -------------------------------------

+    # The start of the token must not be a word character followed by «'h» or
+    # «'H». If it is, we should use Case 3 to match up to the apostrophe, then
+    # match a new token starting with «h». This rule lets us break «l'heure»
+    # into two tokens, just like we would do for «l'arc».
+
+    (?!\w'[Hh])
+
    # The start of the token must be 'word-like', not punctuation or whitespace
    # or various other things. However, we allow characters of category So
    # (Symbol - Other) because many of these are emoji, which can convey
@@ -87,13 +77,22 @@ TOKEN_RE = regex.compile(r"""
    # (\S) and do not cause word breaks according to the Unicode word
    # segmentation heuristic (\B), or are categorized as Marks (\p{M}).

-    (?:\B\S|\p{M})*
+    (?:\B\S|\p{M})* |
+
+    # Case 3: Fix French
+    # ------------------
+    # This allows us to match elided articles and particles such as «l'» in
+    # French, Catalan, and related languages, which we may have excluded from
+    # being part of the token in Case 2.
+
+    \w'
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)

 TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
    [<SPACELESS>]+ |
    [\p{punct}]+ |
-    \S(?:\B\S|\p{M})*
+    (?!\w'[Hh]) \S(?:\B\S|\p{M})* |
+    \w'
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)

 MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
@@ -160,30 +159,6 @@ def romanian_tokenize(text, include_punctuation=False):
    ]


-def french_tokenize(text, include_punctuation=False):
-    """
-    Handle French apostrophes that precede an 'h', which should work the same as
-    before a vowel, which the Unicode Consortium forgot. "l'heure" should tokenize
-    as "l'" and "heure".
-
-    This also applies the same way to other languages such as Catalan.
-    """
-    tokens = []
-    for token in simple_tokenize(text, include_punctuation):
-        if "'h" in token:
-            idx = token.find("'h")
-            if include_punctuation:
-                # Only include the apostrophe in the token if
-                # include_punctuation is True
-                tokens.append(token[:idx + 1])
-            else:
-                tokens.append(token[:idx])
-            tokens.append(token[idx + 1:])
-        else:
-            tokens.append(token)
-    return tokens
-
-
 def tokenize_mecab_language(text, lang, include_punctuation=False):
    """
    Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
@@ -360,8 +335,6 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
        return turkish_tokenize(text, include_punctuation)
    elif lang == 'ro':
        return romanian_tokenize(text, include_punctuation)
-    elif lang in FRENCH_ISH_LANGUAGES:
-        return french_tokenize(text, include_punctuation)
    elif lang in ABJAD_LANGUAGES:
        text = remove_marks(unicodedata.normalize('NFKC', text))
        return simple_tokenize(text, include_punctuation)