Bake the 'h special case into the regex

This lets me remove the French-specific code I just put in.
Robyn Speer 2016-12-06 17:37:35 -05:00
parent 82eba05f2d
commit 21a78f5eb9
2 changed files with 37 additions and 60 deletions
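
What the change does, in miniature: under the 'regex' module's Unicode word segmentation (the WORD flag), a word already breaks after an apostrophe when a vowel follows, so «qu'» and «d'» come off on their own, but a silent «h» does not trigger that break, and «l'heure» would stay one token. The new rule adds a negative lookahead so that Case 2 of TOKEN_RE refuses to start a token on a letter followed by «'h», and a new Case 3 alternative, «\w'», picks up the stranded article. Below is a minimal runnable sketch of that trick, reduced to Cases 2 and 3 (the real TOKEN_RE also handles spaceless scripts and symbol characters); the expected outputs follow this commit's tests.

    import regex  # the third-party 'regex' module that wordfreq uses

    # A hypothetical reduced pattern, not the real TOKEN_RE: the
    # spaceless-script case is dropped and the token start is simplified to \w.
    SKETCH_RE = regex.compile(r"""
        (?!\w'[Hh])          # Case 2 must not swallow a letter followed by «'h»
        \w(?:\B\S|\p{M})* |  # Case 2: extend while Unicode sees no word break
        \w'                  # Case 3: the article left behind, such as «l'»
    """, regex.V1 | regex.WORD | regex.VERBOSE)

    print(SKETCH_RE.findall("l'heure"))  # ["l'", 'heure']: the case being fixed
    print(SKETCH_RE.findall("qu'un"))    # ["qu'", 'un']: a vowel needs no help
    print(SKETCH_RE.findall("isn't"))    # ["isn't"]: contractions survive

When include_punctuation is False, the trailing apostrophe is stripped downstream (the tests expect ['l', 'heure'] without punctuation but ["l'", 'heure'] with it).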

Changed file 1 of 2 (the test module):

@@ -3,23 +3,27 @@ from wordfreq import tokenize, word_frequency
 def test_apostrophes():
-    for lang in ('fr', 'ca', 'oc'):
-        eq_(tokenize("langues d'oïl", lang),
-            ['langues', "d", 'oïl'])
-        eq_(tokenize("langues d'oïl", lang, include_punctuation=True),
-            ['langues', "d'", 'oïl'])
-        eq_(tokenize("l'heure", lang),
-            ['l', 'heure'])
-        eq_(tokenize("l'heure", lang, include_punctuation=True),
-            ["l'", 'heure'])
-        eq_(tokenize("L'Hôpital", lang, include_punctuation=True),
-            ["l'", 'hôpital'])
-        eq_(tokenize("This isn't French", lang),
-            ['this', "isn't", 'french'])
+    # Test that we handle apostrophes in French reasonably.
+    eq_(tokenize("qu'un", 'fr'), ['qu', 'un'])
+    eq_(tokenize("qu'un", 'fr', include_punctuation=True),
+        ["qu'", "un"])
+    eq_(tokenize("langues d'oïl", 'fr'),
+        ['langues', "d", 'oïl'])
+    eq_(tokenize("langues d'oïl", 'fr', include_punctuation=True),
+        ['langues', "d'", 'oïl'])
+    eq_(tokenize("l'heure", 'fr'),
+        ['l', 'heure'])
+    eq_(tokenize("l'heure", 'fr', include_punctuation=True),
+        ["l'", 'heure'])
+    eq_(tokenize("L'Hôpital", 'fr', include_punctuation=True),
+        ["l'", 'hôpital'])
+    eq_(tokenize("This isn't French", 'en'),
+        ['this', "isn't", 'french'])
 
 
-def test_catalan():
-    # Catalan orthography is fiddly. Test that we get a short sentence right.
+def test_catastrophes():
+    # More apostrophes, but this time they're in Catalan, and there's other
+    # mid-word punctuation going on too.
     eq_(tokenize("M'acabo d'instal·lar.", 'ca'),
         ['m', 'acabo', 'd', 'instal·lar'])
     eq_(tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True),

Changed file 2 of 2 (the tokenizer module):

@@ -22,23 +22,6 @@ ABJAD_LANGUAGES = {
     'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'
 }
 
-# Languages that can stick particles such as «l'» onto a word starting with
-# a vowel sound, and where this vowel sound can follow a silent «h». These
-# are French and related languages.
-FRENCH_ISH_LANGUAGES = {
-    'fr',   # French
-    'ca',   # Catalan
-    'frp',  # Franco-Provençal or Arpitan
-    'nrf',  # Norman French / Jèrriais / Guernésiais
-    'oc',   # Occitan
-    'pcd',  # Picard
-    'wa',   # Walloon
-    'frm',  # Middle French
-    'fro',  # Old French
-}
-
 
 def _make_spaceless_expr():
     pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in SPACELESS_SCRIPTS]
     return ''.join(pieces)
@@ -76,6 +59,13 @@ TOKEN_RE = regex.compile(r"""
     # Case 2: standard Unicode segmentation
     # -------------------------------------
 
+    # The start of the token must not be a letter followed by «'h». If it is,
+    # we should use Case 3 to match up to the apostrophe, then match a new token
+    # starting with «h». This rule lets us break «l'heure» into two tokens, just
+    # like we would do for «l'arc».
+    (?!\w'[Hh])
+
     # The start of the token must be 'word-like', not punctuation or whitespace
     # or various other things. However, we allow characters of category So
     # (Symbol - Other) because many of these are emoji, which can convey
@@ -87,13 +77,22 @@ TOKEN_RE = regex.compile(r"""
     # (\S) and do not cause word breaks according to the Unicode word
     # segmentation heuristic (\B), or are categorized as Marks (\p{M}).
-    (?:\B\S|\p{M})*
+    (?:\B\S|\p{M})* |
+
+    # Case 3: Fix French
+    # ------------------
+    # This allows us to match the articles in French, Catalan, and related
+    # languages, such as «l'», that we may have excluded from being part of
+    # the token in Case 2.
+
+    \w'
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
 
 TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
     [<SPACELESS>]+ |
     [\p{punct}]+ |
-    \S(?:\B\S|\p{M})*
+    (?!\w'[Hh]) \S(?:\B\S|\p{M})* |
+    \w'
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
 
 MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
@@ -160,30 +159,6 @@ def romanian_tokenize(text, include_punctuation=False):
     ]
 
 
-def french_tokenize(text, include_punctuation=False):
-    """
-    Handle French apostrophes that precede an 'h', which should work the same as
-    before a vowel, which the Unicode Consortium forgot. "l'heure" should tokenize
-    as "l'" and "heure".
-
-    This also applies the same way to other languages such as Catalan.
-    """
-    tokens = []
-    for token in simple_tokenize(text, include_punctuation):
-        if "'h" in token:
-            idx = token.find("'h")
-            if include_punctuation:
-                # Only include the apostrophe in the token if
-                # include_punctuation is True
-                tokens.append(token[:idx + 1])
-            else:
-                tokens.append(token[:idx])
-            tokens.append(token[idx + 1:])
-        else:
-            tokens.append(token)
-    return tokens
-
-
 def tokenize_mecab_language(text, lang, include_punctuation=False):
     """
     Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
@@ -360,8 +335,6 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
         return turkish_tokenize(text, include_punctuation)
     elif lang == 'ro':
         return romanian_tokenize(text, include_punctuation)
-    elif lang in FRENCH_ISH_LANGUAGES:
-        return french_tokenize(text, include_punctuation)
     elif lang in ABJAD_LANGUAGES:
         text = remove_marks(unicodedata.normalize('NFKC', text))
         return simple_tokenize(text, include_punctuation)