Recognize "@" in gender-neutral word endings as part of the token

commit 65692c3d81
parent 7bf69595bb
Author: Robyn Speer
Date: 2018-07-02 17:36:55 -04:00

3 changed files with 153 additions and 21 deletions

tests/test_at_sign.py (new file, 101 lines)

@@ -0,0 +1,101 @@
from wordfreq import tokenize, lossy_tokenize, word_frequency


def test_gender_neutral_at():
    # Recognize the gender-neutral @ in Spanish as part of the word
    text = "La protección de los derechos de tod@s l@s trabajador@s migrantes"
    assert tokenize(text, "es") == [
        "la",
        "protección",
        "de",
        "los",
        "derechos",
        "de",
        "tod@s",
        "l@s",
        "trabajador@s",
        "migrantes"
    ]

    text = "el distrito 22@ de Barcelona"
    assert tokenize(text, 'es') == ["el", "distrito", "22@", "de", "barcelona"]
    assert lossy_tokenize(text, 'es') == ["el", "distrito", "00@", "de", "barcelona"]

    # It also appears in Portuguese
    text = "direitos e deveres para @s membr@s da comunidade virtual"
    assert tokenize(text, "pt") == [
        "direitos",
        "e",
        "deveres",
        "para",
        "@s",
        "membr@s",
        "da",
        "comunidade",
        "virtual"
    ]

    # Because this is part of our tokenization, the language code doesn't
    # actually matter, as long as it's a language with Unicode tokenization
    text = "@s membr@s da comunidade virtual"
    assert tokenize(text, "en") == ["@s", "membr@s", "da", "comunidade", "virtual"]


def test_punctuation_at():
    # If the @ appears alone in a word, we consider it to be punctuation
    text = "operadores de canal, que são aqueles que têm um @ ao lado do nick"
    assert tokenize(text, "pt") == [
        "operadores",
        "de",
        "canal",
        "que",
        "são",
        "aqueles",
        "que",
        "têm",
        "um",
        "ao",
        "lado",
        "do",
        "nick"
    ]
    assert tokenize(text, "pt", include_punctuation=True) == [
        "operadores",
        "de",
        "canal",
        ",",
        "que",
        "são",
        "aqueles",
        "que",
        "têm",
        "um",
        "@",
        "ao",
        "lado",
        "do",
        "nick"
    ]

    # If the @ is not at the end of the word or part of the word ending '@s',
    # it is also punctuation
    text = "un archivo hosts.deny que contiene la línea ALL:ALL@ALL"
    assert tokenize(text, "es") == [
        "un",
        "archivo",
        "hosts.deny",
        "que",
        "contiene",
        "la",
        "línea",
        "all:all",
        "all"
    ]

    # Make sure not to catch e-mail addresses
    text = "info@something.example"
    assert tokenize(text, "en") == [
        "info",
        "something.example"
    ]

@@ -252,7 +252,7 @@ def cedillas_to_commas(text):
     )


-def sub_zeroes(match):
+def _sub_zeroes(match):
     """
     Given a regex match, return what it matched with digits replaced by
     zeroes.
@@ -265,4 +265,4 @@ def smash_numbers(text):
     Replace sequences of multiple digits with zeroes, so we don't need to
     distinguish the frequencies of thousands of numbers.
     """
-    return MULTI_DIGIT_RE.sub(sub_zeroes, text)
+    return MULTI_DIGIT_RE.sub(_sub_zeroes, text)
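For context, _sub_zeroes is the helper behind the lossy_tokenize behavior tested above ("22@" becoming "00@"). A minimal self-contained sketch of that mechanism, with a stand-in pattern since MULTI_DIGIT_RE itself is not shown in this diff:

import re

MULTI_DIGIT_RE = re.compile(r"\d\d+")  # stand-in; wordfreq's actual pattern may differ

def _sub_zeroes(match):
    # Return what the regex matched, with every digit replaced by a zero.
    return re.sub(r"\d", "0", match.group(0))

def smash_numbers(text):
    return MULTI_DIGIT_RE.sub(_sub_zeroes, text)

print(smash_numbers("el distrito 22@ de Barcelona"))  # el distrito 00@ de Barcelona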

@@ -48,10 +48,28 @@ TOKEN_RE = regex.compile(r"""
     # <SPACELESS> will be replaced by the complex range expression made by
     # _make_spaceless_expr().

-    [<SPACELESS>]+ |
+    [<SPACELESS>]+
+    |

-    # Case 2: standard Unicode segmentation
-    # -------------------------------------
+    # Case 2: Gender-neutral "@s"
+    # ---------------------------
+    #
+    # "@" and "@s" are gender-neutral word endings that can replace -a, -o,
+    # -as, and -os in Spanish, Portuguese, and occasionally Italian.
+    #
+    # This doesn't really conflict with other uses of the @ sign, so we simply
+    # recognize these endings as being part of the token in any language.
+    #
+    # We will recognize the endings as part of our main rule for recognizing
+    # words, which is Case 3 below. However, one case that remains separate is
+    # the Portuguese word "@s" itself, standing for the article "as" or "os".
+    # This must be followed by a word break (\b).
+
+    @s \b
+    |
+
+    # Case 3: Unicode segmentation with tweaks
+    # ----------------------------------------

     # The start of the token must be 'word-like', not punctuation or whitespace
     # or various other things. However, we allow characters of category So
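A side note on why the standalone "@s" needs its own case: "@" is in Unicode category Po (punctuation), so it presumably can never satisfy the word-like start requirement (?=[\w\p{So}]) of Case 3. A quick check with the regex module, the same one wordfreq uses:

import regex

# "@" is neither a word character nor a Symbol-Other character, so no token
# can *start* with it under Case 3; the separate "@s \b" alternative covers
# the standalone Portuguese article.
print(regex.match(r'[\w\p{So}]', 'a'))  # matches
print(regex.match(r'[\w\p{So}]', '@'))  # None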
@@ -68,29 +86,41 @@ TOKEN_RE = regex.compile(r"""
     (?!\w'[Hh])

     # The entire token is made of graphemes (\X). Matching by graphemes means
-    # that we don't have to specially account for marks or ZWJ sequences.
+    # that we don't have to specially account for marks or ZWJ sequences. We use
+    # a non-greedy match so that we can control where the match ends in the
+    # following expression.
     #
-    # The token ends as soon as it encounters a word break (\b). We use the
-    # non-greedy match (+?) to make sure to end at the first word break we
-    # encounter.
-    \X+? \b |
-
     # If we were matching by codepoints (.) instead of graphemes (\X), then
     # detecting boundaries would be more difficult. Here's a fact that's subtle
     # and poorly documented: a position that's between codepoints, but in the
     # middle of a grapheme, does not match as a word break (\b), but also does
     # not match as not-a-word-break (\B). The word boundary algorithm simply
     # doesn't apply in such a position.
+    \X+?
+
+    # The token ends when it encounters a word break (\b). We use the
+    # non-greedy match (+?) to make sure to end at the first word break we
+    # encounter.
     #
-    # We used to match the rest of the token using \S, which matches non-space
-    # *codepoints*, and this caused us to incompletely work around cases where
-    # it left off in the middle of a grapheme.
+    # We need a special case for gender-neutral "@", which is acting as a
+    # letter, but Unicode considers it to be a symbol and would break words
+    # around it. We prefer continuing the token with "@" or "@s" over matching
+    # a word break.
     #
+    # As in case 2, this is only allowed at the end of the word. Unfortunately,
+    # we can't use the word-break expression \b in this case, because "@"
+    # already is a word break according to Unicode. Instead, we use a negative
+    # lookahead assertion to ensure that the next character is not word-like.
+    (?:
+        @s? (?!\w) | \b
+    )
+    |
+
     # Another subtle fact: the "non-breaking space" U+A0 counts as a word break
     # here. That's surprising, but it's also what we want, because we don't want
     # any kind of spaces in the middle of our tokens.

-    # Case 3: Fix French
+    # Case 4: Fix French
     # ------------------
     # This allows us to match the articles in French, Catalan, and related
     # languages, such as «l'», that we may have excluded from being part of
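To see what the new ending buys, Case 3 can be compiled on its own. This is a sketch for illustration, using the same flags as TOKEN_RE; wordfreq does not expose the case separately:

import regex

# Case 3 in isolation: word-like start, grapheme body, and the new ending
# that prefers "@"/"@s" over a plain word break.
CASE3 = regex.compile(r"""
    (?=[\w\p{So}]) (?!\w'[Hh]) \X+? (?: @s? (?!\w) | \b )
""", regex.V1 | regex.WORD | regex.VERBOSE)

print(CASE3.match("trabajador@s migrantes").group(0))   # trabajador@s
print(CASE3.match("info@something.example").group(0))   # info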
@@ -100,13 +130,14 @@ TOKEN_RE = regex.compile(r"""
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)

 TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
-    # This expression is similar to the expression above, but also matches any
-    # sequence of punctuation characters.
+    # This expression is similar to the expression above. It adds a case between
+    # 2 and 3 that matches any sequence of punctuation characters.

-    [<SPACELESS>]+ |
-    [\p{punct}]+ |
-    (?=[\w\p{So}]) (?!\w'[Hh]) \X+? \b |
-    \w'
+    [<SPACELESS>]+ |                                         # Case 1
+    @s \b |                                                  # Case 2
+    [\p{punct}]+ |                                           # punctuation
+    (?=[\w\p{So}]) (?!\w'[Hh]) \X+? (?: @s? (?!\w) | \b) |   # Case 3
+    \w'                                                      # Case 4
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)