Recognize "@" in gender-neutral word endings as part of the token

commit b2d242e8bf
parent ca9cf7d90f
Rob Speer, 2018-07-02 17:36:55 -04:00

3 changed files with 153 additions and 21 deletions

tests/test_at_sign.py (new file)

@@ -0,0 +1,101 @@
from wordfreq import tokenize, lossy_tokenize, word_frequency


def test_gender_neutral_at():
    # Recognize the gender-neutral @ in Spanish as part of the word
    text = "La protección de los derechos de tod@s l@s trabajador@s migrantes"
    assert tokenize(text, "es") == [
        "la",
        "protección",
        "de",
        "los",
        "derechos",
        "de",
        "tod@s",
        "l@s",
        "trabajador@s",
        "migrantes"
    ]

    text = "el distrito 22@ de Barcelona"
    assert tokenize(text, "es") == ["el", "distrito", "22@", "de", "barcelona"]
    assert lossy_tokenize(text, "es") == ["el", "distrito", "00@", "de", "barcelona"]

    # It also appears in Portuguese
    text = "direitos e deveres para @s membr@s da comunidade virtual"
    assert tokenize(text, "pt") == [
        "direitos",
        "e",
        "deveres",
        "para",
        "@s",
        "membr@s",
        "da",
        "comunidade",
        "virtual"
    ]

    # Because this is part of our tokenization, the language code doesn't
    # actually matter, as long as it's a language with Unicode tokenization
    text = "@s membr@s da comunidade virtual"
    assert tokenize(text, "en") == ["@s", "membr@s", "da", "comunidade", "virtual"]


def test_punctuation_at():
    # If the @ appears alone in a word, we consider it to be punctuation
    text = "operadores de canal, que são aqueles que têm um @ ao lado do nick"
    assert tokenize(text, "pt") == [
        "operadores",
        "de",
        "canal",
        "que",
        "são",
        "aqueles",
        "que",
        "têm",
        "um",
        "ao",
        "lado",
        "do",
        "nick"
    ]

    assert tokenize(text, "pt", include_punctuation=True) == [
        "operadores",
        "de",
        "canal",
        ",",
        "que",
        "são",
        "aqueles",
        "que",
        "têm",
        "um",
        "@",
        "ao",
        "lado",
        "do",
        "nick"
    ]

    # If the @ is not at the end of the word or part of the word ending '@s',
    # it is also punctuation
    text = "un archivo hosts.deny que contiene la línea ALL:ALL@ALL"
    assert tokenize(text, "es") == [
        "un",
        "archivo",
        "hosts.deny",
        "que",
        "contiene",
        "la",
        "línea",
        "all:all",
        "all"
    ]

    # Make sure not to catch e-mail addresses
    text = "info@something.example"
    assert tokenize(text, "en") == [
        "info",
        "something.example"
    ]
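
These tests double as usage examples. A quick interactive check of the same behavior, assuming a wordfreq build that includes this commit:

    from wordfreq import tokenize, lossy_tokenize

    print(tokenize("tod@s l@s trabajador@s", "es"))
    # ['tod@s', 'l@s', 'trabajador@s']

    print(lossy_tokenize("el distrito 22@ de Barcelona", "es"))
    # ['el', 'distrito', '00@', 'de', 'barcelona']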

@@ -252,7 +252,7 @@ def cedillas_to_commas(text):
     )
 
-def sub_zeroes(match):
+def _sub_zeroes(match):
     """
     Given a regex match, return what it matched with digits replaced by
     zeroes.
@@ -265,4 +265,4 @@ def smash_numbers(text):
     Replace sequences of multiple digits with zeroes, so we don't need to
     distinguish the frequencies of thousands of numbers.
     """
-    return MULTI_DIGIT_RE.sub(sub_zeroes, text)
+    return MULTI_DIGIT_RE.sub(_sub_zeroes, text)
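
For context, here is a minimal sketch of the number-smashing behavior around the renamed helper. The pattern assigned to MULTI_DIGIT_RE below is an assumption for illustration, not wordfreq's actual definition:

    import regex

    # Assumed stand-in for wordfreq's MULTI_DIGIT_RE: runs of two or more digits.
    MULTI_DIGIT_RE = regex.compile(r"\d\d+")

    def _sub_zeroes(match):
        # Replace every digit in the match with a zero.
        return regex.sub(r"\d", "0", match.group(0))

    def smash_numbers(text):
        # Distinct multi-digit numbers all map to the same "smashed" form.
        return MULTI_DIGIT_RE.sub(_sub_zeroes, text)

    print(smash_numbers("el distrito 22@ de Barcelona"))
    # el distrito 00@ de Barcelona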

@@ -48,10 +48,28 @@ TOKEN_RE = regex.compile(r"""
     # <SPACELESS> will be replaced by the complex range expression made by
     # _make_spaceless_expr().
-    [<SPACELESS>]+ |
+    [<SPACELESS>]+
+    |
 
-    # Case 2: standard Unicode segmentation
-    # -------------------------------------
+    # Case 2: Gender-neutral "@s"
+    # ---------------------------
+    #
+    # "@" and "@s" are gender-neutral word endings that can replace -a, -o,
+    # -as, and -os in Spanish, Portuguese, and occasionally Italian.
+    #
+    # This doesn't really conflict with other uses of the @ sign, so we simply
+    # recognize these endings as being part of the token in any language.
+    #
+    # We will recognize the endings as part of our main rule for recognizing
+    # words, which is Case 3 below. However, one case that remains separate is
+    # the Portuguese word "@s" itself, standing for the article "as" or "os".
+    # This must be followed by a word break (\b).
+
+    @s \b
+    |
+
+    # Case 3: Unicode segmentation with tweaks
+    # ----------------------------------------
 
     # The start of the token must be 'word-like', not punctuation or whitespace
     # or various other things. However, we allow characters of category So
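
The effect of Cases 2 and 3 can be sketched with a heavily reduced pattern. This is an illustration only, not wordfreq's full TOKEN_RE:

    import regex

    # Reduced sketch: "@s" alone is a token (Case 2), and a word may keep a
    # trailing "@" or "@s" when no word character follows (Case 3's ending).
    TOY_TOKEN_RE = regex.compile(r"""
        @s \b                  # Case 2: the standalone article "@s"
        |
        \w+ (?: @s? (?!\w) )?  # stand-in for Case 3: a word, optionally ending in "@"/"@s"
    """, regex.VERBOSE)

    print(TOY_TOKEN_RE.findall("@s membr@s da comunidade"))
    # ['@s', 'membr@s', 'da', 'comunidade']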
@@ -68,29 +86,41 @@ TOKEN_RE = regex.compile(r"""
     (?!\w'[Hh])
 
     # The entire token is made of graphemes (\X). Matching by graphemes means
-    # that we don't have to specially account for marks or ZWJ sequences.
-    #
-    # The token ends as soon as it encounters a word break (\b). We use the
-    # non-greedy match (+?) to make sure to end at the first word break we
-    # encounter.
-    \X+? \b |
-
+    # that we don't have to specially account for marks or ZWJ sequences. We use
+    # a non-greedy match so that we can control where the match ends in the
+    # following expression.
+    #
     # If we were matching by codepoints (.) instead of graphemes (\X), then
     # detecting boundaries would be more difficult. Here's a fact that's subtle
     # and poorly documented: a position that's between codepoints, but in the
     # middle of a grapheme, does not match as a word break (\b), but also does
     # not match as not-a-word-break (\B). The word boundary algorithm simply
     # doesn't apply in such a position.
+    \X+?
+
+    # The token ends when it encounters a word break (\b). We use the
+    # non-greedy match (+?) to make sure to end at the first word break we
+    # encounter.
     #
-    # We used to match the rest of the token using \S, which matches non-space
-    # *codepoints*, and this caused us to incompletely work around cases where
-    # it left off in the middle of a grapheme.
+    # We need a special case for gender-neutral "@", which is acting as a
+    # letter, but Unicode considers it to be a symbol and would break words
+    # around it. We prefer continuing the token with "@" or "@s" over matching
+    # a word break.
     #
+    # As in case 2, this is only allowed at the end of the word. Unfortunately,
+    # we can't use the word-break expression \b in this case, because "@"
+    # already is a word break according to Unicode. Instead, we use a negative
+    # lookahead assertion to ensure that the next character is not word-like.
+    (?:
+        @s? (?!\w) | \b
+    )
+    |
+
     # Another subtle fact: the "non-breaking space" U+A0 counts as a word break
     # here. That's surprising, but it's also what we want, because we don't want
     # any kind of spaces in the middle of our tokens.
 
-    # Case 3: Fix French
+    # Case 4: Fix French
     # ------------------
     # This allows us to match the articles in French, Catalan, and related
     # languages, such as «l'», that we may have excluded from being part of
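
A quick demonstration of the problem that the negative lookahead solves, using simplified patterns rather than the real TOKEN_RE:

    import regex

    word = "tod@s"

    # "@" is a word break by Unicode rules, so a \b ending stops too early:
    print(regex.match(r"\w+?\b", word).group())
    # tod

    # The negative-lookahead ending lets the token keep its "@s":
    print(regex.match(r"\w+? (?: @s? (?!\w) | \b )", word, regex.VERBOSE).group())
    # tod@s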
@@ -100,13 +130,14 @@ TOKEN_RE = regex.compile(r"""
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
 
 TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
-    # This expression is similar to the expression above, but also matches any
-    # sequence of punctuation characters.
-    [<SPACELESS>]+ |
-    [\p{punct}]+ |
-    (?=[\w\p{So}]) (?!\w'[Hh]) \X+? \b |
-    \w'
+    # This expression is similar to the expression above. It adds a case between
+    # 2 and 3 that matches any sequence of punctuation characters.
+    [<SPACELESS>]+ |                                        # Case 1
+    @s \b |                                                 # Case 2
+    [\p{punct}]+ |                                          # punctuation
+    (?=[\w\p{So}]) (?!\w'[Hh]) \X+? (?: @s? (?!\w) | \b) |  # Case 3
+    \w'                                                     # Case 4
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)