# wordfreq/tests/test_at_sign.py
# 2018-07-15 01:01:35 -04:00

# 110 lines
# 2.7 KiB
# Python

from wordfreq import tokenize, lossy_tokenize, word_frequency
def test_gender_neutral_at():
    """Check that a word-final '@' or '@s' stays attached to its token.

    Covers the gender-neutral '@' in Spanish and Portuguese text, a
    trailing '@' after digits ("22@"), and the same input tokenized
    under an English language code.
    """
    # Recognize the gender-neutral @ in Spanish as part of the word
    text = "La protección de los derechos de tod@s l@s trabajador@s migrantes"
    assert tokenize(text, "es") == [
        "la",
        "protección",
        "de",
        "los",
        "derechos",
        "de",
        "tod@s",
        "l@s",
        "trabajador@s",
        "migrantes",
    ]

    # A trailing @ after digits is kept as well; the lossy tokenizer is
    # expected to turn the digits into zeros while leaving the @ in place.
    text = "el distrito 22@ de Barcelona"
    assert tokenize(text, "es") == ["el", "distrito", "22@", "de", "barcelona"]
    assert lossy_tokenize(text, "es") == ["el", "distrito", "00@", "de", "barcelona"]

    # It also appears in Portuguese
    text = "direitos e deveres para @s membr@s da comunidade virtual"
    assert tokenize(text, "pt") == [
        "direitos",
        "e",
        "deveres",
        "para",
        "@s",
        "membr@s",
        "da",
        "comunidade",
        "virtual",
    ]

    # Because this is part of our tokenization, the language code doesn't
    # actually matter, as long as it's a language with Unicode tokenization
    text = "@s membr@s da comunidade virtual"
    assert tokenize(text, "en") == ["@s", "membr@s", "da", "comunidade", "virtual"]
def test_at_in_corpus():
    """Check that "l@s" is looked up as its own word in the Spanish data."""
    at_word_freq = word_frequency("l@s", "es")

    # We have a word frequency for "l@s"
    assert at_word_freq > 0

    # It's not just treated as a word break: the frequency must be lower
    # than the one reported for the two separate tokens "l" and "s".
    assert at_word_freq < word_frequency("l s", "es")
def test_punctuation_at():
    """Check the cases where '@' is treated as punctuation, not a letter.

    A free-standing '@' is dropped by default and only appears as its own
    token when ``include_punctuation=True``. An '@' in the middle of a
    word (including an e-mail address) splits the word into two tokens.
    """
    # If the @ appears alone in a word, we consider it to be punctuation
    text = "operadores de canal, que são aqueles que têm um @ ao lado do nick"
    assert tokenize(text, "pt") == [
        "operadores",
        "de",
        "canal",
        "que",
        "são",
        "aqueles",
        "que",
        "têm",
        "um",
        "ao",
        "lado",
        "do",
        "nick",
    ]
    assert tokenize(text, "pt", include_punctuation=True) == [
        "operadores",
        "de",
        "canal",
        ",",
        "que",
        "são",
        "aqueles",
        "que",
        "têm",
        "um",
        "@",
        "ao",
        "lado",
        "do",
        "nick",
    ]

    # If the @ is not at the end of the word or part of the word ending '@s',
    # it is also punctuation
    text = "un archivo hosts.deny que contiene la línea ALL:ALL@ALL"
    assert tokenize(text, "es") == [
        "un",
        "archivo",
        "hosts.deny",
        "que",
        "contiene",
        "la",
        "línea",
        "all:all",
        "all",
    ]

    # Make sure not to catch e-mail addresses
    text = "info@something.example"
    assert tokenize(text, "en") == [
        "info",
        "something.example",
    ]