# Mirror of https://github.com/rspeer/wordfreq.git
# Synced 2024-12-23 09:21:37 +00:00
# 107 lines, 2.7 KiB, Python
from wordfreq import tokenize, lossy_tokenize, word_frequency
|
|
|
|
|
|
def test_gender_neutral_at():
    """Check that a gender-neutral "@" stays inside the token it belongs to."""
    # Spanish: an "@" inside a word ending is kept as part of that word.
    spanish = "La protección de los derechos de tod@s l@s trabajador@s migrantes"
    spanish_tokens = [
        "la",
        "protección",
        "de",
        "los",
        "derechos",
        "de",
        "tod@s",
        "l@s",
        "trabajador@s",
        "migrantes",
    ]
    assert tokenize(spanish, "es") == spanish_tokens

    # A word-final "@" is kept too, by both tokenize and lossy_tokenize.
    district = "el distrito 22@ de Barcelona"
    district_tokens = ["el", "distrito", "22@", "de", "barcelona"]
    assert tokenize(district, "es") == district_tokens
    assert lossy_tokenize(district, "es") == district_tokens

    # The same convention shows up in Portuguese.
    portuguese = "direitos e deveres para @s membr@s da comunidade virtual"
    portuguese_tokens = [
        "direitos",
        "e",
        "deveres",
        "para",
        "@s",
        "membr@s",
        "da",
        "comunidade",
        "virtual",
    ]
    assert tokenize(portuguese, "pt") == portuguese_tokens

    # The handling lives in the tokenizer itself, so the language code does
    # not actually matter as long as the language uses Unicode tokenization.
    fragment = "@s membr@s da comunidade virtual"
    fragment_tokens = ["@s", "membr@s", "da", "comunidade", "virtual"]
    assert tokenize(fragment, "en") == fragment_tokens
|
|
|
|
|
|
def test_at_in_corpus():
    """Check that "l@s" has a Spanish frequency of its own."""
    # There is a word frequency recorded for "l@s".
    assert 0 < word_frequency("l@s", "es")

    # The "@" is not just treated as a word break: the frequency of "l@s"
    # must be lower than the frequency of the split form "l s".
    at_form_freq = word_frequency("l@s", "es")
    split_form_freq = word_frequency("l s", "es")
    assert at_form_freq < split_form_freq
|
|
|
|
|
|
def test_punctuation_at():
    """Check the cases where "@" is treated as punctuation, not as a letter."""
    # An "@" standing alone as a word is considered punctuation, so it is
    # absent from the default token list.
    irc_sentence = "operadores de canal, que são aqueles que têm um @ ao lado do nick"
    words_only = [
        "operadores",
        "de",
        "canal",
        "que",
        "são",
        "aqueles",
        "que",
        "têm",
        "um",
        "ao",
        "lado",
        "do",
        "nick",
    ]
    assert tokenize(irc_sentence, "pt") == words_only

    # With include_punctuation=True, both "," and the lone "@" are expected
    # as separate tokens.
    words_and_punctuation = [
        "operadores",
        "de",
        "canal",
        ",",
        "que",
        "são",
        "aqueles",
        "que",
        "têm",
        "um",
        "@",
        "ao",
        "lado",
        "do",
        "nick",
    ]
    assert (
        tokenize(irc_sentence, "pt", include_punctuation=True)
        == words_and_punctuation
    )

    # An "@" that is neither at the end of the word nor part of the word
    # ending '@s' is also punctuation.
    hosts_sentence = "un archivo hosts.deny que contiene la línea ALL:ALL@ALL"
    hosts_tokens = [
        "un",
        "archivo",
        "hosts.deny",
        "que",
        "contiene",
        "la",
        "línea",
        "all:all",
        "all",
    ]
    assert tokenize(hosts_sentence, "es") == hosts_tokens

    # Make sure e-mail addresses are not caught: the "@" splits the address.
    email = "info@something.example"
    assert tokenize(email, "en") == ["info", "something.example"]
|