# wordfreq/tests/test_at_sign.py
# Tests for how the '@' character is handled in tokenization and word
# frequencies (gender-neutral Spanish/Portuguese endings vs. punctuation).

from wordfreq import lossy_tokenize, tokenize, word_frequency
def test_gender_neutral_at():
    """The gender-neutral @ ending (as in 'tod@s') stays part of the word.

    This is handled by wordfreq's own tokenizer, so it applies regardless
    of which language code is passed, and survives lossy_tokenize.
    """
    # Recognize the gender-neutral @ in Spanish as part of the word
    text = "La protección de los derechos de tod@s l@s trabajador@s migrantes"
    assert tokenize(text, "es") == [
        "la",
        "protección",
        "de",
        "los",
        "derechos",
        "de",
        "tod@s",
        "l@s",
        "trabajador@s",
        "migrantes",
    ]

    # "22@" is a neighborhood of Barcelona; the @ is word-final here, and
    # lossy_tokenize must not smash it either
    text = "el distrito 22@ de Barcelona"
    assert tokenize(text, "es") == ["el", "distrito", "22@", "de", "barcelona"]
    assert lossy_tokenize(text, "es") == ["el", "distrito", "22@", "de", "barcelona"]

    # It also appears in Portuguese
    text = "direitos e deveres para @s membr@s da comunidade virtual"
    assert tokenize(text, "pt") == [
        "direitos",
        "e",
        "deveres",
        "para",
        "@s",
        "membr@s",
        "da",
        "comunidade",
        "virtual",
    ]

    # Because this is part of our tokenization, the language code doesn't
    # actually matter, as long as it's a language with Unicode tokenization
    text = "@s membr@s da comunidade virtual"
    assert tokenize(text, "en") == ["@s", "membr@s", "da", "comunidade", "virtual"]
def test_at_in_corpus():
    """Words written with @ endings have real entries in the frequency data."""
    # We have a word frequency for "l@s"
    assert word_frequency("l@s", "es") > 0

    # It's not just treated as a word break: "l@s" as one token is rarer
    # than the two separate words "l" and "s" would be
    assert word_frequency("l@s", "es") < word_frequency("l s", "es")
def test_punctuation_at():
    """An @ that is not a gender-neutral word ending counts as punctuation.

    Covers three cases: a lone @ (IRC operator marker), an @ in the middle
    of a token (ALL@ALL), and the @ of an e-mail address — none of these
    should be glued into a word the way 'tod@s' is.
    """
    # If the @ appears alone in a word, we consider it to be punctuation
    text = "operadores de canal, que são aqueles que têm um @ ao lado do nick"
    assert tokenize(text, "pt") == [
        "operadores",
        "de",
        "canal",
        "que",
        "são",
        "aqueles",
        "que",
        "têm",
        "um",
        "ao",
        "lado",
        "do",
        "nick",
    ]

    # With include_punctuation=True, the comma and the lone @ are kept as
    # their own tokens
    assert tokenize(text, "pt", include_punctuation=True) == [
        "operadores",
        "de",
        "canal",
        ",",
        "que",
        "são",
        "aqueles",
        "que",
        "têm",
        "um",
        "@",
        "ao",
        "lado",
        "do",
        "nick",
    ]

    # If the @ is not at the end of the word or part of the word ending '@s',
    # it is also punctuation
    text = "un archivo hosts.deny que contiene la línea ALL:ALL@ALL"
    assert tokenize(text, "es") == [
        "un",
        "archivo",
        "hosts.deny",
        "que",
        "contiene",
        "la",
        "línea",
        "all:all",
        "all",
    ]

    # Make sure not to catch e-mail addresses
    text = "info@something.example"
    assert tokenize(text, "en") == ["info", "something.example"]