Merge pull request #60 from LuminosoInsight/gender-neutral-at

Recognize "@" in gender-neutral word endings as part of the token
Lance Nathan 2018-07-24 18:16:31 -04:00 committed by GitHub
commit 2f8600e975
56 changed files with 36675 additions and 35955 deletions

View File

@@ -1,3 +1,24 @@
## Version 2.2 (2018-07-24)
Library change:
- While the @ sign is usually considered a symbol and not part of a word, there
is a case where it acts like a letter. It's used in one way of writing
gender-neutral words in Spanish and Portuguese, such as "l@s niñ@s". The
tokenizer in wordfreq will now allow words to end with "@" or "@s", so it
can recognize these words.
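For example, the tokenizer now keeps these words intact (mirroring the
doctest added to the README in this same commit):

>>> from wordfreq import tokenize
>>> tokenize('l@s niñ@s', 'es')
['l@s', 'niñ@s']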
Data changes:
- Updated the data from Exquisite Corpus to filter the ParaCrawl web crawl
better. ParaCrawl provides two metrics (Zipporah and Bicleaner) for the
goodness of its data, and we now filter it to only use texts that get
positive scores on both metrics.
- The input data includes the change to tokenization described above, giving
us word frequencies for words such as "l@s".
## Version 2.1 (2018-06-18)
Data changes:

View File

@@ -48,13 +48,13 @@ frequency as a decimal between 0 and 1.
1.07e-05
>>> word_frequency('café', 'en')
-5.89e-06
+5.75e-06
>>> word_frequency('cafe', 'fr')
1.51e-06
>>> word_frequency('café', 'fr')
-5.25e-05
+5.13e-05
`zipf_frequency` is a variation on `word_frequency` that aims to return the
@@ -78,10 +78,10 @@ one occurrence per billion words.
5.29
>>> zipf_frequency('frequency', 'en')
-4.42
+4.43
>>> zipf_frequency('zipf', 'en')
-1.55
+1.57
>>> zipf_frequency('zipf', 'en', wordlist='small')
0.0
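A Zipf value is the base-10 logarithm of the word's frequency per billion
words, so the two functions relate directly. A minimal sketch of that
conversion (the rounding to two decimals is an assumption about the
displayed precision):

>>> import math
>>> round(math.log10(word_frequency('frequency', 'en')) + 9, 2)
4.43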
@@ -276,7 +276,8 @@ produces tokens that follow the recommendations in [Unicode
Annex #29, Text Segmentation][uax29], including the optional rule that
splits words between apostrophes and vowels.
-There are language-specific exceptions:
+There are exceptions where we change the tokenization to work better
+with certain languages:
- In Arabic and Hebrew, it additionally normalizes ligatures and removes
combining marks.
@@ -288,11 +289,21 @@ There are language-specific exceptions:
- In Chinese, it uses the external Python library `jieba`, another optional
dependency.
- While the @ sign is usually considered a symbol and not part of a word,
wordfreq will allow a word to end with "@" or "@s". This is one way of
writing gender-neutral words in Spanish and Portuguese.
[uax29]: http://unicode.org/reports/tr29/
When wordfreq's frequency lists are built in the first place, the words are
tokenized according to this function.
>>> from wordfreq import tokenize
>>> tokenize('l@s niñ@s', 'es')
['l@s', 'niñ@s']
>>> zipf_frequency('l@s', 'es')
2.8
Because tokenization in the real world is far from consistent, wordfreq will
also try to deal gracefully when you query it with texts that actually break
into multiple tokens:
@@ -300,7 +311,7 @@ into multiple tokens:
>>> zipf_frequency('New York', 'en')
5.28
>>> zipf_frequency('北京地铁', 'zh') # "Beijing Subway"
-3.57
+3.61
The word frequencies are combined with the half-harmonic-mean function in order
to provide an estimate of what their combined frequency would be. In Chinese,
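A minimal sketch of that combination step, assuming the half-harmonic mean
of n frequencies is the reciprocal of the sum of their reciprocals (the
function name is illustrative, not part of wordfreq's public API):

>>> def half_harmonic_mean(freqs):
...     # For n equal values f this yields f / n; the rarest word
...     # dominates the estimate, as it should.
...     return 1 / sum(1 / f for f in freqs)
>>> half_harmonic_mean([0.01, 0.01])
0.005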

View File

@@ -35,7 +35,7 @@ if sys.version_info < (3, 4):
setup(
name="wordfreq",
-version='2.1.0',
+version='2.2.0',
maintainer='Luminoso Technologies, Inc.',
maintainer_email='info@luminoso.com',
url='http://github.com/LuminosoInsight/wordfreq/',

tests/test_at_sign.py (new file, 109 lines)
View File

@@ -0,0 +1,109 @@
from wordfreq import tokenize, lossy_tokenize, word_frequency
def test_gender_neutral_at():
# Recognize the gender-neutral @ in Spanish as part of the word
text = "La protección de los derechos de tod@s l@s trabajador@s migrantes"
assert tokenize(text, "es") == [
"la",
"protección",
"de",
"los",
"derechos",
"de",
"tod@s",
"l@s",
"trabajador@s",
"migrantes"
]
text = "el distrito 22@ de Barcelona"
assert tokenize(text, 'es') == ["el", "distrito", "22@", "de", "barcelona"]
assert lossy_tokenize(text, 'es') == ["el", "distrito", "00@", "de", "barcelona"]
# It also appears in Portuguese
text = "direitos e deveres para @s membr@s da comunidade virtual"
assert tokenize(text, "pt") == [
"direitos",
"e",
"deveres",
"para",
"@s",
"membr@s",
"da",
"comunidade",
"virtual"
]
# Because this is part of our tokenization, the language code doesn't
# actually matter, as long as it's a language with Unicode tokenization
text = "@s membr@s da comunidade virtual"
assert tokenize(text, "en") == ["@s", "membr@s", "da", "comunidade", "virtual"]
def test_at_in_corpus():
# We have a word frequency for "l@s"
assert word_frequency('l@s', 'es') > 0
# It's not just treated as a word break
assert word_frequency('l@s', 'es') < word_frequency('l s', 'es')
def test_punctuation_at():
# If the @ appears alone in a word, we consider it to be punctuation
text = "operadores de canal, que são aqueles que têm um @ ao lado do nick"
assert tokenize(text, "pt") == [
"operadores",
"de",
"canal",
"que",
"são",
"aqueles",
"que",
"têm",
"um",
"ao",
"lado",
"do",
"nick"
]
assert tokenize(text, "pt", include_punctuation=True) == [
"operadores",
"de",
"canal",
",",
"que",
"são",
"aqueles",
"que",
"têm",
"um",
"@",
"ao",
"lado",
"do",
"nick"
]
# If the @ is not at the end of the word or part of the word ending '@s',
# it is also punctuation
text = "un archivo hosts.deny que contiene la línea ALL:ALL@ALL"
assert tokenize(text, "es") == [
"un",
"archivo",
"hosts.deny",
"que",
"contiene",
"la",
"línea",
"all:all",
"all"
]
# Make sure not to catch e-mail addresses
text = "info@something.example"
assert tokenize(text, "en") == [
"info",
"something.example"
]

View File

@@ -59,7 +59,7 @@ def test_tokens():
def test_combination():
xiexie_freq = word_frequency('谢谢', 'zh') # "Thanks"
-assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20)
+assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20, rel=0.01)
def test_alternate_codes():
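The factor of 20 in test_combination above is consistent with the
half-harmonic mean of two equal frequencies (f / 2) multiplied by a penalty
of 10 for the word break that had to be inferred between the two Chinese
tokens. The penalty constant is our reading of the ratio, not something
stated in this diff:

from fractions import Fraction

f = Fraction(1, 10000)                # stand-in frequency for '谢谢'
half_harmonic = 1 / (1 / f + 1 / f)   # two equal tokens -> f / 2
assumed_penalty = 10                  # assumed cost of one inferred break
assert half_harmonic / assumed_penalty == f / 20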

File diff suppressed because it is too large

48 binary files changed but not shown.

View File

@@ -252,7 +252,7 @@ def cedillas_to_commas(text):
)
-def sub_zeroes(match):
+def _sub_zeroes(match):
"""
Given a regex match, return what it matched with digits replaced by
zeroes.
@@ -265,4 +265,4 @@ def smash_numbers(text):
Replace sequences of multiple digits with zeroes, so we don't need to
distinguish the frequencies of thousands of numbers.
"""
-return MULTI_DIGIT_RE.sub(sub_zeroes, text)
+return MULTI_DIGIT_RE.sub(_sub_zeroes, text)
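A self-contained sketch of this digit-smashing step, matching the
"22@" -> "00@" behavior exercised by lossy_tokenize in the tests above.
The exact pattern behind MULTI_DIGIT_RE is an assumption here:

import re

# Assumed stand-in for wordfreq's MULTI_DIGIT_RE: two or more digits in a row.
MULTI_DIGIT_RE = re.compile(r"\d\d+")

def _sub_zeroes(match):
    # Return what the regex matched, with every digit replaced by a zero.
    return re.sub(r"\d", "0", match.group(0))

def smash_numbers(text):
    # Collapse "22@" to "00@" so thousands of numbers share one frequency.
    return MULTI_DIGIT_RE.sub(_sub_zeroes, text)

print(smash_numbers("el distrito 22@ de Barcelona"))
# el distrito 00@ de Barcelona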

View File

@@ -48,10 +48,28 @@ TOKEN_RE = regex.compile(r"""
# <SPACELESS> will be replaced by the complex range expression made by
# _make_spaceless_expr().
[<SPACELESS>]+ |
[<SPACELESS>]+
|
# Case 2: standard Unicode segmentation
# -------------------------------------
# Case 2: Gender-neutral "@s"
# ---------------------------
#
# "@" and "@s" are gender-neutral word endings that can replace -a, -o,
# -as, and -os in Spanish, Portuguese, and occasionally Italian.
#
# This doesn't really conflict with other uses of the @ sign, so we simply
# recognize these endings as being part of the token in any language.
#
# We will recognize the endings as part of our main rule for recognizing
# words, which is Case 3 below. However, one case that remains separate is
# the Portuguese word "@s" itself, standing for the article "as" or "os".
# This must be followed by a word break (\b).
@s \b
|
# Case 3: Unicode segmentation with tweaks
# ----------------------------------------
# The start of the token must be 'word-like', not punctuation or whitespace
# or various other things. However, we allow characters of category So
@@ -68,29 +86,41 @@ TOKEN_RE = regex.compile(r"""
(?!\w'[Hh])
# The entire token is made of graphemes (\X). Matching by graphemes means
# that we don't have to specially account for marks or ZWJ sequences.
# that we don't have to specially account for marks or ZWJ sequences. We use
# a non-greedy match so that we can control where the match ends in the
# following expression.
#
# The token ends as soon as it encounters a word break (\b). We use the
# non-greedy match (+?) to make sure to end at the first word break we
# encounter.
\X+? \b |
# If we were matching by codepoints (.) instead of graphemes (\X), then
# detecting boundaries would be more difficult. Here's a fact that's subtle
# and poorly documented: a position that's between codepoints, but in the
# middle of a grapheme, does not match as a word break (\b), but also does
# not match as not-a-word-break (\B). The word boundary algorithm simply
# doesn't apply in such a position.
\X+?
# The token ends when it encounters a word break (\b). We use the
# non-greedy match (+?) to make sure to end at the first word break we
# encounter.
#
# We used to match the rest of the token using \S, which matches non-space
# *codepoints*, and this caused us to incompletely work around cases where
# it left off in the middle of a grapheme.
# We need a special case for gender-neutral "@", which is acting as a
# letter, but Unicode considers it to be a symbol and would break words
# around it. We prefer continuing the token with "@" or "@s" over matching
# a word break.
#
# As in case 2, this is only allowed at the end of the word. Unfortunately,
# we can't use the word-break expression \b in this case, because "@"
# already is a word break according to Unicode. Instead, we use a negative
# lookahead assertion to ensure that the next character is not word-like.
(?:
@s? (?!\w) | \b
)
|
# Another subtle fact: the "non-breaking space" U+A0 counts as a word break
# here. That's surprising, but it's also what we want, because we don't want
# any kind of spaces in the middle of our tokens.
# Case 3: Fix French
# Case 4: Fix French
# ------------------
# This allows us to match the articles in French, Catalan, and related
# languages, such as «l'», that we may have excluded from being part of
@@ -100,13 +130,14 @@ TOKEN_RE = regex.compile(r"""
""".replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
-# This expression is similar to the expression above, but also matches any
-# sequence of punctuation characters.
+# This expression is similar to the expression above. It adds a case between
+# 2 and 3 that matches any sequence of punctuation characters.
-[<SPACELESS>]+ |
-[\p{punct}]+ |
-(?=[\w\p{So}]) (?!\w'[Hh]) \X+? \b |
-\w'
+[<SPACELESS>]+ | # Case 1
+@s \b | # Case 2
+[\p{punct}]+ | # punctuation
+(?=[\w\p{So}]) (?!\w'[Hh]) \X+? (?: @s? (?!\w) | \b) | # Case 3
+\w' # Case 4
""".replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)