Merge pull request #60 from LuminosoInsight/gender-neutral-at

Recognize "@" in gender-neutral word endings as part of the token
Lance Nathan 2018-07-24 18:16:31 -04:00 committed by GitHub
commit 2f8600e975
56 changed files with 36675 additions and 35955 deletions

View File

@@ -1,3 +1,24 @@
## Version 2.2 (2018-07-24)
Library change:
- While the @ sign is usually considered a symbol and not part of a word, there
is a case where it acts like a letter. It's used in one way of writing
gender-neutral words in Spanish and Portuguese, such as "l@s niñ@s". The
tokenizer in wordfreq will now allow words to end with "@" or "@s", so it
can recognize these words.
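For example, the tokenizer now keeps these words intact (mirroring the
doctest added to the README in this same commit):

>>> from wordfreq import tokenize
>>> tokenize('l@s niñ@s', 'es')
['l@s', 'niñ@s']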
Data changes:
- Updated the data from Exquisite Corpus to filter the ParaCrawl web crawl
better. ParaCrawl provides two metrics (Zipporah and Bicleaner) for the
goodness of its data, and we now filter it to only use texts that get
positive scores on both metrics.
- The input data includes the change to tokenization described above, giving
us word frequencies for words such as "l@s".
## Version 2.1 (2018-06-18)
Data changes:

View File

@@ -48,13 +48,13 @@ frequency as a decimal between 0 and 1.
1.07e-05
>>> word_frequency('café', 'en')
-5.89e-06
+5.75e-06
>>> word_frequency('cafe', 'fr')
1.51e-06
>>> word_frequency('café', 'fr')
-5.25e-05
+5.13e-05
`zipf_frequency` is a variation on `word_frequency` that aims to return the
@@ -78,10 +78,10 @@ one occurrence per billion words.
5.29
>>> zipf_frequency('frequency', 'en')
-4.42
+4.43
>>> zipf_frequency('zipf', 'en')
-1.55
+1.57
>>> zipf_frequency('zipf', 'en', wordlist='small')
0.0
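A Zipf value is the base-10 logarithm of the word's frequency per billion
words, so the two functions relate directly. A minimal sketch of that
conversion (the rounding to two decimals is an assumption about the
displayed precision):

>>> import math
>>> round(math.log10(word_frequency('frequency', 'en')) + 9, 2)
4.43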
@@ -276,7 +276,8 @@ produces tokens that follow the recommendations in [Unicode
Annex #29, Text Segmentation][uax29], including the optional rule that
splits words between apostrophes and vowels.
-There are language-specific exceptions:
+There are exceptions where we change the tokenization to work better
+with certain languages:
- In Arabic and Hebrew, it additionally normalizes ligatures and removes
combining marks.
@@ -288,11 +289,21 @@ There are language-specific exceptions:
- In Chinese, it uses the external Python library `jieba`, another optional
dependency.
- While the @ sign is usually considered a symbol and not part of a word,
wordfreq will allow a word to end with "@" or "@s". This is one way of
writing gender-neutral words in Spanish and Portuguese.
[uax29]: http://unicode.org/reports/tr29/
When wordfreq's frequency lists are built in the first place, the words are
tokenized according to this function.
>>> from wordfreq import tokenize
>>> tokenize('l@s niñ@s', 'es')
['l@s', 'niñ@s']
>>> zipf_frequency('l@s', 'es')
2.8
Because tokenization in the real world is far from consistent, wordfreq will
also try to deal gracefully when you query it with texts that actually break
into multiple tokens:
@@ -300,7 +311,7 @@ into multiple tokens:
>>> zipf_frequency('New York', 'en')
5.28
>>> zipf_frequency('北京地铁', 'zh') # "Beijing Subway"
-3.57
+3.61
The word frequencies are combined with the half-harmonic-mean function in order
to provide an estimate of what their combined frequency would be. In Chinese,
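A minimal sketch of that combination step, assuming the half-harmonic mean
of n frequencies is the reciprocal of the sum of their reciprocals (the
function name is illustrative, not part of wordfreq's public API):

>>> def half_harmonic_mean(freqs):
...     # For n equal values f this yields f / n; the rarest word
...     # dominates the estimate, as it should.
...     return 1 / sum(1 / f for f in freqs)
>>> half_harmonic_mean([0.01, 0.01])
0.005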

View File

@@ -35,7 +35,7 @@ if sys.version_info < (3, 4):
setup(
name="wordfreq",
-version='2.1.0',
+version='2.2.0',
maintainer='Luminoso Technologies, Inc.',
maintainer_email='info@luminoso.com',
url='http://github.com/LuminosoInsight/wordfreq/',

tests/test_at_sign.py (new file, 109 lines)
View File

@@ -0,0 +1,109 @@
from wordfreq import tokenize, lossy_tokenize, word_frequency
def test_gender_neutral_at():
# Recognize the gender-neutral @ in Spanish as part of the word
text = "La protección de los derechos de tod@s l@s trabajador@s migrantes"
assert tokenize(text, "es") == [
"la",
"protección",
"de",
"los",
"derechos",
"de",
"tod@s",
"l@s",
"trabajador@s",
"migrantes"
]
text = "el distrito 22@ de Barcelona"
assert tokenize(text, 'es') == ["el", "distrito", "22@", "de", "barcelona"]
assert lossy_tokenize(text, 'es') == ["el", "distrito", "00@", "de", "barcelona"]
# It also appears in Portuguese
text = "direitos e deveres para @s membr@s da comunidade virtual"
assert tokenize(text, "pt") == [
"direitos",
"e",
"deveres",
"para",
"@s",
"membr@s",
"da",
"comunidade",
"virtual"
]
# Because this is part of our tokenization, the language code doesn't
# actually matter, as long as it's a language with Unicode tokenization
text = "@s membr@s da comunidade virtual"
assert tokenize(text, "en") == ["@s", "membr@s", "da", "comunidade", "virtual"]
def test_at_in_corpus():
# We have a word frequency for "l@s"
assert word_frequency('l@s', 'es') > 0
# It's not just treated as a word break
assert word_frequency('l@s', 'es') < word_frequency('l s', 'es')
def test_punctuation_at():
# If the @ appears alone in a word, we consider it to be punctuation
text = "operadores de canal, que são aqueles que têm um @ ao lado do nick"
assert tokenize(text, "pt") == [
"operadores",
"de",
"canal",
"que",
"são",
"aqueles",
"que",
"têm",
"um",
"ao",
"lado",
"do",
"nick"
]
assert tokenize(text, "pt", include_punctuation=True) == [
"operadores",
"de",
"canal",
",",
"que",
"são",
"aqueles",
"que",
"têm",
"um",
"@",
"ao",
"lado",
"do",
"nick"
]
# If the @ is not at the end of the word or part of the word ending '@s',
# it is also punctuation
text = "un archivo hosts.deny que contiene la línea ALL:ALL@ALL"
assert tokenize(text, "es") == [
"un",
"archivo",
"hosts.deny",
"que",
"contiene",
"la",
"línea",
"all:all",
"all"
]
# Make sure not to catch e-mail addresses
text = "info@something.example"
assert tokenize(text, "en") == [
"info",
"something.example"
]

View File

@@ -59,7 +59,7 @@ def test_tokens():
def test_combination():
xiexie_freq = word_frequency('谢谢', 'zh') # "Thanks"
-assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20)
+assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20, rel=0.01)
def test_alternate_codes():
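The factor of 20 in test_combination above is consistent with the
half-harmonic mean of two equal frequencies (f / 2) multiplied by a penalty
of 10 for the word break that had to be inferred between the two Chinese
tokens. The penalty constant is our reading of the ratio, not something
stated in this diff:

from fractions import Fraction

f = Fraction(1, 10000)                # stand-in frequency for '谢谢'
half_harmonic = 1 / (1 / f + 1 / f)   # two equal tokens -> f / 2
assumed_penalty = 10                  # assumed cost of one inferred break
assert half_harmonic / assumed_penalty == f / 20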

File diff suppressed because it is too large

48 binary files changed but not shown.

View File

@@ -252,7 +252,7 @@ def cedillas_to_commas(text):
)
-def sub_zeroes(match):
+def _sub_zeroes(match):
"""
Given a regex match, return what it matched with digits replaced by
zeroes.
@@ -265,4 +265,4 @@ def smash_numbers(text):
Replace sequences of multiple digits with zeroes, so we don't need to
distinguish the frequencies of thousands of numbers.
"""
-return MULTI_DIGIT_RE.sub(sub_zeroes, text)
+return MULTI_DIGIT_RE.sub(_sub_zeroes, text)
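A self-contained sketch of this digit-smashing step, matching the
"22@" -> "00@" behavior exercised by lossy_tokenize in the tests above.
The exact pattern behind MULTI_DIGIT_RE is an assumption here:

import re

# Assumed stand-in for wordfreq's MULTI_DIGIT_RE: two or more digits in a row.
MULTI_DIGIT_RE = re.compile(r"\d\d+")

def _sub_zeroes(match):
    # Return what the regex matched, with every digit replaced by a zero.
    return re.sub(r"\d", "0", match.group(0))

def smash_numbers(text):
    # Collapse "22@" to "00@" so thousands of numbers share one frequency.
    return MULTI_DIGIT_RE.sub(_sub_zeroes, text)

print(smash_numbers("el distrito 22@ de Barcelona"))
# el distrito 00@ de Barcelona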

View File

@@ -48,10 +48,28 @@ TOKEN_RE = regex.compile(r"""
# <SPACELESS> will be replaced by the complex range expression made by
# _make_spaceless_expr().
[<SPACELESS>]+ |
[<SPACELESS>]+
|
# Case 2: standard Unicode segmentation
# -------------------------------------
# Case 2: Gender-neutral "@s"
# ---------------------------
#
# "@" and "@s" are gender-neutral word endings that can replace -a, -o,
# -as, and -os in Spanish, Portuguese, and occasionally Italian.
#
# This doesn't really conflict with other uses of the @ sign, so we simply
# recognize these endings as being part of the token in any language.
#
# We will recognize the endings as part of our main rule for recognizing
# words, which is Case 3 below. However, one case that remains separate is
# the Portuguese word "@s" itself, standing for the article "as" or "os".
# This must be followed by a word break (\b).
@s \b
|
# Case 3: Unicode segmentation with tweaks
# ----------------------------------------
# The start of the token must be 'word-like', not punctuation or whitespace
# or various other things. However, we allow characters of category So
@@ -68,29 +86,41 @@ TOKEN_RE = regex.compile(r"""
(?!\w'[Hh])
# The entire token is made of graphemes (\X). Matching by graphemes means
# that we don't have to specially account for marks or ZWJ sequences.
# that we don't have to specially account for marks or ZWJ sequences. We use
# a non-greedy match so that we can control where the match ends in the
# following expression.
#
# The token ends as soon as it encounters a word break (\b). We use the
# non-greedy match (+?) to make sure to end at the first word break we
# encounter.
\X+? \b |
# If we were matching by codepoints (.) instead of graphemes (\X), then
# detecting boundaries would be more difficult. Here's a fact that's subtle
# and poorly documented: a position that's between codepoints, but in the
# middle of a grapheme, does not match as a word break (\b), but also does
# not match as not-a-word-break (\B). The word boundary algorithm simply
# doesn't apply in such a position.
\X+?
# The token ends when it encounters a word break (\b). We use the
# non-greedy match (+?) to make sure to end at the first word break we
# encounter.
#
# We used to match the rest of the token using \S, which matches non-space
# *codepoints*, and this caused us to incompletely work around cases where
# it left off in the middle of a grapheme.
# We need a special case for gender-neutral "@", which is acting as a
# letter, but Unicode considers it to be a symbol and would break words
# around it. We prefer continuing the token with "@" or "@s" over matching
# a word break.
#
# As in case 2, this is only allowed at the end of the word. Unfortunately,
# we can't use the word-break expression \b in this case, because "@"
# already is a word break according to Unicode. Instead, we use a negative
# lookahead assertion to ensure that the next character is not word-like.
(?:
@s? (?!\w) | \b
)
|
# Another subtle fact: the "non-breaking space" U+A0 counts as a word break
# here. That's surprising, but it's also what we want, because we don't want
# any kind of spaces in the middle of our tokens.
# Case 3: Fix French
# Case 4: Fix French
# ------------------
# This allows us to match the articles in French, Catalan, and related
# languages, such as «l'», that we may have excluded from being part of
@@ -100,13 +130,14 @@ TOKEN_RE = regex.compile(r"""
""".replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
-# This expression is similar to the expression above, but also matches any
-# sequence of punctuation characters.
+# This expression is similar to the expression above. It adds a case between
+# 2 and 3 that matches any sequence of punctuation characters.
-[<SPACELESS>]+ |
-[\p{punct}]+ |
-(?=[\w\p{So}]) (?!\w'[Hh]) \X+? \b |
-\w'
+[<SPACELESS>]+ | # Case 1
+@s \b | # Case 2
+[\p{punct}]+ | # punctuation
+(?=[\w\p{So}]) (?!\w'[Hh]) \X+? (?: @s? (?!\w) | \b) | # Case 3
+\w' # Case 4
""".replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)