diff --git a/CHANGELOG.md b/CHANGELOG.md
index 79fd6b2..0099d45 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,16 @@
+## Version 2.3.2 (2020-04-28)
+
+- Relaxing the dependency on regex had an unintended consequence in 2.3.1:
+  it could no longer get the frequency of French phrases such as "l'écran"
+  because their tokenization behavior changed.
+
+  2.3.2 fixes this with a more complex tokenization rule that should handle
+  apostrophes the same across these various versions of regex.
+
 ## Version 2.3.1 (2020-04-22)
 
 - State the dependency on msgpack >= 1.0 in setup.py.
-
+- Relax the dependency on regex to allow versions after 2018.02.08.
 
 
 ## Version 2.3 (2020-04-16)
diff --git a/setup.py b/setup.py
index aa482a0..4b5927b 100755
--- a/setup.py
+++ b/setup.py
@@ -35,7 +35,7 @@ if sys.version_info < (3, 4):
 setup(
     name="wordfreq",
-    version='2.3.1',
+    version='2.3.2',
     maintainer='Robyn Speer',
     maintainer_email='rspeer@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
diff --git a/tests/test_french_and_related.py b/tests/test_french_and_related.py
index 58b2d88..c27ecae 100644
--- a/tests/test_french_and_related.py
+++ b/tests/test_french_and_related.py
@@ -8,6 +8,7 @@ def test_apostrophes():
     assert tokenize("langues d'oïl", 'fr') == ['langues', "d", 'oïl']
     assert tokenize("langues d'oïl", 'fr', include_punctuation=True) == ['langues', "d'", 'oïl']
     assert tokenize("l'heure", 'fr') == ['l', 'heure']
+    assert tokenize("l'ànima", 'ca') == ['l', 'ànima']
     assert tokenize("l'heure", 'fr', include_punctuation=True) == ["l'", 'heure']
     assert tokenize("L'Hôpital", 'fr', include_punctuation=True) == ["l'", 'hôpital']
     assert tokenize("aujourd'hui", 'fr') == ["aujourd'hui"]
diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index e2918fc..3d8f9bf 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -3,7 +3,11 @@ import unicodedata
 import logging
 import langcodes
 
-from .language_info import get_language_info, SPACELESS_SCRIPTS, EXTRA_JAPANESE_CHARACTERS
+from .language_info import (
+    get_language_info,
+    SPACELESS_SCRIPTS,
+    EXTRA_JAPANESE_CHARACTERS,
+)
 from .preprocess import preprocess_text, smash_numbers
 
 # Placeholders for CJK functions that we'll import on demand
@@ -17,13 +21,20 @@ logger = logging.getLogger(__name__)
 
 def _make_spaceless_expr():
     scripts = sorted(SPACELESS_SCRIPTS)
-    pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in scripts]
+    pieces = [r'\p{IsIdeo}'] + [
+        r'\p{Script=%s}' % script_code for script_code in scripts
+    ]
     return ''.join(pieces) + EXTRA_JAPANESE_CHARACTERS
 
 
 SPACELESS_EXPR = _make_spaceless_expr()
 
-TOKEN_RE = regex.compile(r"""
+# All vowels that might appear at the start of a word in French or Catalan,
+# plus 'h' which would be silent and imply a following vowel sound.
+INITIAL_VOWEL_EXPR = '[AEHIOUÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛaehiouáéíóúàèìòùâêîôû]'
+
+TOKEN_RE = regex.compile(
+    r"""
 
     # Case 1: a special case for non-spaced languages
     # -----------------------------------------------
@@ -78,24 +89,32 @@
 
     (?=[\w\p{So}])
 
-    # The start of the token must not be a letter followed by «'h». If it is,
-    # we should use Case 3 to match up to the apostrophe, then match a new token
-    # starting with «h». This rule lets us break «l'heure» into two tokens, just
-    # like we would do for «l'arc».
+    # The start of the token must not consist of 1-2 letters, an apostrophe,
+    # and a vowel or 'h'. This is a sequence that occurs particularly in French
+    # phrases such as "l'arc", "d'heure", or "qu'un". In these cases we want
+    # the sequence up to the apostrophe to be considered as a separate token,
+    # even though apostrophes are not usually word separators (the word "won't"
+    # does not separate into "won" and "t").
+    #
+    # This would be taken care of by optional rule "WB5a" in Unicode TR29,
+    # "Unicode Text Segmentation". That optional rule was applied in `regex`
+    # before June 2018, but no longer is, so we have to do it ourselves.
 
-    (?!\w'[Hh])
+    (?!\w\w?'<VOWEL>)
 
     # The entire token is made of graphemes (\X). Matching by graphemes means
-    # that we don't have to specially account for marks or ZWJ sequences. We use
-    # a non-greedy match so that we can control where the match ends in the
+    # that we don't have to specially account for marks or ZWJ sequences. We
+    # use a non-greedy match so that we can control where the match ends in the
     # following expression.
     #
     # If we were matching by codepoints (.) instead of graphemes (\X), then
-    # detecting boundaries would be more difficult. Here's a fact that's subtle
-    # and poorly documented: a position that's between codepoints, but in the
-    # middle of a grapheme, does not match as a word break (\b), but also does
-    # not match as not-a-word-break (\B). The word boundary algorithm simply
-    # doesn't apply in such a position.
+    # detecting boundaries would be more difficult. Here's a fact about the
+    # regex module that's subtle and poorly documented: a position that's
+    # between codepoints, but in the middle of a grapheme, does not match as a
+    # word break (\b), but also does not match as not-a-word-break (\B). The
+    # word boundary algorithm simply doesn't apply in such a position. It is
+    # unclear whether this is intentional.
+
     \X+?
 
     # The token ends when it encounters a word break (\b). We use the
@@ -120,25 +139,39 @@
     # here. That's surprising, but it's also what we want, because we don't want
     # any kind of spaces in the middle of our tokens.
 
-    # Case 4: Fix French
-    # ------------------
-    # This allows us to match the articles in French, Catalan, and related
-    # languages, such as «l'», that we may have excluded from being part of
-    # the token in Case 2.
+    # Case 4: Match French apostrophes
+    # --------------------------------
+    # This allows us to match the particles in French, Catalan, and related
+    # languages, such as «l'» and «qu'», that we may have excluded from being
+    # part of the token in Case 3.
 
-    \w'
-""".replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
+    \w\w?'
+""".replace(
+    '<SPACELESS>', SPACELESS_EXPR
+).replace(
+    '<VOWEL>', INITIAL_VOWEL_EXPR
+),
+    regex.V1 | regex.WORD | regex.VERBOSE,
+)
 
-TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
+TOKEN_RE_WITH_PUNCTUATION = regex.compile(
+    r"""
     # This expression is similar to the expression above. It adds a case between
     # 2 and 3 that matches any sequence of punctuation characters.
 
     [<SPACELESS>]+ |                                        # Case 1
     @s \b |                                                 # Case 2
     [\p{punct}]+ |                                          # punctuation
-    (?=[\w\p{So}]) (?!\w'[Hh]) \X+? (?: @s? (?!w) | \b) |   # Case 3
-    \w'                                                     # Case 4
-""".replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
+    (?=[\w\p{So}]) (?!\w\w?'<VOWEL>)
+    \X+? (?: @s? (?!w) | \b) |                              # Case 3
+    \w\w?'                                                  # Case 4
+""".replace(
+    '<SPACELESS>', SPACELESS_EXPR
+).replace(
+    '<VOWEL>', INITIAL_VOWEL_EXPR
+),
+    regex.V1 | regex.WORD | regex.VERBOSE,
+)
 
 
 # Just identify punctuation, for cases where the tokenizer is separate
@@ -180,10 +213,7 @@ def simple_tokenize(text, include_punctuation=False):
             for token in TOKEN_RE_WITH_PUNCTUATION.findall(text)
         ]
     else:
-        return [
-            token.strip("'").casefold()
-            for token in TOKEN_RE.findall(text)
-        ]
+        return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
 
 
 def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
@@ -228,6 +258,7 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
 
     if info['tokenizer'] == 'mecab':
         from wordfreq.mecab import mecab_tokenize as _mecab_tokenize
+
         # Get just the language code out of the Language object, so we can
         # use it to select a MeCab dictionary
         tokens = _mecab_tokenize(text, language.language)
@@ -235,6 +266,7 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
             tokens = [token for token in tokens if not PUNCT_RE.match(token)]
     elif info['tokenizer'] == 'jieba':
         from wordfreq.chinese import jieba_tokenize as _jieba_tokenize
+
         tokens = _jieba_tokenize(text, external_wordlist=external_wordlist)
         if not include_punctuation:
             tokens = [token for token in tokens if not PUNCT_RE.match(token)]
@@ -245,8 +277,9 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
         if info['tokenizer'] != 'regex' and lang not in _WARNED_LANGUAGES:
             logger.warning(
                 "The language '{}' is in the '{}' script, which we don't "
-                "have a tokenizer for. The results will be bad."
-                .format(lang, info['script'])
+                "have a tokenizer for. The results will be bad.".format(
+                    lang, info['script']
+                )
             )
             _WARNED_LANGUAGES.add(lang)
         tokens = simple_tokenize(text, include_punctuation=include_punctuation)
@@ -254,7 +287,9 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
     return tokens
 
 
-def lossy_tokenize(text, lang, include_punctuation=False, external_wordlist=False):
+def lossy_tokenize(
+    text, lang, include_punctuation=False, external_wordlist=False
+):
     """
     Get a list of tokens for this text, with largely the same results and
     options as `tokenize`, but aggressively normalize some text in a lossy way
@@ -279,6 +314,7 @@ def lossy_tokenize(text, lang, include_punctuation=False, external_wordlist=Fals
 
     if info['lookup_transliteration'] == 'zh-Hans':
         from wordfreq.chinese import simplify_chinese as _simplify_chinese
+
         tokens = [_simplify_chinese(token) for token in tokens]
 
     return [smash_numbers(token) for token in tokens]
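
For reference, here is a small demonstration (not part of the patch) of the behavior this change restores, using wordfreq's public tokenize() function. The expected values are copied from tests/test_french_and_related.py above, and the snippet assumes wordfreq 2.3.2 is installed.

    from wordfreq import tokenize

    # French and Catalan elided particles are split off before a vowel or a
    # silent 'h'; the apostrophe is stripped unless punctuation is requested.
    assert tokenize("l'heure", 'fr') == ['l', 'heure']
    assert tokenize("l'ànima", 'ca') == ['l', 'ànima']
    assert tokenize("l'heure", 'fr', include_punctuation=True) == ["l'", 'heure']

    # Words that merely contain an apostrophe are left alone.
    assert tokenize("aujourd'hui", 'fr') == ["aujourd'hui"]

The mechanism can also be sketched directly with the regex module. The pattern below is a simplified, hypothetical stand-in for the real TOKEN_RE, not the exact code in tokens.py: a negative lookahead keeps the main word case (Case 3) from starting on 1-2 letters plus an apostrophe plus a vowel or 'h', and a separate alternative (Case 4) then matches just the letters and the apostrophe.

    import regex  # the third-party 'regex' module that wordfreq uses

    # Hypothetical, simplified vowel class; the real one is INITIAL_VOWEL_EXPR.
    VOWEL = "[aehiouàâéèêëîïôùûü]"

    SIMPLE_RE = regex.compile(
        r"(?=\w)(?!\w\w?'" + VOWEL + r")\X+?\b"   # simplified Case 3
        r"|\w\w?'",                               # simplified Case 4
        regex.V1 | regex.WORD,
    )
    print(SIMPLE_RE.findall("l'heure qu'un aujourd'hui"))
    # should print something like: ["l'", 'heure', "qu'", 'un', "aujourd'hui"]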