From a31deec58010016648e3a1311107e5f925c7e14d Mon Sep 17 00:00:00 2001 From: Lance Nathan Date: Thu, 8 Oct 2020 12:23:22 -0400 Subject: [PATCH] Update the "initial vowels" in French/Catalan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User LBeaudoux observed (https://github.com/LuminosoInsight/wordfreq/pull/82) that "Œ and œ should be considered as vowels that might appear at the start of a word in French". Further investigation of the French wordfreq list revealed words in the data starting with other vowels (such as d'yvonne, d'åland, l'ïle, d'özil). This PR is a combination of LBeaudoux's PR and the latter fact. (The updated regex is also used for Catalan, but should have no actual effect. To the best of our understanding, "y" appears in Catalan only in the digraph "ny" and in foreign words--the Catalan wordlist contains "york", "by", "city", several English names, and so forth, but no real Catalan words starting with "y"; cf "ioga", "iogurt". The wordlist in fact contained "l'fbi" and "l'nba", but no cases of "l'" followed by a vowel like the ones found in French.) --- wordfreq/tokens.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py index 3d8f9bf..30f9a09 100644 --- a/wordfreq/tokens.py +++ b/wordfreq/tokens.py @@ -31,7 +31,7 @@ SPACELESS_EXPR = _make_spaceless_expr() # All vowels that might appear at the start of a word in French or Catalan, # plus 'h' which would be silent and imply a following vowel sound. -INITIAL_VOWEL_EXPR = '[AEHIOUÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛaehiouáéíóúàèìòùâêîôû]' +INITIAL_VOWEL_EXPR = '[AEHIOUYÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÅÏÖŒaehiouyáéíóúàèìòùâêîôûåïöœ]' TOKEN_RE = regex.compile( r"""