Bake the 'h special case into the regex

This lets me remove the French-specific code I just put in.
Robyn Speer 2016-12-06 17:37:35 -05:00
parent 82eba05f2d
commit 21a78f5eb9
2 changed files with 37 additions and 60 deletions
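
What the change does, in miniature: under the 'regex' module's Unicode word segmentation (the WORD flag), a word already breaks after an apostrophe when a vowel follows, so «qu'» and «d'» come off on their own, but a silent «h» does not trigger that break, and «l'heure» would stay one token. The new rule adds a negative lookahead so that Case 2 of TOKEN_RE refuses to start a token on a letter followed by «'h», and a new Case 3 alternative, «\w'», picks up the stranded article. Below is a minimal runnable sketch of that trick, reduced to Cases 2 and 3 (the real TOKEN_RE also handles spaceless scripts and symbol characters); the expected outputs follow this commit's tests.

    import regex  # the third-party 'regex' module that wordfreq uses

    # A hypothetical reduced pattern, not the real TOKEN_RE: the
    # spaceless-script case is dropped and the token start is simplified to \w.
    SKETCH_RE = regex.compile(r"""
        (?!\w'[Hh])          # Case 2 must not swallow a letter followed by «'h»
        \w(?:\B\S|\p{M})* |  # Case 2: extend while Unicode sees no word break
        \w'                  # Case 3: the article left behind, such as «l'»
    """, regex.V1 | regex.WORD | regex.VERBOSE)

    print(SKETCH_RE.findall("l'heure"))  # ["l'", 'heure']: the case being fixed
    print(SKETCH_RE.findall("qu'un"))    # ["qu'", 'un']: a vowel needs no help
    print(SKETCH_RE.findall("isn't"))    # ["isn't"]: contractions survive

When include_punctuation is False, the trailing apostrophe is stripped downstream (the tests expect ['l', 'heure'] without punctuation but ["l'", 'heure'] with it).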

Changed file 1 of 2 (the test module):

@@ -3,23 +3,27 @@ from wordfreq import tokenize, word_frequency
 def test_apostrophes():
-    for lang in ('fr', 'ca', 'oc'):
-        eq_(tokenize("langues d'oïl", lang),
-            ['langues', "d", 'oïl'])
-        eq_(tokenize("langues d'oïl", lang, include_punctuation=True),
-            ['langues', "d'", 'oïl'])
-        eq_(tokenize("l'heure", lang),
-            ['l', 'heure'])
-        eq_(tokenize("l'heure", lang, include_punctuation=True),
-            ["l'", 'heure'])
-        eq_(tokenize("L'Hôpital", lang, include_punctuation=True),
-            ["l'", 'hôpital'])
-        eq_(tokenize("This isn't French", lang),
-            ['this', "isn't", 'french'])
+    # Test that we handle apostrophes in French reasonably.
+    eq_(tokenize("qu'un", 'fr'), ['qu', 'un'])
+    eq_(tokenize("qu'un", 'fr', include_punctuation=True),
+        ["qu'", "un"])
+    eq_(tokenize("langues d'oïl", 'fr'),
+        ['langues', "d", 'oïl'])
+    eq_(tokenize("langues d'oïl", 'fr', include_punctuation=True),
+        ['langues', "d'", 'oïl'])
+    eq_(tokenize("l'heure", 'fr'),
+        ['l', 'heure'])
+    eq_(tokenize("l'heure", 'fr', include_punctuation=True),
+        ["l'", 'heure'])
+    eq_(tokenize("L'Hôpital", 'fr', include_punctuation=True),
+        ["l'", 'hôpital'])
+    eq_(tokenize("This isn't French", 'en'),
+        ['this', "isn't", 'french'])
 
 
-def test_catalan():
-    # Catalan orthography is fiddly. Test that we get a short sentence right.
+def test_catastrophes():
+    # More apostrophes, but this time they're in Catalan, and there's other
+    # mid-word punctuation going on too.
     eq_(tokenize("M'acabo d'instal·lar.", 'ca'),
         ['m', 'acabo', 'd', 'instal·lar'])
     eq_(tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True),

Changed file 2 of 2 (the tokenizer module):

@@ -22,23 +22,6 @@ ABJAD_LANGUAGES = {
     'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'
 }
 
-# Languages that can stick particles such as «l'» onto a word starting with
-# a vowel sound, and where this vowel sound can follow a silent «h». These
-# are French and related languages.
-FRENCH_ISH_LANGUAGES = {
-    'fr',   # French
-    'ca',   # Catalan
-    'frp',  # Franco-Provençal or Arpitan
-    'nrf',  # Norman French / Jèrriais / Guernésiais
-    'oc',   # Occitan
-    'pcd',  # Picard
-    'wa',   # Walloon
-    'frm',  # Middle French
-    'fro',  # Old French
-}
-
 
 def _make_spaceless_expr():
     pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in SPACELESS_SCRIPTS]
     return ''.join(pieces)
@@ -76,6 +59,13 @@ TOKEN_RE = regex.compile(r"""
     # Case 2: standard Unicode segmentation
     # -------------------------------------
 
+    # The start of the token must not be a letter followed by «'h». If it is,
+    # we should use Case 3 to match up to the apostrophe, then match a new token
+    # starting with «h». This rule lets us break «l'heure» into two tokens, just
+    # like we would do for «l'arc».
+    (?!\w'[Hh])
+
     # The start of the token must be 'word-like', not punctuation or whitespace
     # or various other things. However, we allow characters of category So
     # (Symbol - Other) because many of these are emoji, which can convey
@@ -87,13 +77,22 @@ TOKEN_RE = regex.compile(r"""
     # (\S) and do not cause word breaks according to the Unicode word
     # segmentation heuristic (\B), or are categorized as Marks (\p{M}).
-    (?:\B\S|\p{M})*
+    (?:\B\S|\p{M})* |
+
+    # Case 3: Fix French
+    # ------------------
+    # This allows us to match the articles in French, Catalan, and related
+    # languages, such as «l'», that we may have excluded from being part of
+    # the token in Case 2.
+
+    \w'
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
 
 TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
     [<SPACELESS>]+ |
     [\p{punct}]+ |
-    \S(?:\B\S|\p{M})*
+    (?!\w'[Hh]) \S(?:\B\S|\p{M})* |
+    \w'
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
 
 MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
@@ -160,30 +159,6 @@ def romanian_tokenize(text, include_punctuation=False):
     ]
 
 
-def french_tokenize(text, include_punctuation=False):
-    """
-    Handle French apostrophes that precede an 'h', which should work the same as
-    before a vowel, which the Unicode Consortium forgot. "l'heure" should tokenize
-    as "l'" and "heure".
-
-    This also applies the same way to other languages such as Catalan.
-    """
-    tokens = []
-    for token in simple_tokenize(text, include_punctuation):
-        if "'h" in token:
-            idx = token.find("'h")
-            if include_punctuation:
-                # Only include the apostrophe in the token if
-                # include_punctuation is True
-                tokens.append(token[:idx + 1])
-            else:
-                tokens.append(token[:idx])
-            tokens.append(token[idx + 1:])
-        else:
-            tokens.append(token)
-    return tokens
-
-
 def tokenize_mecab_language(text, lang, include_punctuation=False):
     """
     Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
@@ -360,8 +335,6 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
         return turkish_tokenize(text, include_punctuation)
     elif lang == 'ro':
         return romanian_tokenize(text, include_punctuation)
-    elif lang in FRENCH_ISH_LANGUAGES:
-        return french_tokenize(text, include_punctuation)
     elif lang in ABJAD_LANGUAGES:
         text = remove_marks(unicodedata.normalize('NFKC', text))
         return simple_tokenize(text, include_punctuation)