Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-23 09:21:37 +00:00.
Bake the 'h special case into the regex
This lets me remove the French-specific code I just put in.
This commit is contained in:
parent
82eba05f2d
commit
21a78f5eb9
@ -3,23 +3,27 @@ from wordfreq import tokenize, word_frequency
|
||||
|
||||
|
||||
def test_apostrophes():
    """
    Elided articles such as «l'» should be split from the following word in
    French-like languages, even when the word starts with a silent 'h'
    ("l'heure"), while English contractions stay whole.
    """
    # (input text, include_punctuation, expected tokens)
    cases = [
        ("langues d'oïl", False, ['langues', "d", 'oïl']),
        ("langues d'oïl", True, ['langues', "d'", 'oïl']),
        ("l'heure", False, ['l', 'heure']),
        ("l'heure", True, ["l'", 'heure']),
        ("L'Hôpital", True, ["l'", 'hôpital']),
        ("This isn't French", False, ['this', "isn't", 'french']),
    ]
    for lang in ('fr', 'ca', 'oc'):
        for text, with_punct, expected in cases:
            eq_(tokenize(text, lang, include_punctuation=with_punct), expected)
|
||||
# Test that we handle apostrophes in French reasonably.
|
||||
eq_(tokenize("qu'un", 'fr'), ['qu', 'un'])
|
||||
eq_(tokenize("qu'un", 'fr', include_punctuation=True),
|
||||
["qu'", "un"])
|
||||
eq_(tokenize("langues d'oïl", 'fr'),
|
||||
['langues', "d", 'oïl'])
|
||||
eq_(tokenize("langues d'oïl", 'fr', include_punctuation=True),
|
||||
['langues', "d'", 'oïl'])
|
||||
eq_(tokenize("l'heure", 'fr'),
|
||||
['l', 'heure'])
|
||||
eq_(tokenize("l'heure", 'fr', include_punctuation=True),
|
||||
["l'", 'heure'])
|
||||
eq_(tokenize("L'Hôpital", 'fr', include_punctuation=True),
|
||||
["l'", 'hôpital'])
|
||||
eq_(tokenize("This isn't French", 'en'),
|
||||
['this', "isn't", 'french'])
|
||||
|
||||
|
||||
def test_catalan():
|
||||
# Catalan orthography is fiddly. Test that we get a short sentence right.
|
||||
def test_catastrophes():
|
||||
# More apostrophes, but this time they're in Catalan, and there's other
|
||||
# mid-word punctuation going on too.
|
||||
eq_(tokenize("M'acabo d'instal·lar.", 'ca'),
|
||||
['m', 'acabo', 'd', 'instal·lar'])
|
||||
eq_(tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True),
|
||||
|
@ -22,23 +22,6 @@ ABJAD_LANGUAGES = {
|
||||
'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'
|
||||
}
|
||||
|
||||
# French and its relatives: languages that attach particles such as «l'» to
# a word beginning with a vowel sound, where that vowel sound may also follow
# a silent «h». Listed alphabetically by language code.
FRENCH_ISH_LANGUAGES = {
    'ca',   # Catalan
    'fr',   # French
    'frm',  # Middle French
    'fro',  # Old French
    'frp',  # Franco-Provençal or Arpitan
    'nrf',  # Norman French / Jèrriais / Guernésiais
    'oc',   # Occitan
    'pcd',  # Picard
    'wa',   # Walloon
}
|
||||
|
||||
|
||||
def _make_spaceless_expr():
    """
    Build the regex character-class fragment that matches ideographs plus
    every script listed in SPACELESS_SCRIPTS (scripts written without spaces
    between words).
    """
    script_pieces = (r'\p{Script=%s}' % code for code in SPACELESS_SCRIPTS)
    return r'\p{IsIdeo}' + ''.join(script_pieces)
|
||||
@ -76,6 +59,13 @@ TOKEN_RE = regex.compile(r"""
|
||||
# Case 2: standard Unicode segmentation
|
||||
# -------------------------------------
|
||||
|
||||
# The start of the token must not be a letter followed by «'h». If it is,
|
||||
# we should use Case 3 to match up to the apostrophe, then match a new token
|
||||
# starting with «h». This rule lets us break «l'heure» into two tokens, just
|
||||
# like we would do for «l'arc».
|
||||
|
||||
(?!\w'[Hh])
|
||||
|
||||
# The start of the token must be 'word-like', not punctuation or whitespace
|
||||
# or various other things. However, we allow characters of category So
|
||||
# (Symbol - Other) because many of these are emoji, which can convey
|
||||
@ -87,13 +77,22 @@ TOKEN_RE = regex.compile(r"""
|
||||
# (\S) and do not cause word breaks according to the Unicode word
|
||||
# segmentation heuristic (\B), or are categorized as Marks (\p{M}).
|
||||
|
||||
(?:\B\S|\p{M})*
|
||||
(?:\B\S|\p{M})* |
|
||||
|
||||
# Case 3: Fix French
|
||||
# ------------------
|
||||
# This allows us to match the articles in French, Catalan, and related
|
||||
# languages, such as «l'», that we may have excluded from being part of
|
||||
# the token in Case 2.
|
||||
|
||||
\w'
|
||||
""".replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
|
||||
|
||||
TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
|
||||
[<SPACELESS>]+ |
|
||||
[\p{punct}]+ |
|
||||
\S(?:\B\S|\p{M})*
|
||||
(?!\w'[Hh]) \S(?:\B\S|\p{M})* |
|
||||
\w'
|
||||
""".replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)
|
||||
|
||||
MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
|
||||
@ -160,30 +159,6 @@ def romanian_tokenize(text, include_punctuation=False):
|
||||
]
|
||||
|
||||
|
||||
def french_tokenize(text, include_punctuation=False):
    """
    Tokenize text, additionally splitting an elided article off a word that
    begins with a silent 'h' — "l'heure" becomes "l'" and "heure" — a case
    the Unicode word-segmentation rules do not handle.

    This also applies the same way to other languages such as Catalan.
    """
    result = []
    for piece in simple_tokenize(text, include_punctuation):
        split_at = piece.find("'h")
        if split_at == -1:
            result.append(piece)
            continue
        # The apostrophe stays attached to the article only when the caller
        # asked for punctuation to be included.
        article_end = split_at + 1 if include_punctuation else split_at
        result.append(piece[:article_end])
        result.append(piece[split_at + 1:])
    return result
|
||||
|
||||
|
||||
def tokenize_mecab_language(text, lang, include_punctuation=False):
|
||||
"""
|
||||
Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
|
||||
@ -360,8 +335,6 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
|
||||
return turkish_tokenize(text, include_punctuation)
|
||||
elif lang == 'ro':
|
||||
return romanian_tokenize(text, include_punctuation)
|
||||
elif lang in FRENCH_ISH_LANGUAGES:
|
||||
return french_tokenize(text, include_punctuation)
|
||||
elif lang in ABJAD_LANGUAGES:
|
||||
text = remove_marks(unicodedata.normalize('NFKC', text))
|
||||
return simple_tokenize(text, include_punctuation)
|
||||
|
Loading…
Reference in New Issue
Block a user