Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 09:21:37 +00:00)
Bake the 'h special case into the regex
This lets me remove the French-specific code I just put in.
This commit is contained in:
parent: 82eba05f2d
commit: 21a78f5eb9
@@ -3,23 +3,27 @@ from wordfreq import tokenize, word_frequency


 def test_apostrophes():
-    for lang in ('fr', 'ca', 'oc'):
-        eq_(tokenize("langues d'oïl", lang),
-            ['langues', "d", 'oïl'])
-        eq_(tokenize("langues d'oïl", lang, include_punctuation=True),
-            ['langues', "d'", 'oïl'])
-        eq_(tokenize("l'heure", lang),
-            ['l', 'heure'])
-        eq_(tokenize("l'heure", lang, include_punctuation=True),
-            ["l'", 'heure'])
-        eq_(tokenize("L'Hôpital", lang, include_punctuation=True),
-            ["l'", 'hôpital'])
-        eq_(tokenize("This isn't French", lang),
-            ['this', "isn't", 'french'])
+    # Test that we handle apostrophes in French reasonably.
+    eq_(tokenize("qu'un", 'fr'), ['qu', 'un'])
+    eq_(tokenize("qu'un", 'fr', include_punctuation=True),
+        ["qu'", "un"])
+    eq_(tokenize("langues d'oïl", 'fr'),
+        ['langues', "d", 'oïl'])
+    eq_(tokenize("langues d'oïl", 'fr', include_punctuation=True),
+        ['langues', "d'", 'oïl'])
+    eq_(tokenize("l'heure", 'fr'),
+        ['l', 'heure'])
+    eq_(tokenize("l'heure", 'fr', include_punctuation=True),
+        ["l'", 'heure'])
+    eq_(tokenize("L'Hôpital", 'fr', include_punctuation=True),
+        ["l'", 'hôpital'])
+    eq_(tokenize("This isn't French", 'en'),
+        ['this', "isn't", 'french'])


-def test_catalan():
-    # Catalan orthography is fiddly. Test that we get a short sentence right.
+def test_catastrophes():
+    # More apostrophes, but this time they're in Catalan, and there's other
+    # mid-word punctuation going on too.
     eq_(tokenize("M'acabo d'instal·lar.", 'ca'),
         ['m', 'acabo', 'd', 'instal·lar'])
     eq_(tokenize("M'acabo d'instal·lar.", 'ca', include_punctuation=True),
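For a quick check outside the test suite, these are the behaviors the updated test_apostrophes pins down, run through wordfreq's public tokenize function. The expected outputs are copied from the assertions above; this assumes a wordfreq build that includes this commit.

from wordfreq import tokenize

print(tokenize("l'heure", 'fr'))                              # ['l', 'heure']
print(tokenize("l'heure", 'fr', include_punctuation=True))    # ["l'", 'heure']
print(tokenize("L'Hôpital", 'fr', include_punctuation=True))  # ["l'", 'hôpital']
print(tokenize("This isn't French", 'en'))                    # ['this', "isn't", 'french']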
@@ -22,23 +22,6 @@ ABJAD_LANGUAGES = {
     'ar', 'bal', 'fa', 'ku', 'ps', 'sd', 'tk', 'ug', 'ur', 'he', 'yi'
 }

-# Languages that can stick particles such as «l'» onto a word starting with
-# a vowel sound, and where this vowel sound can follow a silent «h». These
-# are French and related languages.
-FRENCH_ISH_LANGUAGES = {
-    'fr',   # French
-    'ca',   # Catalan
-    'frp',  # Franco-Provençal or Arpitan
-    'nrf',  # Norman French / Jèrriais / Guernésiais
-    'oc',   # Occitan
-    'pcd',  # Picard
-    'wa',   # Walloon
-
-    'frm',  # Middle French
-    'fro',  # Old French
-}
-
-
 def _make_spaceless_expr():
     pieces = [r'\p{IsIdeo}'] + [r'\p{Script=%s}' % script_code for script_code in SPACELESS_SCRIPTS]
     return ''.join(pieces)
@@ -76,6 +59,13 @@ TOKEN_RE = regex.compile(r"""
     # Case 2: standard Unicode segmentation
     # -------------------------------------

+    # The start of the token must not be a letter followed by «'h». If it is,
+    # we should use Case 3 to match up to the apostrophe, then match a new token
+    # starting with «h». This rule lets us break «l'heure» into two tokens, just
+    # like we would do for «l'arc».
+
+    (?!\w'[Hh])
+
     # The start of the token must be 'word-like', not punctuation or whitespace
     # or various other things. However, we allow characters of category So
     # (Symbol - Other) because many of these are emoji, which can convey
@@ -87,13 +77,22 @@ TOKEN_RE = regex.compile(r"""
     # (\S) and do not cause word breaks according to the Unicode word
     # segmentation heuristic (\B), or are categorized as Marks (\p{M}).

-    (?:\B\S|\p{M})*
+    (?:\B\S|\p{M})* |
+
+    # Case 3: Fix French
+    # ------------------
+    # This allows us to match the articles in French, Catalan, and related
+    # languages, such as «l'», that we may have excluded from being part of
+    # the token in Case 2.
+
+    \w'
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)

 TOKEN_RE_WITH_PUNCTUATION = regex.compile(r"""
     [<SPACELESS>]+ |
     [\p{punct}]+ |
-    \S(?:\B\S|\p{M})*
+    (?!\w'[Hh]) \S(?:\B\S|\p{M})* |
+    \w'
 """.replace('<SPACELESS>', SPACELESS_EXPR), regex.V1 | regex.WORD | regex.VERBOSE)

 MARK_RE = regex.compile(r'[\p{Mn}\N{ARABIC TATWEEL}]', regex.V1)
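Taken together, the two TOKEN_RE hunks implement the commit title: Case 2 gains the negative lookahead (?!\w'[Hh]), so a token may not begin as «letter + 'h», and the new Case 3 alternative \w' picks up the stranded elided article. Below is a hypothetical stand-alone sketch of just that interplay. It is not wordfreq's actual TOKEN_RE: the real pattern relies on the regex module's word segmentation (\B under regex.WORD) and lowercases its output, while this sketch substitutes a crude \w[\w']* stand-in, so it does not reproduce the apostrophe-before-vowel splits (such as «d'oïl» in the tests) that the real pattern gets from its word-boundary handling.

import regex  # the third-party 'regex' module, the same one wordfreq uses

SKETCH_RE = regex.compile(r"""
    (?!\w'[Hh])     # the token may not begin as «letter + 'h» ...
    \w[\w']*        # ... otherwise it would swallow «l'heure» whole
    |
    \w'             # Case 3: the stranded elided article, e.g. «l'» or «d'»
""", regex.VERBOSE)

print(SKETCH_RE.findall("l'heure"))    # ["l'", 'heure']
print(SKETCH_RE.findall("L'Hôpital"))  # ["L'", 'Hôpital']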
@@ -160,30 +159,6 @@ def romanian_tokenize(text, include_punctuation=False):
     ]


-def french_tokenize(text, include_punctuation=False):
-    """
-    Handle French apostrophes that precede an 'h', which should work the same as
-    before a vowel, which the Unicode Consortium forgot. "l'heure" should tokenize
-    as "l'" and "heure".
-
-    This also applies the same way to other languages such as Catalan.
-    """
-    tokens = []
-    for token in simple_tokenize(text, include_punctuation):
-        if "'h" in token:
-            idx = token.find("'h")
-            if include_punctuation:
-                # Only include the apostrophe in the token if
-                # include_punctuation is True
-                tokens.append(token[:idx + 1])
-            else:
-                tokens.append(token[:idx])
-            tokens.append(token[idx + 1:])
-        else:
-            tokens.append(token)
-    return tokens
-
-
 def tokenize_mecab_language(text, lang, include_punctuation=False):
     """
     Tokenize Japanese or Korean text, initializing the MeCab tokenizer if necessary.
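For reference, this is the string slicing the deleted french_tokenize helper applied to a token containing «'h», traced by hand on «l'heure»; the indices follow the removed code above.

token = "l'heure"
idx = token.find("'h")       # 1
print(token[:idx])           # 'l'      -> token kept when include_punctuation is False
print(token[:idx + 1])       # "l'"     -> token kept when include_punctuation is True
print(token[idx + 1:])       # 'heure'  -> the remainder, appended as its own token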
@@ -360,8 +335,6 @@ def tokenize(text, lang, include_punctuation=False, external_wordlist=False):
         return turkish_tokenize(text, include_punctuation)
     elif lang == 'ro':
         return romanian_tokenize(text, include_punctuation)
-    elif lang in FRENCH_ISH_LANGUAGES:
-        return french_tokenize(text, include_punctuation)
     elif lang in ABJAD_LANGUAGES:
         text = remove_marks(unicodedata.normalize('NFKC', text))
         return simple_tokenize(text, include_punctuation)