mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-24 01:41:39 +00:00
Exclude math and modifier symbols as tokens
Former-commit-id: 8f3c9f576c
This commit is contained in:
parent
8637aaef9e
commit
070c89c00c
@ -4,10 +4,11 @@ import unicodedata
|
|||||||
|
|
||||||
# Here's what the following regular expression is looking for:
|
# Here's what the following regular expression is looking for:
|
||||||
#
|
#
|
||||||
# At the start, it looks for a character in the set [\S--\p{punct}]. \S
|
# At the start, it looks for a character in the set \S -- the set of
|
||||||
# contains non-space characters, and then it subtracts the set of Unicode
|
# non-space characters -- with various characters subtracted out, including punctuation
|
||||||
# punctuation characters from that set. This is slightly different from \w,
|
# and most of the 'symbol' categories. (We leave So, "Symbol - Other", because
|
||||||
# because it leaves symbols (such as emoji) as tokens.
|
# it contains things like emoji that have interesting frequencies. This is why
|
||||||
|
# we don't just insist on the token starting with a "word" character, \w.)
|
||||||
#
|
#
|
||||||
# After it has found one such character, the rest of the token is (?:\B\S)*,
|
# After it has found one such character, the rest of the token is (?:\B\S)*,
|
||||||
# which continues to consume characters as long as the next character does not
|
# which continues to consume characters as long as the next character does not
|
||||||
@ -26,7 +27,7 @@ import unicodedata
|
|||||||
# correct behavior for word-wrapping, but it's an ugly failure mode for NLP
|
# correct behavior for word-wrapping, but it's an ugly failure mode for NLP
|
||||||
# tokenization.
|
# tokenization.
|
||||||
|
|
||||||
TOKEN_RE = regex.compile(r'[\S--\p{punct}](?:\B\S|[\p{IsIdeo}\p{Script=Hiragana}])*', regex.V1 | regex.WORD)
|
TOKEN_RE = regex.compile(r'[\S--[\p{punct}\p{Sm}\p{Sc}\p{Sk}]](?:\B\S|[\p{IsIdeo}\p{Script=Hiragana}])*', regex.V1 | regex.WORD)
|
||||||
ARABIC_MARK_RE = regex.compile(r'[[\p{Mn}&&\p{Block=Arabic}]\N{ARABIC TATWEEL}]', regex.V1)
|
ARABIC_MARK_RE = regex.compile(r'[[\p{Mn}&&\p{Block=Arabic}]\N{ARABIC TATWEEL}]', regex.V1)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user