mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-24 09:51:38 +00:00
use better regexes in wordfreq_builder tokenizer
This commit is contained in:
parent
554455699d
commit
de73888a76
@ -1,7 +1,7 @@
|
|||||||
from html.entities import name2codepoint
|
from html.entities import name2codepoint
|
||||||
from wordfreq import tokenize, TOKEN_RE, NON_PUNCT_RANGE
|
from wordfreq import tokenize
|
||||||
from ftfy.fixes import unescape_html
|
from ftfy.fixes import unescape_html
|
||||||
import re
|
import regex
|
||||||
import pycld2
|
import pycld2
|
||||||
|
|
||||||
CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
|
CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
|
||||||
@ -18,10 +18,10 @@ CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
|
|||||||
] +
|
] +
|
||||||
[chr(65534+65536*x+y) for x in range(17) for y in range(2)]
|
[chr(65534+65536*x+y) for x in range(17) for y in range(2)]
|
||||||
)
|
)
|
||||||
CLD2_BAD_CHARS_RE = re.compile(CLD2_BAD_CHAR_RANGE)
|
CLD2_BAD_CHARS_RE = regex.compile(CLD2_BAD_CHAR_RANGE)
|
||||||
|
|
||||||
TWITTER_HANDLE_RE = re.compile('@{0}+'.format(NON_PUNCT_RANGE))
|
TWITTER_HANDLE_RE = regex.compile(r'@[\S--\p{punct}]+')
|
||||||
TCO_RE = re.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
|
TCO_RE = regex.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
|
||||||
|
|
||||||
|
|
||||||
def cld2_surface_tokenizer(text):
|
def cld2_surface_tokenizer(text):
|
||||||
|
Loading…
Reference in New Issue
Block a user