fix URL expression

This commit is contained in:
Rob Speer 2015-08-26 15:00:46 -04:00
parent f7babea352
commit c4a2594217
2 changed files with 25 additions and 2 deletions

View File

@@ -0,0 +1,20 @@
from wordfreq_builder.word_counts import URL_RE
from nose.tools import eq_
def check_url(url):
    """Assert that URL_RE matches *url* and that the match spans the whole string."""
    m = URL_RE.match(url)
    # A match anchored at position 0 must exist...
    assert m is not None
    # ...and must consume every character of the URL, not just a prefix.
    eq_(m.span(), (0, len(url)))
def test_url_re():
    """Nose generator test: URL_RE matches full http/https URLs, rejects other schemes."""
    # URLs like this are all over the Arabic Wikipedia. Here's one with the
    # student ID blanked out.
    urls = [
        'http://www.ju.edu.jo/alumnicard/0000000.aspx',
        'https://example.com/űnicode.html',
        'http://☃.net',
    ]
    for url in urls:
        yield check_url, url
    # The pattern only covers http/https, so an ftp URL must not match at all.
    assert not URL_RE.match('ftp://127.0.0.1')

View File

@@ -9,7 +9,8 @@ import gzip
import regex
# Match common cases of URLs: the schema http:// or https:// followed by
# non-whitespace characters. (The previous pattern, r'https?://(?:\B\S)+',
# was dropped: its \B anchor broke matching of many real-world URLs, and the
# old assignment was immediately overwritten — dead code.)
URL_RE = regex.compile(r'https?://(?:\S)+')
def count_tokens(filename):
@@ -17,7 +19,8 @@ def count_tokens(filename):
Count tokens that appear in a file, running each line through our
simple tokenizer.
Unicode errors in the input data will become token boundaries.
URLs will be skipped, and Unicode errors will become separate tokens
containing '<EFBFBD>'.
"""
counts = defaultdict(int)
with open(filename, encoding='utf-8', errors='replace') as infile: