mirror of https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00

parent e463397edf
commit af29fc4f88

wordfreq_builder/tests/test_urls.py (new file, 20 lines added)
@@ -0,0 +1,20 @@
+from wordfreq_builder.word_counts import URL_RE
+from nose.tools import eq_
+
+
+def check_url(url):
+    match = URL_RE.match(url)
+    assert match
+    eq_(match.span(), (0, len(url)))
+
+
+def test_url_re():
+    # URLs like this are all over the Arabic Wikipedia. Here's one with the
+    # student ID blanked out.
+    yield check_url, 'http://www.ju.edu.jo/alumnicard/0000000.aspx'
+
+    yield check_url, 'https://example.com/űnicode.html'
+    yield check_url, 'http://☃.net'
+
+    assert not URL_RE.match('ftp://127.0.0.1')
+
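As a side note (not part of this commit): the tests above use nose's generator-test style, where each `yield check_url, url` line is collected as a separate test case. Modern pytest no longer collects yield-style tests, so a roughly equivalent parametrized sketch would look like this:

import pytest
from wordfreq_builder.word_counts import URL_RE


@pytest.mark.parametrize('url', [
    'http://www.ju.edu.jo/alumnicard/0000000.aspx',
    'https://example.com/űnicode.html',
    'http://☃.net',
])
def test_url_re_matches_whole_url(url):
    # Same checks as check_url(): the pattern must match, and the match
    # must cover the entire string.
    match = URL_RE.match(url)
    assert match is not None
    assert match.span() == (0, len(url))


def test_url_re_rejects_other_schemes():
    # URL_RE is anchored on http/https, so other schemes should not match.
    assert URL_RE.match('ftp://127.0.0.1') is None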
wordfreq_builder/word_counts.py

@@ -9,7 +9,9 @@ import gzip
 import regex
 
 
-URL_RE = regex.compile(r'https?://(?:\B\S)+')
+# Match common cases of URLs: the schema http:// or https:// followed by
+# non-whitespace characters.
+URL_RE = regex.compile(r'https?://(?:\S)+')
 
 
 def count_tokens(filename):
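For illustration (not part of the diff), the behavioural difference between the two patterns: `(?:\B\S)+` demands a non-word-boundary before every character it consumes, and the position right after `//` in an ordinary URL (between `/` and a letter) is a word boundary, so the old pattern cannot match URLs like the ones in the new tests:

import regex

OLD_URL_RE = regex.compile(r'https?://(?:\B\S)+')  # pattern before this commit
NEW_URL_RE = regex.compile(r'https?://(?:\S)+')    # pattern after this commit

url = 'http://www.ju.edu.jo/alumnicard/0000000.aspx'

# \B fails between '/' and 'w', so the '+' group never gets started.
print(OLD_URL_RE.match(url))           # None
# \S+ simply takes every non-whitespace character after the schema.
print(NEW_URL_RE.match(url).span())    # (0, len(url)), i.e. the whole URL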
@@ -17,7 +19,8 @@ def count_tokens(filename):
     Count tokens that appear in a file, running each line through our
     simple tokenizer.
 
-    Unicode errors in the input data will become token boundaries.
+    URLs will be skipped, and Unicode errors will become separate tokens
+    containing '�'.
     """
     counts = defaultdict(int)
     with open(filename, encoding='utf-8', errors='replace') as infile:
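A minimal sketch (assumed, not the real function body, which this diff truncates) of the two behaviours the updated docstring promises, with a plain whitespace split standing in for the project's tokenizer:

from collections import defaultdict

import regex

URL_RE = regex.compile(r'https?://(?:\S)+')


def sketch_count_tokens(filename):
    counts = defaultdict(int)
    # errors='replace' turns undecodable bytes into U+FFFD ('�'), so bad input
    # shows up as replacement characters in the tokens instead of raising.
    with open(filename, encoding='utf-8', errors='replace') as infile:
        for line in infile:
            # Strip URLs first so they never reach the token counts.
            line = URL_RE.sub('', line)
            for token in line.split():  # stand-in for the real tokenizer
                counts[token] += 1
    return counts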