From af29fc4f885bf635eebcd68002dba735f61824d2 Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Wed, 26 Aug 2015 15:00:46 -0400 Subject: [PATCH] fix URL expression Former-commit-id: c4a25942170825b91f0dd2a23ce04ce3b502b9cd --- wordfreq_builder/tests/test_urls.py | 20 +++++++++++++++++++ .../wordfreq_builder/word_counts.py | 7 +++++-- 2 files changed, 25 insertions(+), 2 deletions(-) create mode 100644 wordfreq_builder/tests/test_urls.py diff --git a/wordfreq_builder/tests/test_urls.py b/wordfreq_builder/tests/test_urls.py new file mode 100644 index 0000000..688a0b8 --- /dev/null +++ b/wordfreq_builder/tests/test_urls.py @@ -0,0 +1,20 @@ +from wordfreq_builder.word_counts import URL_RE +from nose.tools import eq_ + + +def check_url(url): + match = URL_RE.match(url) + assert match + eq_(match.span(), (0, len(url))) + + +def test_url_re(): + # URLs like this are all over the Arabic Wikipedia. Here's one with the + # student ID blanked out. + yield check_url, 'http://www.ju.edu.jo/alumnicard/0000000.aspx' + + yield check_url, 'https://example.com/űnicode.html' + yield check_url, 'http://☃.net' + + assert not URL_RE.match('ftp://127.0.0.1') + diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py index 55eff3d..9da95a3 100644 --- a/wordfreq_builder/wordfreq_builder/word_counts.py +++ b/wordfreq_builder/wordfreq_builder/word_counts.py @@ -9,7 +9,9 @@ import gzip import regex -URL_RE = regex.compile(r'https?://(?:\B\S)+') +# Match common cases of URLs: the scheme http:// or https:// followed by +# non-whitespace characters. +URL_RE = regex.compile(r'https?://(?:\S)+') def count_tokens(filename): @@ -17,7 +19,8 @@ def count_tokens(filename): Count tokens that appear in a file, running each line through our simple tokenizer. - Unicode errors in the input data will become token boundaries. + URLs will be skipped, and Unicode errors will become separate tokens + containing '�'. 
""" counts = defaultdict(int) with open(filename, encoding='utf-8', errors='replace') as infile: