From 5a1fc00aaabd8d121a9f1e71ac6224141c8badfe Mon Sep 17 00:00:00 2001
From: Rob Speer
Date: Tue, 25 Aug 2015 12:41:48 -0400
Subject: [PATCH] Strip apostrophes from edges of tokens

The issue here is that French text with an elision, such as "d'un",
would be tokenized into "d'" and "un", but re-tokenizing "d'" would
produce "d". Stripping apostrophes from the edges of tokens brings the
process closer to being idempotent.

This also removes URLs from each line before counting tokens in
wordfreq_builder.
---
 wordfreq/tokens.py                               | 2 +-
 wordfreq_builder/tests/test_tokenizer.py         | 2 +-
 wordfreq_builder/wordfreq_builder/word_counts.py | 5 +++++
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/wordfreq/tokens.py b/wordfreq/tokens.py
index 9fefdc8..dc540ac 100644
--- a/wordfreq/tokens.py
+++ b/wordfreq/tokens.py
@@ -49,7 +49,7 @@ def simple_tokenize(text):
     relatively untokenized.
     """
     text = unicodedata.normalize('NFC', text)
-    return [token.casefold() for token in TOKEN_RE.findall(text)]
+    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
 
 
 def remove_arabic_marks(text):
diff --git a/wordfreq_builder/tests/test_tokenizer.py b/wordfreq_builder/tests/test_tokenizer.py
index a26feab..2fbc477 100644
--- a/wordfreq_builder/tests/test_tokenizer.py
+++ b/wordfreq_builder/tests/test_tokenizer.py
@@ -6,7 +6,7 @@ def test_tokenizer_1():
     text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
     tokens = [
         'this', 'is', 'a', 'test', 'she', 'said',
-        'and', "i'll", 'bet', "y'all", '3.50', 'that',
+        'and', "i'll", 'bet', 'y', 'all', '3.50', 'that',
         'it', "won't", 'fail',
     ]
     result = cld2_surface_tokenizer(text)
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index 5127108..55eff3d 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -6,6 +6,10 @@ import math
 import csv
 import msgpack
 import gzip
+import regex
+
+
+URL_RE = regex.compile(r'https?://\S+')
 
 
 def count_tokens(filename):
@@ -18,6 +22,7 @@ def count_tokens(filename):
     counts = defaultdict(int)
     with open(filename, encoding='utf-8', errors='replace') as infile:
         for line in infile:
+            line = URL_RE.sub('', line.strip())
             for token in simple_tokenize(line):
                 counts[token] += 1
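
A minimal sketch of the idempotence problem being fixed, assuming the
patched wordfreq package is importable; the token lists for "d'un" follow
the splits described in the commit message rather than verified output:

    from wordfreq.tokens import simple_tokenize

    # Before the patch (per the commit message), the French elision kept
    # its trailing apostrophe on the first token:
    #     simple_tokenize("d'un")  -> ["d'", 'un']
    # and re-tokenizing that first token dropped the apostrophe:
    #     simple_tokenize("d'")    -> ['d']
    # so tokenizing twice disagreed with tokenizing once.

    # After the patch, the edge apostrophe is stripped on the first pass,
    # making the output a fixed point of the tokenizer:
    tokens = simple_tokenize("d'un")                     # ['d', 'un']
    assert simple_tokenize(' '.join(tokens)) == tokens   # idempotent now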
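
Similarly, a quick usage sketch of the URL stripping added to
count_tokens; the input line here is invented for illustration:

    import regex

    # The same pattern the patch adds to word_counts.py: an http(s)
    # scheme followed by non-whitespace characters.
    URL_RE = regex.compile(r'https?://\S+')

    line = 'wordfreq is documented at https://example.com/wordfreq \n'
    print(URL_RE.sub('', line.strip()))
    # -> 'wordfreq is documented at '
    # The URL is removed before simple_tokenize ever sees the line.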