Strip apostrophes from edges of tokens

The issue here is that French text with an apostrophe, such as "d'un",
was split into "d'" and "un"; but if "d'" was then re-tokenized on its
own, it came out as "d", so tokenizing a second time gave a different
result than tokenizing once. Stripping apostrophes from the edges of
tokens makes the process closer to idempotent.
Rob Speer 2015-08-25 12:41:48 -04:00
parent a8e7c29068
commit 5a1fc00aaa
3 changed files with 7 additions and 2 deletions
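
At the token level, the fix is easy to check in plain Python (the sample
tokens here are illustrative, not from the commit): strip("'") removes
apostrophes only at a token's edges, leaves internal contractions alone,
and applying it twice changes nothing.

    >>> [t.strip("'").casefold() for t in ["d'", "won't", "'tis"]]
    ['d', "won't", 'tis']
    >>> "d'".strip("'").strip("'")  # stripping twice == stripping once
    'd'

Note the trade-off visible in the third token: a genuinely leading
apostrophe, as in "'tis", is stripped as well.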


@@ -49,7 +49,7 @@ def simple_tokenize(text):
     relatively untokenized.
     """
     text = unicodedata.normalize('NFC', text)
-    return [token.casefold() for token in TOKEN_RE.findall(text)]
+    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
 
 
 def remove_arabic_marks(text):


@@ -6,7 +6,7 @@ def test_tokenizer_1():
     text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
     tokens = [
         'this', 'is', 'a', 'test', 'she', 'said',
-        'and', "i'll", 'bet', "y'all", '3', '50', 'that',
+        'and', "i'll", 'bet', "y", "all", '3.50', 'that',
         'it', "won't", 'fail',
     ]
     result = cld2_surface_tokenizer(text)
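
To see where these expected tokens come from, the tokenizer can be run by
hand. A minimal sketch, assuming wordfreq_builder and the cld2 bindings it
depends on are installed; the import path is my assumption about the
repository layout:

    # Assumed import path -- adjust to the actual module layout.
    from wordfreq_builder.tokenizers import cld2_surface_tokenizer

    text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
    print(cld2_surface_tokenizer(text))  # compare against the expected tokens above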


@@ -6,6 +6,10 @@ import math
 import csv
 import msgpack
 import gzip
+import regex
+
+URL_RE = regex.compile(r'https?://(?:\B\S)+')
+
 
 
 def count_tokens(filename):
@@ -18,6 +22,7 @@ def count_tokens(filename):
     counts = defaultdict(int)
     with open(filename, encoding='utf-8', errors='replace') as infile:
         for line in infile:
+            line = URL_RE.sub('', line.strip())
             for token in simple_tokenize(line):
                 counts[token] += 1
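
Putting the pieces together, here is a self-contained sketch of the
counting pipeline after this commit. TOKEN_RE below is a simplified
stand-in (the real pattern is defined elsewhere in wordfreq and is not
part of this diff); URL_RE is the pattern the commit adds, and regex is
the third-party module of that name:

    from collections import defaultdict
    import unicodedata
    import regex  # third-party: pip install regex

    TOKEN_RE = regex.compile(r"[\w']+")            # stand-in, for illustration only
    URL_RE = regex.compile(r'https?://(?:\B\S)+')  # as added by this commit

    def simple_tokenize(text):
        # Same steps as the patched function above: NFC-normalize, find
        # tokens, strip edge apostrophes, casefold.
        text = unicodedata.normalize('NFC', text)
        return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]

    def count_tokens(filename):
        counts = defaultdict(int)
        with open(filename, encoding='utf-8', errors='replace') as infile:
            for line in infile:
                # Delete whatever URL_RE matches before counting tokens.
                line = URL_RE.sub('', line.strip())
                for token in simple_tokenize(line):
                    counts[token] += 1
        return counts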