Strip apostrophes from edges of tokens

The issue here is that French text containing an apostrophe, such as
"d'un", was split into "d'" and "un", but re-tokenizing "d'" would
produce just "d". Stripping apostrophes from the edges of tokens makes
tokenization idempotent: every token it outputs tokenizes back to itself.
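
A minimal sketch of the failure, using a hypothetical stand-in pattern
(the real TOKEN_RE lives in wordfreq's tokenizer) that reproduces the
edge-apostrophe behavior described above:

    import re

    # Hypothetical stand-in for TOKEN_RE: a word, optionally keeping a
    # trailing apostrophe when another word follows it directly.
    TOKEN_RE = re.compile(r"\w+(?:'(?=\w))?")

    def tokenize_old(text):
        return [t.casefold() for t in TOKEN_RE.findall(text)]

    def tokenize_new(text):
        return [t.strip("'").casefold() for t in TOKEN_RE.findall(text)]

    print(tokenize_old("d'un"))   # ["d'", 'un']
    print(tokenize_old("d'"))     # ['d']  -- "d'" does not survive a round trip
    print(tokenize_new("d'un"))   # ['d', 'un']
    print(tokenize_new("d"))      # ['d']  -- every output token re-tokenizes to itself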


Former-commit-id: 5a1fc00aaa
Robyn Speer 2015-08-25 12:41:48 -04:00
parent 0b282c5055
commit b22a4b0f02
3 changed files with 7 additions and 2 deletions


@@ -49,7 +49,7 @@ def simple_tokenize(text):
     relatively untokenized.
     """
     text = unicodedata.normalize('NFC', text)
-    return [token.casefold() for token in TOKEN_RE.findall(text)]
+    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
 
 
 def remove_arabic_marks(text):
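
For reference, str.strip("'") removes apostrophes only at the edges of a
token; interior apostrophes, as in contractions, are untouched (so the
y/all split in the test below comes from how cld2_surface_tokenizer
divides the text, not from this strip):

    for token in ["d'", "'tis", "won't", "y'all"]:
        print(token, '->', token.strip("'"))
    # d' -> d
    # 'tis -> tis
    # won't -> won't
    # y'all -> y'all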


@@ -6,7 +6,7 @@ def test_tokenizer_1():
     text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
     tokens = [
         'this', 'is', 'a', 'test', 'she', 'said',
-        'and', "i'll", 'bet', "y'all", '3', '50', 'that',
+        'and', "i'll", 'bet', "y", "all", '3.50', 'that',
         'it', "won't", 'fail',
     ]
     result = cld2_surface_tokenizer(text)


@@ -6,6 +6,10 @@ import math
 import csv
 import msgpack
 import gzip
+import regex
+
+URL_RE = regex.compile(r'https?://(?:\B\S)+')
+
 def count_tokens(filename):
@@ -18,6 +22,7 @@ def count_tokens(filename):
     counts = defaultdict(int)
     with open(filename, encoding='utf-8', errors='replace') as infile:
         for line in infile:
+            line = URL_RE.sub('', line.strip())
             for token in simple_tokenize(line):
                 counts[token] += 1
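
A minimal sketch of the effect of the new URL scrub on one input line;
the plain \S+ tail here is a simplification of the pattern in the diff,
and the stdlib re module stands in for the third-party regex package:

    import re

    URL_RE = re.compile(r'https?://\S+')  # assumption: simplified URL pattern

    line = 'see http://example.com/page for details\n'
    print(URL_RE.sub('', line.strip()))
    # prints "see  for details" -- the URL is gone before simple_tokenize sees the line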