Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-24 01:41:39 +00:00
Strip apostrophes from edges of tokens
The issue here is that French text with an apostrophe, such as "d'un", was split into "d'" and "un"; but if "d'" were then re-tokenized on its own, it would come out as "d". Stripping apostrophes from the edges of tokens makes the process idempotent, so re-tokenizing a token yields the same token.
This commit is contained in:
parent a8e7c29068
commit 5a1fc00aaa
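As a rough illustration of the fix (not part of the commit): str.strip("'") removes apostrophes only at the edges of a token, so a token that was cut off at an apostrophe and its re-tokenized form normalize to the same string, while interior apostrophes are left alone.

for token in ["d'", "d", "l'", "won't", "'quoted'"]:
    print(token, '->', token.strip("'").casefold())
# d' -> d
# d -> d
# l' -> l
# won't -> won't
# 'quoted' -> quoted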
@@ -49,7 +49,7 @@ def simple_tokenize(text):
     relatively untokenized.
     """
     text = unicodedata.normalize('NFC', text)
-    return [token.casefold() for token in TOKEN_RE.findall(text)]
+    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
 
 
 def remove_arabic_marks(text):
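A minimal sketch of why the new line makes tokenization idempotent; raw_tokens here is a hypothetical stand-in for what TOKEN_RE.findall(text) might return, not the output of the real regex:

def normalize(tokens):
    # The per-token normalization added above.
    return [token.strip("'").casefold() for token in tokens]

# Hypothetical raw matches for text like "D'un côté", where the tokenizer
# leaves a dangling apostrophe on the edge of a token.
raw_tokens = ["D'", 'un', 'côté']

once = normalize(raw_tokens)
print(once)                     # ['d', 'un', 'côté']
print(normalize(once) == once)  # True -- a second pass changes nothing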
@@ -6,7 +6,7 @@ def test_tokenizer_1():
     text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
     tokens = [
         'this', 'is', 'a', 'test', 'she', 'said',
-        'and', "i'll", 'bet', "y'all", '3', '50', 'that',
+        'and', "i'll", 'bet', "y", "all", '3.50', 'that',
         'it', "won't", 'fail',
     ]
     result = cld2_surface_tokenizer(text)
@@ -6,6 +6,10 @@ import math
 import csv
 import msgpack
 import gzip
+import regex
 
 
+URL_RE = regex.compile(r'https?://(?:\B\S)+')
+
+
 def count_tokens(filename):
@@ -18,6 +22,7 @@ def count_tokens(filename):
     counts = defaultdict(int)
     with open(filename, encoding='utf-8', errors='replace') as infile:
         for line in infile:
+            line = URL_RE.sub('', line.strip())
             for token in simple_tokenize(line):
                 counts[token] += 1
 
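For context, a minimal sketch of the counting pattern this hunk extends; simple_tokenize is stubbed with a trivial whitespace tokenizer (the real one is the function changed above), and the URL-stripping step is left out of the sketch:

from collections import defaultdict

def simple_tokenize(line):
    # Trivial stand-in for wordfreq's simple_tokenize, for illustration only.
    return [token.strip("'").casefold() for token in line.split()]

def count_tokens_in_lines(lines):
    # Same shape as count_tokens above, but reading from an iterable of lines
    # instead of a file.
    counts = defaultdict(int)
    for line in lines:
        for token in simple_tokenize(line.strip()):
            counts[token] += 1
    return counts

print(dict(count_tokens_in_lines(["un test", "un autre test"])))
# {'un': 2, 'test': 2, 'autre': 1}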