Strip apostrophes from edges of tokens
The issue here is that French text with an apostrophe, such as "d'un",
would be split into "d'" and "un", but if "d'" were tokenized again it
would come out as just "d". Stripping apostrophes from the edges of
tokens makes the process more nearly idempotent.
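
As a quick illustration (not wordfreq code, just the underlying str.strip
behaviour): apostrophes are removed only from the edges of a token, so a
dangling "d'" collapses to "d" while contractions such as "won't" keep
their internal apostrophe, and re-tokenizing the output no longer changes it.

    # Illustration only: strip("'") removes edge apostrophes; internal ones survive.
    for token in ["d'", "won't", "i'll", "y'all"]:
        print(token, '->', token.strip("'"))
    # d' -> d
    # won't -> won't
    # i'll -> i'll
    # y'all -> y'all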
Former-commit-id: 5a1fc00aaa
This commit is contained in:
parent 0b282c5055
commit b22a4b0f02
@@ -49,7 +49,7 @@ def simple_tokenize(text):
         relatively untokenized.
     """
     text = unicodedata.normalize('NFC', text)
-    return [token.casefold() for token in TOKEN_RE.findall(text)]
+    return [token.strip("'").casefold() for token in TOKEN_RE.findall(text)]
 
 
 def remove_arabic_marks(text):
@@ -6,7 +6,7 @@ def test_tokenizer_1():
     text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
     tokens = [
         'this', 'is', 'a', 'test', 'she', 'said',
-        'and', "i'll", 'bet', "y'all", '3', '50', 'that',
+        'and', "i'll", 'bet', "y", "all", '3.50', 'that',
         'it', "won't", 'fail',
     ]
     result = cld2_surface_tokenizer(text)
@@ -6,6 +6,10 @@ import math
 import csv
 import msgpack
 import gzip
+import regex
 
 
+URL_RE = regex.compile(r'https?://(?:\B\S)+')
+
+
 def count_tokens(filename):
@@ -18,6 +22,7 @@ def count_tokens(filename):
     counts = defaultdict(int)
     with open(filename, encoding='utf-8', errors='replace') as infile:
        for line in infile:
+           line = URL_RE.sub('', line.strip())
            for token in simple_tokenize(line):
                counts[token] += 1
 
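
For context, a minimal sketch (not the builder's real code) of the loop this
hunk modifies: each line is stripped, anything URL_RE matches is removed, and
the remaining tokens are tallied in a defaultdict. The toy_count_tokens name
and the whitespace tokenizer are stand-ins for illustration.

    # Toy stand-in for count_tokens' counting loop (names are hypothetical).
    from collections import defaultdict
    import regex

    URL_RE = regex.compile(r'https?://(?:\B\S)+')   # pattern as added above

    def toy_count_tokens(lines, tokenize):
        counts = defaultdict(int)
        for line in lines:
            line = URL_RE.sub('', line.strip())   # drop whatever URL_RE matches
            for token in tokenize(line):
                counts[token] += 1
        return counts

    print(dict(toy_count_tokens(["hello world", "hello again"], str.split)))
    # {'hello': 2, 'world': 1, 'again': 1}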