mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
un-flake wordfreq_builder.tokenizers, and edit docstrings
Former-commit-id: a893823d6e
This commit is contained in:
parent
769d8c627c
commit
3a140ee02f
@ -1,4 +1,3 @@
|
||||
from html.entities import name2codepoint
|
||||
from wordfreq import tokenize
|
||||
from ftfy.fixes import unescape_html
|
||||
import regex
|
||||
@ -26,7 +25,7 @@ TCO_RE = regex.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
|
||||
|
||||
def cld2_surface_tokenizer(text):
|
||||
"""
|
||||
Uses CLD2 to detect the language and wordfreq tokenizer to create tokens
|
||||
Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
|
||||
"""
|
||||
text = unescape_html(text)
|
||||
text = TWITTER_HANDLE_RE.sub('', text)
|
||||
@ -38,7 +37,7 @@ def cld2_surface_tokenizer(text):
|
||||
|
||||
def cld2_detect_language(text):
|
||||
"""
|
||||
Uses CLD2 to detect the language
|
||||
Uses CLD2 to detect the language.
|
||||
"""
|
||||
# Format of pycld2.detect:
|
||||
# (Confident in result: bool,
|
||||
@ -55,9 +54,12 @@ def cld2_detect_language(text):
|
||||
|
||||
def tokenize_twitter(in_filename, out_prefix, tokenizer):
|
||||
"""
|
||||
Process a file by running it through the given tokenizer, sorting the
|
||||
results by the language of each line, and inserting newlines
|
||||
to mark the token boundaries.
|
||||
Process a file by running it through the Twitter-specific tokenizer,
|
||||
which uses cld2 for language detection, and removes Twitter handles
|
||||
and t.co URLs.
|
||||
|
||||
Produces output files that are separated by language, with newlines
|
||||
between the tokens.
|
||||
"""
|
||||
out_files = {}
|
||||
with open(in_filename, encoding='utf-8') as in_file:
|
||||
|
Loading…
Reference in New Issue
Block a user