mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-24 09:51:38 +00:00
un-flake wordfreq_builder.tokenizers, and edit docstrings
Former-commit-id: a893823d6e
This commit is contained in:
parent
769d8c627c
commit
3a140ee02f
@ -1,4 +1,3 @@
|
|||||||
from html.entities import name2codepoint
|
|
||||||
from wordfreq import tokenize
|
from wordfreq import tokenize
|
||||||
from ftfy.fixes import unescape_html
|
from ftfy.fixes import unescape_html
|
||||||
import regex
|
import regex
|
||||||
@ -26,7 +25,7 @@ TCO_RE = regex.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
|
|||||||
|
|
||||||
def cld2_surface_tokenizer(text):
|
def cld2_surface_tokenizer(text):
|
||||||
"""
|
"""
|
||||||
Uses CLD2 to detect the language and wordfreq tokenizer to create tokens
|
Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
|
||||||
"""
|
"""
|
||||||
text = unescape_html(text)
|
text = unescape_html(text)
|
||||||
text = TWITTER_HANDLE_RE.sub('', text)
|
text = TWITTER_HANDLE_RE.sub('', text)
|
||||||
@ -38,7 +37,7 @@ def cld2_surface_tokenizer(text):
|
|||||||
|
|
||||||
def cld2_detect_language(text):
|
def cld2_detect_language(text):
|
||||||
"""
|
"""
|
||||||
Uses CLD2 to detect the language
|
Uses CLD2 to detect the language.
|
||||||
"""
|
"""
|
||||||
# Format of pycld2.detect:
|
# Format of pycld2.detect:
|
||||||
# (Confident in result: bool,
|
# (Confident in result: bool,
|
||||||
@ -55,9 +54,12 @@ def cld2_detect_language(text):
|
|||||||
|
|
||||||
def tokenize_twitter(in_filename, out_prefix, tokenizer):
|
def tokenize_twitter(in_filename, out_prefix, tokenizer):
|
||||||
"""
|
"""
|
||||||
Process a file by running it through the given tokenizer, sorting the
|
Process a file by running it through the Twitter-specific tokenizer,
|
||||||
results by the language of each line, and inserting newlines
|
which uses cld2 for language detection, and removes Twitter handles
|
||||||
to mark the token boundaries.
|
and t.co URLs.
|
||||||
|
|
||||||
|
Produces output files that are separated by language, with newlines
|
||||||
|
between the tokens.
|
||||||
"""
|
"""
|
||||||
out_files = {}
|
out_files = {}
|
||||||
with open(in_filename, encoding='utf-8') as in_file:
|
with open(in_filename, encoding='utf-8') as in_file:
|
||||||
|
Loading…
Reference in New Issue
Block a user