mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 09:21:37 +00:00)
fix documentation in wordfreq_builder.tokenizers
Former-commit-id: 8ddc19a5ca
parent 95cdf41fe8
commit 6eca3cff5a
@@ -54,11 +54,17 @@ KEEP_THESE_LANGUAGES = {
 def cld2_reddit_tokenizer(text):
+    """
+    A language-detecting tokenizer with special cases for handling text from
+    Reddit.
+    """
     text = URL_RE.sub('', text)
     text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
 
     lang = cld2_detect_language(text)
     if lang not in KEEP_THESE_LANGUAGES:
+        # Reddit is 99.9% English, so if we detected a rare language, it's
+        # much more likely that it's actually English.
         lang = 'en'
 
     tokens = tokenize(text, lang, include_punctuation=True)
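The two substitutions above are ordered deliberately: removing the URL from a Markdown link like [text](https://...) leaves the residue [text](), which the second pattern collapses back to [text]. A minimal sketch of that behavior, assuming pattern definitions chosen only to be consistent with the substitutions in the diff (the real module may define them differently):

import re

# Assumed patterns, not copied from wordfreq_builder.
URL_RE = re.compile(r'https?://[^) ]*')          # URL up to a space or ')'
MARKDOWN_URL_RESIDUE_RE = re.compile(r'\]\(\)')  # the ']()' left behind

text = 'see [this post](https://example.com/x) please'
text = URL_RE.sub('', text)                      # 'see [this post]() please'
text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)    # 'see [this post] please'
print(text)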
@@ -86,7 +92,7 @@ def tokenize_by_language(in_filename, out_prefix, tokenizer):
     """
     Process a file by running it through a given tokenizer.
 
-    Produces output files that are separated by language, with newlines
+    Produces output files that are separated by language, with spaces
     between the tokens.
     """
     out_files = {}
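The docstring fix matters because each per-language output file puts spaces, not newlines, between the tokens of a line. A minimal sketch of the documented behavior, assuming the tokenizer returns a (language, tokens) pair and that input lines carry their text in the last tab-separated field (both assumptions, not shown in this diff):

def tokenize_by_language_sketch(in_filename, out_prefix, tokenizer):
    # Route each tokenized line to a file named after its language,
    # writing the tokens space-separated on one line.
    out_files = {}
    with open(in_filename, encoding='utf-8') as in_file:
        for line in in_file:
            text = line.split('\t')[-1].strip()  # assumed input layout
            language, tokens = tokenizer(text)   # assumed return shape
            if language not in out_files:
                out_files[language] = open(
                    '%s.%s.txt' % (out_prefix, language), 'w', encoding='utf-8'
                )
            out_files[language].write(' '.join(tokens) + '\n')
    for out_file in out_files.values():
        out_file.close()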