diff --git a/wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py b/wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py
new file mode 100644
index 0000000..c93af67
--- /dev/null
+++ b/wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py
@@ -0,0 +1,14 @@
+from wordfreq_builder.tokenizers import cld2_reddit_tokenizer, tokenize_by_language
+import argparse
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('filename', help='filename of input file containing one comment per line')
+    parser.add_argument('outprefix', help='prefix of output filenames')
+    args = parser.parse_args()
+    tokenize_by_language(args.filename, args.outprefix, tokenizer=cld2_reddit_tokenizer)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py b/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py
index 879caa4..d144866 100644
--- a/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py
+++ b/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py
@@ -1,4 +1,4 @@
-from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_twitter
+from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_by_language
 import argparse
 
 
@@ -7,7 +7,7 @@ def main():
     parser.add_argument('filename', help='filename of input file containing one tweet per line')
     parser.add_argument('outprefix', help='prefix of output filenames')
     args = parser.parse_args()
-    tokenize_twitter(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
+    tokenize_by_language(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
 
 
 if __name__ == '__main__':
diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py
index 7d18026..af5d115 100644
--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@@ -22,6 +22,8 @@ CLD2_BAD_CHARS_RE = regex.compile(CLD2_BAD_CHAR_RANGE)
 
 TWITTER_HANDLE_RE = regex.compile(r'@[\S--\p{punct}]+')
 TCO_RE = regex.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
+URL_RE = regex.compile(r'http(?:s)?://[^) ]*')
+MARKDOWN_URL_RESIDUE_RE = regex.compile(r'\]\(\)')
 
 
 def cld2_surface_tokenizer(text):
@@ -31,6 +33,7 @@
     text = unescape_html(text)
     text = TWITTER_HANDLE_RE.sub('', text)
     text = TCO_RE.sub('', text)
+
     lang = cld2_detect_language(text)
 
     # Don't allow tokenization in Chinese when language-detecting, because
@@ -42,6 +45,26 @@
     return lang, tokens
 
 
+# Low-frequency languages tend to be detected incorrectly. Keep a limited
+# list of languages we're allowed to use here.
+KEEP_THESE_LANGUAGES = {
+    'ar', 'de', 'el', 'en', 'es', 'fr', 'hr', 'id', 'ja', 'ko', 'ms', 'nl',
+    'pl', 'pt', 'ro', 'ru', 'sv', 'th'
+}
+
+
+def cld2_reddit_tokenizer(text):
+    text = URL_RE.sub('', text)
+    text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
+
+    lang = cld2_detect_language(text)
+    if lang not in KEEP_THESE_LANGUAGES:
+        lang = 'en'
+
+    tokens = tokenize(text, lang, include_punctuation=True)
+    return lang, tokens
+
+
 def cld2_detect_language(text):
     """
     Uses CLD2 to detect the language.
@@ -59,11 +82,9 @@
     return pycld2.detect(text)[2][0][1]
 
 
-def tokenize_twitter(in_filename, out_prefix, tokenizer):
+def tokenize_by_language(in_filename, out_prefix, tokenizer):
     """
-    Process a file by running it through the Twitter-specific tokenizer,
-    which uses cld2 for language detection, and removes Twitter handles
-    and t.co URLs.
+    Process a file by running it through a given tokenizer.
 
     Produces output files that are separated by language, with newlines
     between the tokens.
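
Usage note (not part of the patch): the sketch below shows how the two pieces introduced here fit together. It is based only on the signatures and regexes visible in this diff; the sample comment, file names, and the exact per-language output naming are illustrative assumptions.

from wordfreq_builder.tokenizers import cld2_reddit_tokenizer, tokenize_by_language

# Single-string use: cld2_reddit_tokenizer strips bare URLs, collapses the
# ']()' residue that Markdown links leave behind, detects the language with
# CLD2, and falls back to 'en' for anything outside KEEP_THESE_LANGUAGES.
lang, tokens = cld2_reddit_tokenizer('See [the docs](http://example.com/page) for details.')
# After URL removal the text reads 'See [the docs] for details.'; the tokens
# include punctuation because the tokenizer passes include_punctuation=True.

# Batch use, as wired up in cli/tokenize_reddit.py: one Reddit comment per
# input line, producing output files separated by language under the given
# prefix (e.g. 'reddit.en.txt' -- the exact naming scheme is an assumption),
# with newlines between the tokens.
tokenize_by_language('comments.txt', 'reddit', tokenizer=cld2_reddit_tokenizer)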