add tokenizer for Reddit

Rob Speer 2015-11-30 18:16:54 -05:00
parent 2dcf368481
commit 5ef807117d
3 changed files with 41 additions and 6 deletions

View File

@@ -0,0 +1,14 @@
+from wordfreq_builder.tokenizers import cld2_reddit_tokenizer, tokenize_by_language
+import argparse
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('filename', help='filename of input file containing one tweet per line')
+    parser.add_argument('outprefix', help='prefix of output filenames')
+    args = parser.parse_args()
+    tokenize_by_language(args.filename, args.outprefix, tokenizer=cld2_reddit_tokenizer)
+
+
+if __name__ == '__main__':
+    main()
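
For orientation (not part of the commit): the new script is a thin CLI wrapper around the tokenizer functions below. A sketch of how it would be used, where the script filename and input path are hypothetical since this view doesn't show them:

# Hypothetical invocation (script and file names are placeholders):
#     python tokenize_reddit.py reddit_comments.txt reddit_tokens
#
# which is roughly equivalent to the direct call:
from wordfreq_builder.tokenizers import cld2_reddit_tokenizer, tokenize_by_language

# Reads the input file line by line and writes one token file per
# detected language under the given output prefix.
tokenize_by_language('reddit_comments.txt', 'reddit_tokens',
                     tokenizer=cld2_reddit_tokenizer)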

View File

@@ -1,4 +1,4 @@
-from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_twitter
+from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_by_language
 import argparse
 
 
@@ -7,7 +7,7 @@ def main():
     parser.add_argument('filename', help='filename of input file containing one tweet per line')
     parser.add_argument('outprefix', help='prefix of output filenames')
     args = parser.parse_args()
-    tokenize_twitter(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
+    tokenize_by_language(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
 
 
 if __name__ == '__main__':

View File

@@ -22,6 +22,8 @@ CLD2_BAD_CHARS_RE = regex.compile(CLD2_BAD_CHAR_RANGE)
 TWITTER_HANDLE_RE = regex.compile(r'@[\S--\p{punct}]+')
 TCO_RE = regex.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
+URL_RE = regex.compile(r'http(?:s)?://[^) ]*')
+MARKDOWN_URL_RESIDUE_RE = regex.compile(r'\]\(\)')
 
 
 def cld2_surface_tokenizer(text):
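
As a quick illustration (not part of the commit), the two new patterns above strip a Markdown link down to its anchor text before language detection:

import regex

URL_RE = regex.compile(r'http(?:s)?://[^) ]*')
MARKDOWN_URL_RESIDUE_RE = regex.compile(r'\]\(\)')

text = 'see [this page](http://example.com/foo) for details'
text = URL_RE.sub('', text)                    # 'see [this page]() for details'
text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)  # 'see [this page] for details'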
@@ -31,6 +33,7 @@ def cld2_surface_tokenizer(text):
     text = unescape_html(text)
     text = TWITTER_HANDLE_RE.sub('', text)
     text = TCO_RE.sub('', text)
     lang = cld2_detect_language(text)
     # Don't allow tokenization in Chinese when language-detecting, because
@@ -42,6 +45,26 @@ def cld2_surface_tokenizer(text):
     return lang, tokens
 
 
+# Low-frequency languages tend to be detected incorrectly. Keep a limited
+# list of languages we're allowed to use here.
+KEEP_THESE_LANGUAGES = {
+    'ar', 'de', 'el', 'en', 'es', 'fr', 'hr', 'id', 'ja', 'ko', 'ms', 'nl',
+    'pl', 'pt', 'ro', 'ru', 'sv', 'th'
+}
+
+
+def cld2_reddit_tokenizer(text):
+    text = URL_RE.sub('', text)
+    text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
+    lang = cld2_detect_language(text)
+    if lang not in KEEP_THESE_LANGUAGES:
+        lang = 'en'
+    tokens = tokenize(text, lang, include_punctuation=True)
+    return lang, tokens
+
+
 def cld2_detect_language(text):
     """
     Uses CLD2 to detect the language.
@@ -59,11 +82,9 @@ def cld2_detect_language(text):
     return pycld2.detect(text)[2][0][1]
 
 
-def tokenize_twitter(in_filename, out_prefix, tokenizer):
+def tokenize_by_language(in_filename, out_prefix, tokenizer):
     """
-    Process a file by running it through the Twitter-specific tokenizer,
-    which uses cld2 for language detection, and removes Twitter handles
-    and t.co URLs.
+    Process a file by running it through a given tokenizer.
 
     Produces output files that are separated by language, with newlines
     between the tokens.
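
The revised docstring reflects that the function is no longer Twitter-specific: as the two CLI scripts above show, either source-specific tokenizer can be passed in. A sketch of the two call sites, with placeholder paths and prefixes that are not from the commit:

# Same entry point, different per-source tokenizer (paths are placeholders):
tokenize_by_language('tweets.txt', 'twitter_tokens',
                     tokenizer=cld2_surface_tokenizer)   # strips @handles and t.co links
tokenize_by_language('reddit_comments.txt', 'reddit_tokens',
                     tokenizer=cld2_reddit_tokenizer)    # strips URLs and ']()' residue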