Mirror of https://github.com/rspeer/wordfreq.git
add tokenizer for Reddit
parent 2dcf368481
commit 5ef807117d
wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py (new file, 14 lines)
@@ -0,0 +1,14 @@
+from wordfreq_builder.tokenizers import cld2_reddit_tokenizer, tokenize_by_language
+import argparse
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('filename', help='filename of input file containing one tweet per line')
+    parser.add_argument('outprefix', help='prefix of output filenames')
+    args = parser.parse_args()
+    tokenize_by_language(args.filename, args.outprefix, tokenizer=cld2_reddit_tokenizer)
+
+
+if __name__ == '__main__':
+    main()
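A note on usage (not part of the diff): assuming the wordfreq_builder package is importable, the new entry point could be run roughly as follows; the input path and output prefix are placeholder names.

    python -m wordfreq_builder.cli.tokenize_reddit reddit_comments.txt reddit/tokenized

This is equivalent to calling tokenize_by_language('reddit_comments.txt', 'reddit/tokenized', tokenizer=cld2_reddit_tokenizer), which, per its docstring further down, writes one output file per detected language under the given prefix.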
wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py
@@ -1,4 +1,4 @@
-from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_twitter
+from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_by_language
 import argparse
 
 
@@ -7,7 +7,7 @@ def main():
     parser.add_argument('filename', help='filename of input file containing one tweet per line')
     parser.add_argument('outprefix', help='prefix of output filenames')
     args = parser.parse_args()
-    tokenize_twitter(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
+    tokenize_by_language(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
 
 
 if __name__ == '__main__':
wordfreq_builder/wordfreq_builder/tokenizers.py
@@ -22,6 +22,8 @@ CLD2_BAD_CHARS_RE = regex.compile(CLD2_BAD_CHAR_RANGE)
 
 TWITTER_HANDLE_RE = regex.compile(r'@[\S--\p{punct}]+')
 TCO_RE = regex.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
+URL_RE = regex.compile(r'http(?:s)?://[^) ]*')
+MARKDOWN_URL_RESIDUE_RE = regex.compile(r'\]\(\)')
 
 
 def cld2_surface_tokenizer(text):
@@ -31,6 +33,7 @@ def cld2_surface_tokenizer(text):
     text = unescape_html(text)
     text = TWITTER_HANDLE_RE.sub('', text)
     text = TCO_RE.sub('', text)
+
     lang = cld2_detect_language(text)
 
     # Don't allow tokenization in Chinese when language-detecting, because
@@ -42,6 +45,26 @@ def cld2_surface_tokenizer(text):
     return lang, tokens
 
 
+# Low-frequency languages tend to be detected incorrectly. Keep a limited
+# list of languages we're allowed to use here.
+KEEP_THESE_LANGUAGES = {
+    'ar', 'de', 'el', 'en', 'es', 'fr', 'hr', 'id', 'ja', 'ko', 'ms', 'nl',
+    'pl', 'pt', 'ro', 'ru', 'sv', 'th'
+}
+
+
+def cld2_reddit_tokenizer(text):
+    text = URL_RE.sub('', text)
+    text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
+
+    lang = cld2_detect_language(text)
+    if lang not in KEEP_THESE_LANGUAGES:
+        lang = 'en'
+
+    tokens = tokenize(text, lang, include_punctuation=True)
+    return lang, tokens
+
+
 def cld2_detect_language(text):
     """
     Uses CLD2 to detect the language.
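To illustrate what the two new regular expressions and the language whitelist do, here is a small sketch that is not part of the commit; the sample comment is invented, the detected language is only the likely result, and the exact token list depends on wordfreq's tokenize().

    from wordfreq_builder.tokenizers import cld2_reddit_tokenizer

    comment = "Check [this post](http://example.com/a) before downvoting"
    # URL_RE strips the bare URL, leaving the Markdown residue "[this post]()",
    # which MARKDOWN_URL_RESIDUE_RE then collapses to "[this post]".
    # CLD2 would most likely detect English here; any detected language outside
    # KEEP_THESE_LANGUAGES is coerced to 'en' instead.
    lang, tokens = cld2_reddit_tokenizer(comment)
    print(lang)    # likely 'en'
    print(tokens)  # tokens of the cleaned text, punctuation included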
@@ -59,11 +82,9 @@ def cld2_detect_language(text):
     return pycld2.detect(text)[2][0][1]
 
 
-def tokenize_twitter(in_filename, out_prefix, tokenizer):
+def tokenize_by_language(in_filename, out_prefix, tokenizer):
     """
-    Process a file by running it through the Twitter-specific tokenizer,
-    which uses cld2 for language detection, and removes Twitter handles
-    and t.co URLs.
+    Process a file by running it through a given tokenizer.
 
     Produces output files that are separated by language, with newlines
     between the tokens.
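The body of tokenize_by_language is unchanged by this commit and is not shown in the diff. Based only on its docstring and its call sites above, a hypothetical sketch of the behavior it describes might look like the following; the output-file naming scheme and all other details are assumptions, not the actual implementation.

    def tokenize_by_language_sketch(in_filename, out_prefix, tokenizer):
        # Hypothetical illustration only -- not the code in wordfreq_builder.
        out_files = {}
        with open(in_filename, encoding='utf-8') as in_file:
            for line in in_file:
                lang, tokens = tokenizer(line.strip())
                if lang not in out_files:
                    # e.g. 'reddit/tokenized.en.txt'; the naming is an assumption
                    out_files[lang] = open('%s.%s.txt' % (out_prefix, lang),
                                           'w', encoding='utf-8')
                # "newlines between the tokens", per the docstring
                print('\n'.join(tokens), file=out_files[lang])
        for out_file in out_files.values():
            out_file.close()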