mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-24 01:41:39 +00:00
actually use the results of language-detection on Reddit
Former-commit-id: 75a4a92110
This commit is contained in:
parent
a5fcfd100d
commit
c3364ef821
@ -46,6 +46,9 @@ rule simplify_chinese
|
||||
rule tokenize_twitter
|
||||
command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.tokenize_twitter $in $prefix
|
||||
|
||||
rule tokenize_reddit
|
||||
command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.tokenize_reddit $in $prefix
|
||||
|
||||
# To convert the Leeds corpus, look for space-separated lines that start with
|
||||
# an integer and a decimal. The integer is the rank, which we discard. The
|
||||
# decimal is the frequency, and the remaining text is the term. Use sed -n
|
||||
@ -101,4 +104,4 @@ rule cat
|
||||
command = cat $in > $out
|
||||
|
||||
rule extract_reddit
|
||||
command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' | gzip -c > $out
|
||||
command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' > $out
|
||||
|
@ -2,12 +2,12 @@ from setuptools import setup
|
||||
|
||||
setup(
|
||||
name="wordfreq_builder",
|
||||
version='0.1',
|
||||
version='0.2',
|
||||
maintainer='Luminoso Technologies, Inc.',
|
||||
maintainer_email='info@luminoso.com',
|
||||
url='http://github.com/LuminosoInsight/wordfreq_builder',
|
||||
platforms=["any"],
|
||||
description="Turns raw data into word frequency lists",
|
||||
packages=['wordfreq_builder'],
|
||||
install_requires=['msgpack-python', 'pycld2']
|
||||
install_requires=['msgpack-python', 'pycld2', 'langcodes']
|
||||
)
|
||||
|
@ -1,13 +1,17 @@
|
||||
from wordfreq_builder.tokenizers import cld2_reddit_tokenizer, tokenize_by_language
|
||||
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_by_language
|
||||
import argparse
|
||||
|
||||
|
||||
def reddit_tokenizer(text):
|
||||
return cld2_surface_tokenizer(text, mode='reddit')
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('filename', help='filename of input file containing one comment per line')
|
||||
parser.add_argument('outprefix', help='prefix of output filenames')
|
||||
args = parser.parse_args()
|
||||
tokenize_by_language(args.filename, args.outprefix, tokenizer=cld2_reddit_tokenizer)
|
||||
tokenize_by_language(args.filename, args.outprefix, tokenizer=reddit_tokenizer)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
@ -41,7 +41,10 @@ CONFIG = {
|
||||
'subtlex-en': ['en'],
|
||||
'subtlex-other': ['de', 'nl', 'zh'],
|
||||
'jieba': ['zh'],
|
||||
'reddit': ['en'],
|
||||
'reddit': [
|
||||
'ar', 'de', 'en', 'es', 'fr', 'it', 'ja', 'pl', 'pt', 'ro',
|
||||
'ru', 'sv'
|
||||
]
|
||||
},
|
||||
# Subtlex languages that need to be pre-processed
|
||||
'wordlist_paths': {
|
||||
|
@ -4,6 +4,8 @@ from wordfreq_builder.config import (
|
||||
import sys
|
||||
import pathlib
|
||||
import itertools
|
||||
from collections import defaultdict
|
||||
|
||||
|
||||
HEADER = """# This file is automatically generated. Do not edit it.
|
||||
# You can change its behavior by editing wordfreq_builder/ninja.py,
|
||||
@ -155,14 +157,12 @@ def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
|
||||
|
||||
for language in languages:
|
||||
combined_output = wordlist_filename('twitter', language, 'tokens.txt')
|
||||
|
||||
language_inputs = [
|
||||
'{prefix}.{lang}.txt'.format(
|
||||
prefix=slice_files[slicenum], lang=language
|
||||
)
|
||||
for slicenum in range(slices)
|
||||
]
|
||||
|
||||
add_dep(lines, 'cat', language_inputs, combined_output)
|
||||
|
||||
count_file = wordlist_filename('twitter', language, 'counts.txt')
|
||||
@ -236,25 +236,49 @@ def jieba_deps(dirname_in, languages):
|
||||
return lines
|
||||
|
||||
|
||||
def reddit_base_filename(path):
|
||||
"""
|
||||
Get the base name of a Reddit input file, without its path or extension.
|
||||
"""
|
||||
return path.name[:-4]
|
||||
|
||||
|
||||
def reddit_deps(dirname_in, languages):
|
||||
lines = []
|
||||
if not languages:
|
||||
return lines
|
||||
assert languages == ['en']
|
||||
|
||||
processed_files = []
|
||||
path_in = pathlib.Path(dirname_in)
|
||||
for filepath in path_in.glob('*/*.bz2'):
|
||||
base = filepath.name[:-4]
|
||||
transformed_file = wordlist_filename('reddit', 'en', base + '.txt.gz')
|
||||
add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
|
||||
count_file = wordlist_filename('reddit', 'en', base + '.counts.txt')
|
||||
add_dep(lines, 'count', transformed_file, count_file)
|
||||
processed_files.append(count_file)
|
||||
slices = {}
|
||||
counts_by_language = defaultdict(list)
|
||||
|
||||
output_file = wordlist_filename('reddit', 'en', 'counts.txt')
|
||||
# Extract text from the Reddit comment dumps, and write them to
|
||||
# .txt.gz files
|
||||
for filepath in path_in.glob('*/*.bz2'):
|
||||
base = reddit_base_filename(filepath)
|
||||
transformed_file = wordlist_filename('reddit', base + '.all', '.txt')
|
||||
slices[base] = transformed_file
|
||||
add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
|
||||
|
||||
for base in sorted(slices):
|
||||
transformed_file = slices[base]
|
||||
language_outputs = []
|
||||
for language in languages:
|
||||
filename = wordlist_filename('reddit', base + '.' + language, '.txt')
|
||||
language_outputs.append(filename)
|
||||
|
||||
count_filename = wordlist_filename('reddit', base + '.' + language, 'counts.txt')
|
||||
add_dep(lines, 'count', filename, count_filename)
|
||||
counts_by_language[language].append(count_filename)
|
||||
|
||||
# find the prefix by constructing a filename, then stripping off
|
||||
# '.xx.txt' from the end
|
||||
prefix = wordlist_filename('reddit', base + '.xx', '.txt')[:-7]
|
||||
add_dep(lines, 'tokenize_reddit', transformed_file, language_outputs,
|
||||
params={'prefix': prefix},
|
||||
extra='wordfreq_builder/tokenizers.py')
|
||||
|
||||
for language in languages:
|
||||
output_file = wordlist_filename('reddit', language, 'counts.txt')
|
||||
add_dep(
|
||||
lines, 'merge_counts', processed_files, output_file,
|
||||
lines, 'merge_counts', counts_by_language[language], output_file,
|
||||
params={'cutoff': 3}
|
||||
)
|
||||
return lines
|
||||
|
@ -2,6 +2,7 @@ from wordfreq import tokenize
|
||||
from ftfy.fixes import unescape_html
|
||||
import regex
|
||||
import pycld2
|
||||
import langcodes
|
||||
|
||||
CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
|
||||
[
|
||||
@ -26,48 +27,63 @@ URL_RE = regex.compile(r'http(?:s)?://[^) ]*')
|
||||
MARKDOWN_URL_RESIDUE_RE = regex.compile(r'\]\(\)')
|
||||
|
||||
|
||||
def cld2_surface_tokenizer(text):
|
||||
"""
|
||||
Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
|
||||
"""
|
||||
text = unescape_html(text)
|
||||
text = TWITTER_HANDLE_RE.sub('', text)
|
||||
text = TCO_RE.sub('', text)
|
||||
# Low-frequency languages tend to be detected incorrectly by cld2. The
|
||||
# following languages are the ones that appear in our data with any
|
||||
# reasonable frequency, and seem to usually be detected *correctly*. These are
|
||||
# the languages we'll keep in the Reddit and Twitter results.
|
||||
#
|
||||
# This list is larger than the list that wordfreq ultimately generates, so we
|
||||
# can look here as a source of future data.
|
||||
|
||||
lang = cld2_detect_language(text)
|
||||
|
||||
# Don't allow tokenization in Chinese when language-detecting, because
|
||||
# the Chinese tokenizer may not be built yet
|
||||
if lang == 'zh':
|
||||
lang = 'en'
|
||||
|
||||
tokens = tokenize(text, lang)
|
||||
return lang, tokens
|
||||
|
||||
|
||||
# Low-frequency languages tend to be detected incorrectly. Keep a limited
|
||||
# list of languages we're allowed to use here.
|
||||
KEEP_THESE_LANGUAGES = {
|
||||
'ar', 'de', 'el', 'en', 'es', 'fr', 'hr', 'id', 'it', 'ja', 'ko', 'ms',
|
||||
'nl', 'pl', 'pt', 'ro', 'ru', 'sv'
|
||||
'af', 'ar', 'bs', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', 'fi',
|
||||
'fr', 'gl', 'he', 'hi', 'hr', 'hu', 'id', 'is', 'it', 'ja', 'ko', 'lv',
|
||||
'ms', 'nl', 'nn', 'no', 'pl', 'pt', 'ro', 'ru', 'sr', 'sv', 'sw', 'tl',
|
||||
'tr', 'uk', 'vi'
|
||||
}
|
||||
|
||||
# Semi-frequent languages that are excluded by the above:
|
||||
#
|
||||
# - Chinese, not because it's detected incorrectly, but because we can't
|
||||
# handle it until we already have word frequencies
|
||||
# - Thai (seems to be detected whenever someone uses Thai characters in
|
||||
# an emoticon)
|
||||
# - Welsh (which is detected for "ohmygodohmygodohmygod")
|
||||
# - Turkmen (detected for ASCII art)
|
||||
# - Irish Gaelic (detected for Cthulhu-related text)
|
||||
# - Kannada (looks of disapproval)
|
||||
# - Lao, Tamil, Xhosa, Slovak (various emoticons and Internet memes)
|
||||
# - Breton (the word "memes" itself)
|
||||
|
||||
def cld2_reddit_tokenizer(text):
|
||||
|
||||
def cld2_surface_tokenizer(text, mode='twitter'):
|
||||
"""
|
||||
A language-detecting tokenizer with special cases for handling text from
|
||||
Reddit.
|
||||
Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
|
||||
|
||||
The `mode` can be 'twitter' or 'reddit', which slightly changes the
|
||||
pre-processing of the text.
|
||||
"""
|
||||
text = unescape_html(text)
|
||||
if mode == 'twitter':
|
||||
text = TWITTER_HANDLE_RE.sub('', text)
|
||||
text = TCO_RE.sub('', text)
|
||||
elif mode == 'reddit':
|
||||
text = URL_RE.sub('', text)
|
||||
text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
|
||||
|
||||
lang = cld2_detect_language(text)
|
||||
if lang not in KEEP_THESE_LANGUAGES:
|
||||
# Reddit is 99.9% English, so if we detected a rare language, it's
|
||||
# much more likely that it's actually English.
|
||||
lang = 'en'
|
||||
|
||||
tokens = tokenize(text, lang, include_punctuation=True)
|
||||
# If the detected language isn't in our pretty generous list of languages,
|
||||
# return no tokens.
|
||||
if lang not in KEEP_THESE_LANGUAGES:
|
||||
return 'xx', []
|
||||
|
||||
# cld2's accuracy seems to improve dramatically with at least 50
|
||||
# bytes of input, so throw away non-English below this length.
|
||||
if len(text.encode('utf-8')) < 50 and lang != 'en':
|
||||
return 'xx', []
|
||||
|
||||
tokens = tokenize(text, lang)
|
||||
return lang, tokens
|
||||
|
||||
|
||||
@ -85,7 +101,12 @@ def cld2_detect_language(text):
|
||||
# Confidence score: float))
|
||||
|
||||
text = CLD2_BAD_CHARS_RE.sub('', text)
|
||||
return pycld2.detect(text)[2][0][1]
|
||||
lang = pycld2.detect(text)[2][0][1]
|
||||
|
||||
# Normalize the language code: 'iw' becomes 'he', and 'zh-Hant'
|
||||
# becomes 'zh'
|
||||
code = langcodes.get(lang).language
|
||||
return code
|
||||
|
||||
|
||||
def tokenize_by_language(in_filename, out_prefix, tokenizer):
|
||||
|
Loading…
Reference in New Issue
Block a user