mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00
actually use the results of language-detection on Reddit
This commit is contained in:
parent
164a5b1a05
commit
75a4a92110
@ -46,6 +46,9 @@ rule simplify_chinese
|
|||||||
rule tokenize_twitter
|
rule tokenize_twitter
|
||||||
command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.tokenize_twitter $in $prefix
|
command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.tokenize_twitter $in $prefix
|
||||||
|
|
||||||
|
rule tokenize_reddit
|
||||||
|
command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.tokenize_reddit $in $prefix
|
||||||
|
|
||||||
# To convert the Leeds corpus, look for space-separated lines that start with
|
# To convert the Leeds corpus, look for space-separated lines that start with
|
||||||
# an integer and a decimal. The integer is the rank, which we discard. The
|
# an integer and a decimal. The integer is the rank, which we discard. The
|
||||||
# decimal is the frequency, and the remaining text is the term. Use sed -n
|
# decimal is the frequency, and the remaining text is the term. Use sed -n
|
||||||
@ -101,4 +104,4 @@ rule cat
|
|||||||
command = cat $in > $out
|
command = cat $in > $out
|
||||||
|
|
||||||
rule extract_reddit
|
rule extract_reddit
|
||||||
command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' | gzip -c > $out
|
command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' > $out
|
||||||
|
@ -2,12 +2,12 @@ from setuptools import setup
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="wordfreq_builder",
|
name="wordfreq_builder",
|
||||||
version='0.1',
|
version='0.2',
|
||||||
maintainer='Luminoso Technologies, Inc.',
|
maintainer='Luminoso Technologies, Inc.',
|
||||||
maintainer_email='info@luminoso.com',
|
maintainer_email='info@luminoso.com',
|
||||||
url='http://github.com/LuminosoInsight/wordfreq_builder',
|
url='http://github.com/LuminosoInsight/wordfreq_builder',
|
||||||
platforms=["any"],
|
platforms=["any"],
|
||||||
description="Turns raw data into word frequency lists",
|
description="Turns raw data into word frequency lists",
|
||||||
packages=['wordfreq_builder'],
|
packages=['wordfreq_builder'],
|
||||||
install_requires=['msgpack-python', 'pycld2']
|
install_requires=['msgpack-python', 'pycld2', 'langcodes']
|
||||||
)
|
)
|
||||||
|
@ -1,13 +1,17 @@
|
|||||||
from wordfreq_builder.tokenizers import cld2_reddit_tokenizer, tokenize_by_language
|
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_by_language
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
|
def reddit_tokenizer(text):
|
||||||
|
return cld2_surface_tokenizer(text, mode='reddit')
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument('filename', help='filename of input file containing one comment per line')
|
parser.add_argument('filename', help='filename of input file containing one comment per line')
|
||||||
parser.add_argument('outprefix', help='prefix of output filenames')
|
parser.add_argument('outprefix', help='prefix of output filenames')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
tokenize_by_language(args.filename, args.outprefix, tokenizer=cld2_reddit_tokenizer)
|
tokenize_by_language(args.filename, args.outprefix, tokenizer=reddit_tokenizer)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
@ -41,7 +41,10 @@ CONFIG = {
|
|||||||
'subtlex-en': ['en'],
|
'subtlex-en': ['en'],
|
||||||
'subtlex-other': ['de', 'nl', 'zh'],
|
'subtlex-other': ['de', 'nl', 'zh'],
|
||||||
'jieba': ['zh'],
|
'jieba': ['zh'],
|
||||||
'reddit': ['en'],
|
'reddit': [
|
||||||
|
'ar', 'de', 'en', 'es', 'fr', 'it', 'ja', 'pl', 'pt', 'ro',
|
||||||
|
'ru', 'sv'
|
||||||
|
]
|
||||||
},
|
},
|
||||||
# Subtlex languages that need to be pre-processed
|
# Subtlex languages that need to be pre-processed
|
||||||
'wordlist_paths': {
|
'wordlist_paths': {
|
||||||
|
@ -4,6 +4,8 @@ from wordfreq_builder.config import (
|
|||||||
import sys
|
import sys
|
||||||
import pathlib
|
import pathlib
|
||||||
import itertools
|
import itertools
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
|
||||||
HEADER = """# This file is automatically generated. Do not edit it.
|
HEADER = """# This file is automatically generated. Do not edit it.
|
||||||
# You can change its behavior by editing wordfreq_builder/ninja.py,
|
# You can change its behavior by editing wordfreq_builder/ninja.py,
|
||||||
@ -155,14 +157,12 @@ def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
|
|||||||
|
|
||||||
for language in languages:
|
for language in languages:
|
||||||
combined_output = wordlist_filename('twitter', language, 'tokens.txt')
|
combined_output = wordlist_filename('twitter', language, 'tokens.txt')
|
||||||
|
|
||||||
language_inputs = [
|
language_inputs = [
|
||||||
'{prefix}.{lang}.txt'.format(
|
'{prefix}.{lang}.txt'.format(
|
||||||
prefix=slice_files[slicenum], lang=language
|
prefix=slice_files[slicenum], lang=language
|
||||||
)
|
)
|
||||||
for slicenum in range(slices)
|
for slicenum in range(slices)
|
||||||
]
|
]
|
||||||
|
|
||||||
add_dep(lines, 'cat', language_inputs, combined_output)
|
add_dep(lines, 'cat', language_inputs, combined_output)
|
||||||
|
|
||||||
count_file = wordlist_filename('twitter', language, 'counts.txt')
|
count_file = wordlist_filename('twitter', language, 'counts.txt')
|
||||||
@ -236,25 +236,49 @@ def jieba_deps(dirname_in, languages):
|
|||||||
return lines
|
return lines
|
||||||
|
|
||||||
|
|
||||||
|
def reddit_base_filename(path):
|
||||||
|
"""
|
||||||
|
Get the base name of a Reddit input file, without its path or extension.
|
||||||
|
"""
|
||||||
|
return path.name[:-4]
|
||||||
|
|
||||||
|
|
||||||
def reddit_deps(dirname_in, languages):
|
def reddit_deps(dirname_in, languages):
|
||||||
lines = []
|
lines = []
|
||||||
if not languages:
|
|
||||||
return lines
|
|
||||||
assert languages == ['en']
|
|
||||||
|
|
||||||
processed_files = []
|
|
||||||
path_in = pathlib.Path(dirname_in)
|
path_in = pathlib.Path(dirname_in)
|
||||||
for filepath in path_in.glob('*/*.bz2'):
|
slices = {}
|
||||||
base = filepath.name[:-4]
|
counts_by_language = defaultdict(list)
|
||||||
transformed_file = wordlist_filename('reddit', 'en', base + '.txt.gz')
|
|
||||||
add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
|
|
||||||
count_file = wordlist_filename('reddit', 'en', base + '.counts.txt')
|
|
||||||
add_dep(lines, 'count', transformed_file, count_file)
|
|
||||||
processed_files.append(count_file)
|
|
||||||
|
|
||||||
output_file = wordlist_filename('reddit', 'en', 'counts.txt')
|
# Extract text from the Reddit comment dumps, and write them to
|
||||||
|
# .txt.gz files
|
||||||
|
for filepath in path_in.glob('*/*.bz2'):
|
||||||
|
base = reddit_base_filename(filepath)
|
||||||
|
transformed_file = wordlist_filename('reddit', base + '.all', '.txt')
|
||||||
|
slices[base] = transformed_file
|
||||||
|
add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
|
||||||
|
|
||||||
|
for base in sorted(slices):
|
||||||
|
transformed_file = slices[base]
|
||||||
|
language_outputs = []
|
||||||
|
for language in languages:
|
||||||
|
filename = wordlist_filename('reddit', base + '.' + language, '.txt')
|
||||||
|
language_outputs.append(filename)
|
||||||
|
|
||||||
|
count_filename = wordlist_filename('reddit', base + '.' + language, 'counts.txt')
|
||||||
|
add_dep(lines, 'count', filename, count_filename)
|
||||||
|
counts_by_language[language].append(count_filename)
|
||||||
|
|
||||||
|
# find the prefix by constructing a filename, then stripping off
|
||||||
|
# '.xx.txt' from the end
|
||||||
|
prefix = wordlist_filename('reddit', base + '.xx', '.txt')[:-7]
|
||||||
|
add_dep(lines, 'tokenize_reddit', transformed_file, language_outputs,
|
||||||
|
params={'prefix': prefix},
|
||||||
|
extra='wordfreq_builder/tokenizers.py')
|
||||||
|
|
||||||
|
for language in languages:
|
||||||
|
output_file = wordlist_filename('reddit', language, 'counts.txt')
|
||||||
add_dep(
|
add_dep(
|
||||||
lines, 'merge_counts', processed_files, output_file,
|
lines, 'merge_counts', counts_by_language[language], output_file,
|
||||||
params={'cutoff': 3}
|
params={'cutoff': 3}
|
||||||
)
|
)
|
||||||
return lines
|
return lines
|
||||||
|
@ -2,6 +2,7 @@ from wordfreq import tokenize
|
|||||||
from ftfy.fixes import unescape_html
|
from ftfy.fixes import unescape_html
|
||||||
import regex
|
import regex
|
||||||
import pycld2
|
import pycld2
|
||||||
|
import langcodes
|
||||||
|
|
||||||
CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
|
CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
|
||||||
[
|
[
|
||||||
@ -26,48 +27,63 @@ URL_RE = regex.compile(r'http(?:s)?://[^) ]*')
|
|||||||
MARKDOWN_URL_RESIDUE_RE = regex.compile(r'\]\(\)')
|
MARKDOWN_URL_RESIDUE_RE = regex.compile(r'\]\(\)')
|
||||||
|
|
||||||
|
|
||||||
def cld2_surface_tokenizer(text):
|
# Low-frequency languages tend to be detected incorrectly by cld2. The
|
||||||
"""
|
# following list of languages are languages that appear in our data with any
|
||||||
Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
|
# reasonable frequency, and seem to usually be detected *correctly*. These are
|
||||||
"""
|
# the languages we'll keep in the Reddit and Twitter results.
|
||||||
text = unescape_html(text)
|
#
|
||||||
text = TWITTER_HANDLE_RE.sub('', text)
|
# This list is larger than the list that wordfreq ultimately generates, so we
|
||||||
text = TCO_RE.sub('', text)
|
# can look here as a source of future data.
|
||||||
|
|
||||||
lang = cld2_detect_language(text)
|
|
||||||
|
|
||||||
# Don't allow tokenization in Chinese when language-detecting, because
|
|
||||||
# the Chinese tokenizer may not be built yet
|
|
||||||
if lang == 'zh':
|
|
||||||
lang = 'en'
|
|
||||||
|
|
||||||
tokens = tokenize(text, lang)
|
|
||||||
return lang, tokens
|
|
||||||
|
|
||||||
|
|
||||||
# Low-frequency languages tend to be detected incorrectly. Keep a limited
|
|
||||||
# list of languages we're allowed to use here.
|
|
||||||
KEEP_THESE_LANGUAGES = {
|
KEEP_THESE_LANGUAGES = {
|
||||||
'ar', 'de', 'el', 'en', 'es', 'fr', 'hr', 'id', 'it', 'ja', 'ko', 'ms',
|
'af', 'ar', 'bs', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', 'fi',
|
||||||
'nl', 'pl', 'pt', 'ro', 'ru', 'sv'
|
'fr', 'gl', 'he', 'hi', 'hr', 'hu', 'id', 'is', 'it', 'ja', 'ko', 'lv',
|
||||||
|
'ms', 'nl', 'nn', 'no', 'pl', 'pt', 'ro', 'ru', 'sr', 'sv', 'sw', 'tl',
|
||||||
|
'tr', 'uk', 'vi'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Semi-frequent languages that are excluded by the above:
|
||||||
|
#
|
||||||
|
# - Chinese, not because it's detected incorrectly, but because we can't
|
||||||
|
# handle it until we already have word frequencies
|
||||||
|
# - Thai (seems to be detected whenever someone uses Thai characters in
|
||||||
|
# an emoticon)
|
||||||
|
# - Welsh (which is detected for "ohmygodohmygodohmygod")
|
||||||
|
# - Turkmen (detected for ASCII art)
|
||||||
|
# - Irish Gaelic (detected for Cthulhu-related text)
|
||||||
|
# - Kannada (looks of disapproval)
|
||||||
|
# - Lao, Tamil, Xhosa, Slovak (various emoticons and Internet memes)
|
||||||
|
# - Breton (the word "memes" itself)
|
||||||
|
|
||||||
def cld2_reddit_tokenizer(text):
|
|
||||||
|
def cld2_surface_tokenizer(text, mode='twitter'):
|
||||||
"""
|
"""
|
||||||
A language-detecting tokenizer with special cases for handling text from
|
Uses CLD2 to detect the language and wordfreq tokenizer to create tokens.
|
||||||
Reddit.
|
|
||||||
|
The `mode` can be 'twitter' or 'reddit', which slightly changes the
|
||||||
|
pre-processing of the text.
|
||||||
"""
|
"""
|
||||||
|
text = unescape_html(text)
|
||||||
|
if mode == 'twitter':
|
||||||
|
text = TWITTER_HANDLE_RE.sub('', text)
|
||||||
|
text = TCO_RE.sub('', text)
|
||||||
|
elif mode == 'reddit':
|
||||||
text = URL_RE.sub('', text)
|
text = URL_RE.sub('', text)
|
||||||
text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
|
text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
|
||||||
|
|
||||||
lang = cld2_detect_language(text)
|
lang = cld2_detect_language(text)
|
||||||
if lang not in KEEP_THESE_LANGUAGES:
|
|
||||||
# Reddit is 99.9% English, so if we detected a rare language, it's
|
|
||||||
# much more likely that it's actually English.
|
|
||||||
lang = 'en'
|
|
||||||
|
|
||||||
tokens = tokenize(text, lang, include_punctuation=True)
|
# If the detected language isn't in our pretty generous list of languages,
|
||||||
|
# return no tokens.
|
||||||
|
if lang not in KEEP_THESE_LANGUAGES:
|
||||||
|
return 'xx', []
|
||||||
|
|
||||||
|
# cld2's accuracy seems to improve dramatically with at least 50
|
||||||
|
# bytes of input, so throw away non-English below this length.
|
||||||
|
if len(text.encode('utf-8')) < 50 and lang != 'en':
|
||||||
|
return 'xx', []
|
||||||
|
|
||||||
|
tokens = tokenize(text, lang)
|
||||||
return lang, tokens
|
return lang, tokens
|
||||||
|
|
||||||
|
|
||||||
@ -85,7 +101,12 @@ def cld2_detect_language(text):
|
|||||||
# Confidence score: float))
|
# Confidence score: float))
|
||||||
|
|
||||||
text = CLD2_BAD_CHARS_RE.sub('', text)
|
text = CLD2_BAD_CHARS_RE.sub('', text)
|
||||||
return pycld2.detect(text)[2][0][1]
|
lang = pycld2.detect(text)[2][0][1]
|
||||||
|
|
||||||
|
# Normalize the language code: 'iw' becomes 'he', and 'zh-Hant'
|
||||||
|
# becomes 'zh'
|
||||||
|
code = langcodes.get(lang).language
|
||||||
|
return code
|
||||||
|
|
||||||
|
|
||||||
def tokenize_by_language(in_filename, out_prefix, tokenizer):
|
def tokenize_by_language(in_filename, out_prefix, tokenizer):
|
||||||
|
Loading…
Reference in New Issue
Block a user