mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-24 01:41:39 +00:00
Merge pull request #30 from LuminosoInsight/add-reddit
Add English data from Reddit corpus
Former-commit-id: d18fee3d78
This commit is contained in:
commit
927d4f45a4
2
setup.py
2
setup.py
@ -34,7 +34,7 @@ if sys.version_info < (3, 4):
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="wordfreq",
|
name="wordfreq",
|
||||||
version='1.2',
|
version='1.3',
|
||||||
maintainer='Luminoso Technologies, Inc.',
|
maintainer='Luminoso Technologies, Inc.',
|
||||||
maintainer_email='info@luminoso.com',
|
maintainer_email='info@luminoso.com',
|
||||||
url='http://github.com/LuminosoInsight/wordfreq/',
|
url='http://github.com/LuminosoInsight/wordfreq/',
|
||||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq_builder/lib/jq-linux64
Executable file
BIN
wordfreq_builder/lib/jq-linux64
Executable file
Binary file not shown.
@ -13,7 +13,7 @@
|
|||||||
# contains the programmatically-defined dependency graph.
|
# contains the programmatically-defined dependency graph.
|
||||||
|
|
||||||
# Variables
|
# Variables
|
||||||
DATA = ./data
|
JQ = lib/jq-linux64
|
||||||
|
|
||||||
# How to build the build.ninja file itself. (Use the Makefile to get it the
|
# How to build the build.ninja file itself. (Use the Makefile to get it the
|
||||||
# first time.)
|
# first time.)
|
||||||
@ -92,10 +92,13 @@ rule merge
|
|||||||
command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in
|
command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in
|
||||||
|
|
||||||
rule merge_counts
|
rule merge_counts
|
||||||
command = python -m wordfreq_builder.cli.merge_counts -o $out $in
|
command = python -m wordfreq_builder.cli.merge_counts -o $out -c $cutoff $in
|
||||||
|
|
||||||
rule freqs2cB
|
rule freqs2cB
|
||||||
command = python -m wordfreq_builder.cli.freqs_to_cB $in $out
|
command = python -m wordfreq_builder.cli.freqs_to_cB $in $out
|
||||||
|
|
||||||
rule cat
|
rule cat
|
||||||
command = cat $in > $out
|
command = cat $in > $out
|
||||||
|
|
||||||
|
rule extract_reddit
|
||||||
|
command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' | gzip -c > $out
|
||||||
|
@ -2,10 +2,10 @@ from wordfreq_builder.word_counts import read_values, merge_counts, write_wordli
|
|||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
def merge_lists(input_names, output_name):
|
def merge_lists(input_names, output_name, cutoff=0):
|
||||||
count_dicts = []
|
count_dicts = []
|
||||||
for input_name in input_names:
|
for input_name in input_names:
|
||||||
values, total = read_values(input_name, cutoff=0)
|
values, total = read_values(input_name, cutoff=cutoff, max_size=1000000)
|
||||||
count_dicts.append(values)
|
count_dicts.append(values)
|
||||||
merged = merge_counts(count_dicts)
|
merged = merge_counts(count_dicts)
|
||||||
write_wordlist(merged, output_name)
|
write_wordlist(merged, output_name)
|
||||||
@ -13,8 +13,12 @@ def merge_lists(input_names, output_name):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
|
parser.add_argument('-o', '--output', default='combined-counts.csv',
|
||||||
parser.add_argument('inputs', help='names of input files to merge', nargs='+')
|
help='filename to write the output to')
|
||||||
|
parser.add_argument('-c', '--cutoff', type=int, default=0,
|
||||||
|
help='minimum count to read from an input file')
|
||||||
|
parser.add_argument('inputs', nargs='+',
|
||||||
|
help='names of input files to merge')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
merge_lists(args.inputs, args.output)
|
merge_lists(args.inputs, args.output, cutoff=args.cutoff)
|
||||||
|
|
||||||
|
@ -18,10 +18,14 @@ def merge_lists(input_names, output_name, cutoff, lang):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
|
parser.add_argument('-o', '--output', default='combined-freqs.csv',
|
||||||
parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
|
help='filename to write the output to')
|
||||||
parser.add_argument('-l', '--language', help='language code for which language the words are in', default=None)
|
parser.add_argument('-c', '--cutoff', type=int, default=2,
|
||||||
parser.add_argument('inputs', help='names of input files to merge', nargs='+')
|
help='stop after seeing a count below this')
|
||||||
|
parser.add_argument('-l', '--language', default=None,
|
||||||
|
help='language code for which language the words are in')
|
||||||
|
parser.add_argument('inputs', nargs='+',
|
||||||
|
help='names of input files to merge')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
merge_lists(args.inputs, args.output, args.cutoff, args.language)
|
merge_lists(args.inputs, args.output, args.cutoff, args.language)
|
||||||
|
|
||||||
|
14
wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py
Normal file
14
wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
from wordfreq_builder.tokenizers import cld2_reddit_tokenizer, tokenize_by_language
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """
    Command-line entry point for tokenizing Reddit comments.

    Takes two positional arguments: the input filename (one comment per
    line) and the prefix for the output filenames. Both are handed to
    `tokenize_by_language`, using `cld2_reddit_tokenizer` as the tokenizer.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        'filename',
        help='filename of input file containing one comment per line'
    )
    arg_parser.add_argument(
        'outprefix',
        help='prefix of output filenames'
    )
    options = arg_parser.parse_args()
    tokenize_by_language(
        options.filename,
        options.outprefix,
        tokenizer=cld2_reddit_tokenizer
    )


if __name__ == '__main__':
    main()
|
@ -1,4 +1,4 @@
|
|||||||
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_twitter
|
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_by_language
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
@ -7,7 +7,7 @@ def main():
|
|||||||
parser.add_argument('filename', help='filename of input file containing one tweet per line')
|
parser.add_argument('filename', help='filename of input file containing one tweet per line')
|
||||||
parser.add_argument('outprefix', help='prefix of output filenames')
|
parser.add_argument('outprefix', help='prefix of output filenames')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
tokenize_twitter(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
|
tokenize_by_language(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
@ -40,7 +40,8 @@ CONFIG = {
|
|||||||
],
|
],
|
||||||
'subtlex-en': ['en'],
|
'subtlex-en': ['en'],
|
||||||
'subtlex-other': ['de', 'nl', 'zh'],
|
'subtlex-other': ['de', 'nl', 'zh'],
|
||||||
'jieba': ['zh']
|
'jieba': ['zh'],
|
||||||
|
'reddit': ['en'],
|
||||||
},
|
},
|
||||||
# Subtlex languages that need to be pre-processed
|
# Subtlex languages that need to be pre-processed
|
||||||
'wordlist_paths': {
|
'wordlist_paths': {
|
||||||
@ -52,6 +53,7 @@ CONFIG = {
|
|||||||
'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
|
'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
|
||||||
'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}',
|
'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}',
|
||||||
'jieba': 'generated/jieba/jieba_{lang}.{ext}',
|
'jieba': 'generated/jieba/jieba_{lang}.{ext}',
|
||||||
|
'reddit': 'generated/reddit/reddit_{lang}.{ext}',
|
||||||
'combined': 'generated/combined/combined_{lang}.{ext}',
|
'combined': 'generated/combined/combined_{lang}.{ext}',
|
||||||
'combined-dist': 'dist/combined_{lang}.{ext}',
|
'combined-dist': 'dist/combined_{lang}.{ext}',
|
||||||
'twitter-dist': 'dist/twitter_{lang}.{ext}',
|
'twitter-dist': 'dist/twitter_{lang}.{ext}',
|
||||||
|
@ -77,6 +77,10 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
|
|||||||
data_filename('source-lists/subtlex'),
|
data_filename('source-lists/subtlex'),
|
||||||
CONFIG['sources']['subtlex-other']
|
CONFIG['sources']['subtlex-other']
|
||||||
),
|
),
|
||||||
|
reddit_deps(
|
||||||
|
data_filename('raw-input/reddit'),
|
||||||
|
CONFIG['sources']['reddit']
|
||||||
|
),
|
||||||
jieba_deps(
|
jieba_deps(
|
||||||
data_filename('source-lists/jieba'),
|
data_filename('source-lists/jieba'),
|
||||||
CONFIG['sources']['jieba']
|
CONFIG['sources']['jieba']
|
||||||
@ -232,6 +236,30 @@ def jieba_deps(dirname_in, languages):
|
|||||||
return lines
|
return lines
|
||||||
|
|
||||||
|
|
||||||
|
def reddit_deps(dirname_in, languages):
    """
    Build the dependency lines for the Reddit comment corpus.

    For every file matching `*/*.bz2` under `dirname_in`, two steps are
    registered with `add_dep`: an 'extract_reddit' step that turns the .bz2
    file into a '.txt.gz' file, and a 'count' step that turns that into a
    '.counts.txt' file. Finally, one 'merge_counts' step combines all of the
    per-file counts into 'counts.txt', with a cutoff of 3.

    `languages` is the list of languages configured for this source. If it
    is empty, nothing is generated and an empty list is returned; otherwise
    it must be exactly ['en'].

    Returns the list of lines that `add_dep` has accumulated.
    """
    lines = []
    if not languages:
        return lines
    assert languages == ['en'], "Reddit data is only set up for English"

    processed_files = []
    path_in = pathlib.Path(dirname_in)

    # glob() yields paths in arbitrary, filesystem-dependent order. Sort them
    # so that the generated build statements (and the order of inputs to
    # merge_counts) are the same from one run to the next.
    for filepath in sorted(path_in.glob('*/*.bz2')):
        # Strip the '.bz2' extension to get the base name of this input
        base = filepath.name[:-4]
        transformed_file = wordlist_filename('reddit', 'en', base + '.txt.gz')
        add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
        count_file = wordlist_filename('reddit', 'en', base + '.counts.txt')
        add_dep(lines, 'count', transformed_file, count_file)
        processed_files.append(count_file)

    output_file = wordlist_filename('reddit', 'en', 'counts.txt')
    add_dep(
        lines, 'merge_counts', processed_files, output_file,
        params={'cutoff': 3}
    )
    return lines
|
||||||
|
|
||||||
|
|
||||||
# Which columns of the SUBTLEX data files do the word and its frequency appear
|
# Which columns of the SUBTLEX data files do the word and its frequency appear
|
||||||
# in?
|
# in?
|
||||||
SUBTLEX_COLUMN_MAP = {
|
SUBTLEX_COLUMN_MAP = {
|
||||||
@ -264,7 +292,10 @@ def subtlex_en_deps(dirname_in, languages):
|
|||||||
)
|
)
|
||||||
|
|
||||||
output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt')
|
output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt')
|
||||||
add_dep(lines, 'merge_counts', processed_files, output_file)
|
add_dep(
|
||||||
|
lines, 'merge_counts', processed_files, output_file,
|
||||||
|
params={'cutoff': 0}
|
||||||
|
)
|
||||||
|
|
||||||
return lines
|
return lines
|
||||||
|
|
||||||
@ -292,7 +323,8 @@ def subtlex_other_deps(dirname_in, languages):
|
|||||||
params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2}
|
params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2}
|
||||||
)
|
)
|
||||||
add_dep(
|
add_dep(
|
||||||
lines, 'merge_counts', processed_file, output_file
|
lines, 'merge_counts', processed_file, output_file,
|
||||||
|
params={'cutoff': 0}
|
||||||
)
|
)
|
||||||
return lines
|
return lines
|
||||||
|
|
||||||
|
@ -22,6 +22,8 @@ CLD2_BAD_CHARS_RE = regex.compile(CLD2_BAD_CHAR_RANGE)
|
|||||||
|
|
||||||
TWITTER_HANDLE_RE = regex.compile(r'@[\S--\p{punct}]+')
|
TWITTER_HANDLE_RE = regex.compile(r'@[\S--\p{punct}]+')
|
||||||
TCO_RE = regex.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
|
TCO_RE = regex.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
|
||||||
|
URL_RE = regex.compile(r'http(?:s)?://[^) ]*')
|
||||||
|
MARKDOWN_URL_RESIDUE_RE = regex.compile(r'\]\(\)')
|
||||||
|
|
||||||
|
|
||||||
def cld2_surface_tokenizer(text):
|
def cld2_surface_tokenizer(text):
|
||||||
@ -31,6 +33,7 @@ def cld2_surface_tokenizer(text):
|
|||||||
text = unescape_html(text)
|
text = unescape_html(text)
|
||||||
text = TWITTER_HANDLE_RE.sub('', text)
|
text = TWITTER_HANDLE_RE.sub('', text)
|
||||||
text = TCO_RE.sub('', text)
|
text = TCO_RE.sub('', text)
|
||||||
|
|
||||||
lang = cld2_detect_language(text)
|
lang = cld2_detect_language(text)
|
||||||
|
|
||||||
# Don't allow tokenization in Chinese when language-detecting, because
|
# Don't allow tokenization in Chinese when language-detecting, because
|
||||||
@ -42,6 +45,32 @@ def cld2_surface_tokenizer(text):
|
|||||||
return lang, tokens
|
return lang, tokens
|
||||||
|
|
||||||
|
|
||||||
|
# Language detection is unreliable for low-frequency languages, so only the
# languages listed here are accepted as detection results.
KEEP_THESE_LANGUAGES = {
    'ar', 'de', 'el', 'en', 'es', 'fr', 'hr', 'id', 'it', 'ja', 'ko', 'ms',
    'nl', 'pl', 'pt', 'ro', 'ru', 'sv'
}


def cld2_reddit_tokenizer(text):
    """
    Detect the language of a Reddit comment and tokenize it.

    URLs are removed first, and the `]()` residue that a removed URL leaves
    behind in a Markdown link is reduced to `]`. The language is then
    detected with CLD2; any result outside KEEP_THESE_LANGUAGES is treated
    as English.

    Returns a (language code, tokens) pair. Tokenization keeps punctuation.
    """
    without_urls = URL_RE.sub('', text)
    cleaned = MARKDOWN_URL_RESIDUE_RE.sub(']', without_urls)

    detected = cld2_detect_language(cleaned)
    # Reddit is 99.9% English, so a rarely-seen detected language is much
    # more likely to be misdetected English than the real thing.
    lang = detected if detected in KEEP_THESE_LANGUAGES else 'en'

    return lang, tokenize(cleaned, lang, include_punctuation=True)
|
||||||
|
|
||||||
|
|
||||||
def cld2_detect_language(text):
|
def cld2_detect_language(text):
|
||||||
"""
|
"""
|
||||||
Uses CLD2 to detect the language.
|
Uses CLD2 to detect the language.
|
||||||
@ -59,13 +88,11 @@ def cld2_detect_language(text):
|
|||||||
return pycld2.detect(text)[2][0][1]
|
return pycld2.detect(text)[2][0][1]
|
||||||
|
|
||||||
|
|
||||||
def tokenize_twitter(in_filename, out_prefix, tokenizer):
|
def tokenize_by_language(in_filename, out_prefix, tokenizer):
|
||||||
"""
|
"""
|
||||||
Process a file by running it through the Twitter-specific tokenizer,
|
Process a file by running it through a given tokenizer.
|
||||||
which uses cld2 for language detection, and removes Twitter handles
|
|
||||||
and t.co URLs.
|
|
||||||
|
|
||||||
Produces output files that are separated by language, with newlines
|
Produces output files that are separated by language, with spaces
|
||||||
between the tokens.
|
between the tokens.
|
||||||
"""
|
"""
|
||||||
out_files = {}
|
out_files = {}
|
||||||
@ -74,7 +101,7 @@ def tokenize_twitter(in_filename, out_prefix, tokenizer):
|
|||||||
text = line.split('\t')[-1].strip()
|
text = line.split('\t')[-1].strip()
|
||||||
language, tokens = tokenizer(text)
|
language, tokens = tokenizer(text)
|
||||||
if language != 'un':
|
if language != 'un':
|
||||||
tokenized = '\n'.join(tokens)
|
tokenized = ' '.join(tokens)
|
||||||
out_filename = '%s.%s.txt' % (out_prefix, language)
|
out_filename = '%s.%s.txt' % (out_prefix, language)
|
||||||
if out_filename in out_files:
|
if out_filename in out_files:
|
||||||
out_file = out_files[out_filename]
|
out_file = out_files[out_filename]
|
||||||
|
@ -24,16 +24,19 @@ def count_tokens(filename):
|
|||||||
containing '<EFBFBD>'.
|
containing '<EFBFBD>'.
|
||||||
"""
|
"""
|
||||||
counts = defaultdict(int)
|
counts = defaultdict(int)
|
||||||
with open(filename, encoding='utf-8', errors='replace') as infile:
|
if filename.endswith('gz'):
|
||||||
for line in infile:
|
infile = gzip.open(filename, 'rt', encoding='utf-8', errors='replace')
|
||||||
line = URL_RE.sub('', line.strip())
|
else:
|
||||||
for token in simple_tokenize(line):
|
infile = open(filename, encoding='utf-8', errors='replace')
|
||||||
counts[token] += 1
|
for line in infile:
|
||||||
|
line = URL_RE.sub('', line.strip())
|
||||||
|
for token in simple_tokenize(line):
|
||||||
|
counts[token] += 1
|
||||||
|
infile.close()
|
||||||
return counts
|
return counts
|
||||||
|
|
||||||
|
|
||||||
def read_values(filename, cutoff=0, lang=None):
|
def read_values(filename, cutoff=0, max_size=1e8, lang=None):
|
||||||
"""
|
"""
|
||||||
Read words and their frequency or count values from a CSV file. Returns
|
Read words and their frequency or count values from a CSV file. Returns
|
||||||
a dictionary of values and the total of all values.
|
a dictionary of values and the total of all values.
|
||||||
@ -52,7 +55,7 @@ def read_values(filename, cutoff=0, lang=None):
|
|||||||
for key, strval in csv.reader(infile):
|
for key, strval in csv.reader(infile):
|
||||||
val = float(strval)
|
val = float(strval)
|
||||||
key = fix_text(key)
|
key = fix_text(key)
|
||||||
if val < cutoff:
|
if val < cutoff or len(values) >= max_size:
|
||||||
break
|
break
|
||||||
tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
|
tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
|
||||||
for token in tokens:
|
for token in tokens:
|
||||||
@ -76,7 +79,7 @@ def read_freqs(filename, cutoff=0, lang=None):
|
|||||||
If lang is given, read_freqs will apply language specific preprocessing
|
If lang is given, read_freqs will apply language specific preprocessing
|
||||||
operations.
|
operations.
|
||||||
"""
|
"""
|
||||||
values, total = read_values(filename, cutoff, lang)
|
values, total = read_values(filename, cutoff, lang=lang)
|
||||||
for word in values:
|
for word in values:
|
||||||
values[word] /= total
|
values[word] /= total
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user