mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
Merge pull request #30 from LuminosoInsight/add-reddit
Add English data from Reddit corpus
Former-commit-id: d18fee3d78
This commit is contained in:
commit
927d4f45a4
2
setup.py
2
setup.py
@ -34,7 +34,7 @@ if sys.version_info < (3, 4):
|
||||
|
||||
setup(
|
||||
name="wordfreq",
|
||||
version='1.2',
|
||||
version='1.3',
|
||||
maintainer='Luminoso Technologies, Inc.',
|
||||
maintainer_email='info@luminoso.com',
|
||||
url='http://github.com/LuminosoInsight/wordfreq/',
|
||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wordfreq_builder/lib/jq-linux64
Executable file
BIN
wordfreq_builder/lib/jq-linux64
Executable file
Binary file not shown.
@ -13,7 +13,7 @@
|
||||
# contains the programmatically-defined dependency graph.
|
||||
|
||||
# Variables
|
||||
DATA = ./data
|
||||
JQ = lib/jq-linux64
|
||||
|
||||
# How to build the build.ninja file itself. (Use the Makefile to get it the
|
||||
# first time.)
|
||||
@ -92,10 +92,13 @@ rule merge
|
||||
command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in
|
||||
|
||||
rule merge_counts
|
||||
command = python -m wordfreq_builder.cli.merge_counts -o $out $in
|
||||
command = python -m wordfreq_builder.cli.merge_counts -o $out -c $cutoff $in
|
||||
|
||||
rule freqs2cB
|
||||
command = python -m wordfreq_builder.cli.freqs_to_cB $in $out
|
||||
|
||||
rule cat
|
||||
command = cat $in > $out
|
||||
|
||||
# Extract comment bodies from a bzip2-compressed Reddit JSON dump, drop lines
# containing '[deleted]', unescape the HTML entities &gt; &lt; &amp; (ampersand
# last, so that "&amp;lt;" is not unescaped twice), and gzip the result.
rule extract_reddit
  command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' | gzip -c > $out
|
||||
|
@ -2,10 +2,10 @@ from wordfreq_builder.word_counts import read_values, merge_counts, write_wordli
|
||||
import argparse
|
||||
|
||||
|
||||
def merge_lists(input_names, output_name):
|
||||
def merge_lists(input_names, output_name, cutoff=0):
|
||||
count_dicts = []
|
||||
for input_name in input_names:
|
||||
values, total = read_values(input_name, cutoff=0)
|
||||
values, total = read_values(input_name, cutoff=cutoff, max_size=1000000)
|
||||
count_dicts.append(values)
|
||||
merged = merge_counts(count_dicts)
|
||||
write_wordlist(merged, output_name)
|
||||
@ -13,8 +13,12 @@ def merge_lists(input_names, output_name):
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
|
||||
parser.add_argument('inputs', help='names of input files to merge', nargs='+')
|
||||
parser.add_argument('-o', '--output', default='combined-counts.csv',
|
||||
help='filename to write the output to')
|
||||
parser.add_argument('-c', '--cutoff', type=int, default=0,
                    help='minimum count to read from an input file')
|
||||
parser.add_argument('inputs', nargs='+',
|
||||
help='names of input files to merge')
|
||||
args = parser.parse_args()
|
||||
merge_lists(args.inputs, args.output)
|
||||
merge_lists(args.inputs, args.output, cutoff=args.cutoff)
|
||||
|
||||
|
@ -18,10 +18,14 @@ def merge_lists(input_names, output_name, cutoff, lang):
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
|
||||
parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
|
||||
parser.add_argument('-l', '--language', help='language code for which language the words are in', default=None)
|
||||
parser.add_argument('inputs', help='names of input files to merge', nargs='+')
|
||||
parser.add_argument('-o', '--output', default='combined-freqs.csv',
|
||||
help='filename to write the output to')
|
||||
parser.add_argument('-c', '--cutoff', type=int, default=2,
|
||||
help='stop after seeing a count below this')
|
||||
parser.add_argument('-l', '--language', default=None,
|
||||
help='language code for which language the words are in')
|
||||
parser.add_argument('inputs', nargs='+',
|
||||
help='names of input files to merge')
|
||||
args = parser.parse_args()
|
||||
merge_lists(args.inputs, args.output, args.cutoff, args.language)
|
||||
|
||||
|
14
wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py
Normal file
14
wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py
Normal file
@ -0,0 +1,14 @@
|
||||
from wordfreq_builder.tokenizers import cld2_reddit_tokenizer, tokenize_by_language
|
||||
import argparse
|
||||
|
||||
|
||||
def main():
    """
    Command-line entry point: take an input filename and an output prefix
    from the arguments, then tokenize the input with the Reddit tokenizer.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        'filename',
        help='filename of input file containing one comment per line'
    )
    arg_parser.add_argument('outprefix', help='prefix of output filenames')
    options = arg_parser.parse_args()
    tokenize_by_language(
        options.filename, options.outprefix, tokenizer=cld2_reddit_tokenizer
    )
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
@ -1,4 +1,4 @@
|
||||
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_twitter
|
||||
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_by_language
|
||||
import argparse
|
||||
|
||||
|
||||
@ -7,7 +7,7 @@ def main():
|
||||
parser.add_argument('filename', help='filename of input file containing one tweet per line')
|
||||
parser.add_argument('outprefix', help='prefix of output filenames')
|
||||
args = parser.parse_args()
|
||||
tokenize_twitter(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
|
||||
tokenize_by_language(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
@ -40,7 +40,8 @@ CONFIG = {
|
||||
],
|
||||
'subtlex-en': ['en'],
|
||||
'subtlex-other': ['de', 'nl', 'zh'],
|
||||
'jieba': ['zh']
|
||||
'jieba': ['zh'],
|
||||
'reddit': ['en'],
|
||||
},
|
||||
# Subtlex languages that need to be pre-processed
|
||||
'wordlist_paths': {
|
||||
@ -52,6 +53,7 @@ CONFIG = {
|
||||
'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
|
||||
'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}',
|
||||
'jieba': 'generated/jieba/jieba_{lang}.{ext}',
|
||||
'reddit': 'generated/reddit/reddit_{lang}.{ext}',
|
||||
'combined': 'generated/combined/combined_{lang}.{ext}',
|
||||
'combined-dist': 'dist/combined_{lang}.{ext}',
|
||||
'twitter-dist': 'dist/twitter_{lang}.{ext}',
|
||||
|
@ -77,6 +77,10 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
|
||||
data_filename('source-lists/subtlex'),
|
||||
CONFIG['sources']['subtlex-other']
|
||||
),
|
||||
reddit_deps(
|
||||
data_filename('raw-input/reddit'),
|
||||
CONFIG['sources']['reddit']
|
||||
),
|
||||
jieba_deps(
|
||||
data_filename('source-lists/jieba'),
|
||||
CONFIG['sources']['jieba']
|
||||
@ -232,6 +236,30 @@ def jieba_deps(dirname_in, languages):
|
||||
return lines
|
||||
|
||||
|
||||
def reddit_deps(dirname_in, languages):
    """
    Build the dependency lines for the Reddit corpus.

    Every file matching `*/*.bz2` under `dirname_in` gets an `extract_reddit`
    step that produces a `.txt.gz` file, followed by a `count` step that
    produces a `.counts.txt` file. The per-file counts are then combined by
    a single `merge_counts` step with a cutoff of 3.

    Returns the list of lines accumulated through `add_dep`, or an empty
    list when `languages` is empty. Only `['en']` is supported; any other
    non-empty value fails the assertion.
    """
    lines = []
    if not languages:
        return lines
    assert languages == ['en']

    processed_files = []
    path_in = pathlib.Path(dirname_in)
    # glob() yields entries in a filesystem-dependent order. Sort them so the
    # generated build file (and the input order of the merge step) is the
    # same from one run to the next.
    for filepath in sorted(path_in.glob('*/*.bz2')):
        # Strip the '.bz2' suffix to get the base name of the outputs.
        base = filepath.name[:-4]
        transformed_file = wordlist_filename('reddit', 'en', base + '.txt.gz')
        add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
        count_file = wordlist_filename('reddit', 'en', base + '.counts.txt')
        add_dep(lines, 'count', transformed_file, count_file)
        processed_files.append(count_file)

    output_file = wordlist_filename('reddit', 'en', 'counts.txt')
    add_dep(
        lines, 'merge_counts', processed_files, output_file,
        params={'cutoff': 3}
    )
    return lines
|
||||
|
||||
|
||||
# Which columns of the SUBTLEX data files do the word and its frequency appear
|
||||
# in?
|
||||
SUBTLEX_COLUMN_MAP = {
|
||||
@ -264,7 +292,10 @@ def subtlex_en_deps(dirname_in, languages):
|
||||
)
|
||||
|
||||
output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt')
|
||||
add_dep(lines, 'merge_counts', processed_files, output_file)
|
||||
add_dep(
|
||||
lines, 'merge_counts', processed_files, output_file,
|
||||
params={'cutoff': 0}
|
||||
)
|
||||
|
||||
return lines
|
||||
|
||||
@ -292,7 +323,8 @@ def subtlex_other_deps(dirname_in, languages):
|
||||
params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2}
|
||||
)
|
||||
add_dep(
|
||||
lines, 'merge_counts', processed_file, output_file
|
||||
lines, 'merge_counts', processed_file, output_file,
|
||||
params={'cutoff': 0}
|
||||
)
|
||||
return lines
|
||||
|
||||
|
@ -22,6 +22,8 @@ CLD2_BAD_CHARS_RE = regex.compile(CLD2_BAD_CHAR_RANGE)
|
||||
|
||||
TWITTER_HANDLE_RE = regex.compile(r'@[\S--\p{punct}]+')
|
||||
TCO_RE = regex.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
|
||||
URL_RE = regex.compile(r'http(?:s)?://[^) ]*')
|
||||
MARKDOWN_URL_RESIDUE_RE = regex.compile(r'\]\(\)')
|
||||
|
||||
|
||||
def cld2_surface_tokenizer(text):
|
||||
@ -31,6 +33,7 @@ def cld2_surface_tokenizer(text):
|
||||
text = unescape_html(text)
|
||||
text = TWITTER_HANDLE_RE.sub('', text)
|
||||
text = TCO_RE.sub('', text)
|
||||
|
||||
lang = cld2_detect_language(text)
|
||||
|
||||
# Don't allow tokenization in Chinese when language-detecting, because
|
||||
@ -42,6 +45,32 @@ def cld2_surface_tokenizer(text):
|
||||
return lang, tokens
|
||||
|
||||
|
||||
# Low-frequency languages tend to be detected incorrectly. Keep a limited
|
||||
# list of languages we're allowed to use here.
|
||||
KEEP_THESE_LANGUAGES = {
|
||||
'ar', 'de', 'el', 'en', 'es', 'fr', 'hr', 'id', 'it', 'ja', 'ko', 'ms',
|
||||
'nl', 'pl', 'pt', 'ro', 'ru', 'sv'
|
||||
}
|
||||
|
||||
|
||||
def cld2_reddit_tokenizer(text):
    """
    Tokenize a Reddit comment, detecting its language with CLD2.

    URLs are removed first, and the empty `()` that a Markdown link leaves
    behind once its URL is gone is removed as well. Returns a
    (language, tokens) pair.
    """
    without_urls = URL_RE.sub('', text)
    cleaned = MARKDOWN_URL_RESIDUE_RE.sub(']', without_urls)

    detected = cld2_detect_language(cleaned)
    # Reddit is 99.9% English, so if we detected a rare language, it's
    # much more likely that it's actually English.
    lang = detected if detected in KEEP_THESE_LANGUAGES else 'en'

    return lang, tokenize(cleaned, lang, include_punctuation=True)
|
||||
|
||||
|
||||
def cld2_detect_language(text):
|
||||
"""
|
||||
Uses CLD2 to detect the language.
|
||||
@ -59,13 +88,11 @@ def cld2_detect_language(text):
|
||||
return pycld2.detect(text)[2][0][1]
|
||||
|
||||
|
||||
def tokenize_twitter(in_filename, out_prefix, tokenizer):
|
||||
def tokenize_by_language(in_filename, out_prefix, tokenizer):
|
||||
"""
|
||||
Process a file by running it through the Twitter-specific tokenizer,
|
||||
which uses cld2 for language detection, and removes Twitter handles
|
||||
and t.co URLs.
|
||||
Process a file by running it through a given tokenizer.
|
||||
|
||||
Produces output files that are separated by language, with newlines
|
||||
Produces output files that are separated by language, with spaces
|
||||
between the tokens.
|
||||
"""
|
||||
out_files = {}
|
||||
@ -74,7 +101,7 @@ def tokenize_twitter(in_filename, out_prefix, tokenizer):
|
||||
text = line.split('\t')[-1].strip()
|
||||
language, tokens = tokenizer(text)
|
||||
if language != 'un':
|
||||
tokenized = '\n'.join(tokens)
|
||||
tokenized = ' '.join(tokens)
|
||||
out_filename = '%s.%s.txt' % (out_prefix, language)
|
||||
if out_filename in out_files:
|
||||
out_file = out_files[out_filename]
|
||||
|
@ -24,16 +24,19 @@ def count_tokens(filename):
|
||||
containing '<EFBFBD>'.
|
||||
"""
|
||||
counts = defaultdict(int)
|
||||
with open(filename, encoding='utf-8', errors='replace') as infile:
|
||||
if filename.endswith('gz'):
|
||||
infile = gzip.open(filename, 'rt', encoding='utf-8', errors='replace')
|
||||
else:
|
||||
infile = open(filename, encoding='utf-8', errors='replace')
|
||||
for line in infile:
|
||||
line = URL_RE.sub('', line.strip())
|
||||
for token in simple_tokenize(line):
|
||||
counts[token] += 1
|
||||
|
||||
infile.close()
|
||||
return counts
|
||||
|
||||
|
||||
def read_values(filename, cutoff=0, lang=None):
|
||||
def read_values(filename, cutoff=0, max_size=1e8, lang=None):
|
||||
"""
|
||||
Read words and their frequency or count values from a CSV file. Returns
|
||||
a dictionary of values and the total of all values.
|
||||
@ -52,7 +55,7 @@ def read_values(filename, cutoff=0, lang=None):
|
||||
for key, strval in csv.reader(infile):
|
||||
val = float(strval)
|
||||
key = fix_text(key)
|
||||
if val < cutoff:
|
||||
if val < cutoff or len(values) >= max_size:
|
||||
break
|
||||
tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
|
||||
for token in tokens:
|
||||
@ -76,7 +79,7 @@ def read_freqs(filename, cutoff=0, lang=None):
|
||||
If lang is given, read_freqs will apply language specific preprocessing
|
||||
operations.
|
||||
"""
|
||||
values, total = read_values(filename, cutoff, lang)
|
||||
values, total = read_values(filename, cutoff, lang=lang)
|
||||
for word in values:
|
||||
values[word] /= total
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user