enable wordlist balancing, surface form counting

Robyn Speer 2015-02-17 13:43:22 -05:00
parent 07e61be7e3
commit bc780c63c8
5 changed files with 50 additions and 23 deletions

View File

@@ -3,18 +3,19 @@ from pathlib import Path
 import argparse
 
-def merge_lists(input_names, output_name):
+def merge_lists(input_names, output_name, balance=False):
     count_dicts = []
     for input_name in input_names:
         count_dicts.append(read_counts(Path(input_name)))
-    merged = merge_counts(count_dicts)
+    merged = merge_counts(count_dicts, balance=balance)
     write_counts(merged, Path(output_name))
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
+    parser.add_argument('-b', '--balance', action='store_true', help='Automatically balance unequally-sampled word frequencies')
     parser.add_argument('inputs', help='names of input files to merge', nargs='+')
     args = parser.parse_args()
-    merge_lists(args.inputs, args.output)
+    merge_lists(args.inputs, args.output, balance=args.balance)

View File

@@ -1,12 +1,16 @@
 from wordfreq_builder.word_counts import WordCountBuilder
-from wordfreq_builder.tokenizers import rosette_tokenizer
+from wordfreq_builder.tokenizers import rosette_tokenizer, rosette_surface_tokenizer
 from pathlib import Path
 import argparse
 
-def count_twitter(pathname, offset=0, nsplit=1):
+def count_twitter(pathname, offset=0, nsplit=1, surface=False):
     path = Path(pathname)
-    builder = WordCountBuilder(tokenizer=rosette_tokenizer)
+    if surface == True:
+        tokenizer = rosette_surface_tokenizer
+    else:
+        tokenizer = rosette_tokenizer
+    builder = WordCountBuilder(tokenizer=tokenizer)
     save_filename = 'twitter-counts-%d.csv' % offset
     save_pathname = path.parent / save_filename
     builder.count_twitter(path, offset, nsplit)
@@ -18,6 +22,7 @@ if __name__ == '__main__':
     parser.add_argument('filename', help='filename of input file containing one tweet per line')
     parser.add_argument('offset', type=int)
     parser.add_argument('nsplit', type=int)
+    parser.add_argument('-s', '--surface', action='store_true', help='Use surface text instead of stems')
     args = parser.parse_args()
-    count_twitter(args.filename, args.offset, args.nsplit)
+    count_twitter(args.filename, args.offset, args.nsplit, surface=args.surface)

View File

@@ -1,12 +1,16 @@
 from wordfreq_builder.word_counts import WordCountBuilder
-from wordfreq_builder.tokenizers import rosette_tokenizer
+from wordfreq_builder.tokenizers import rosette_tokenizer, rosette_surface_tokenizer
 from pathlib import Path
 import argparse
 
-def count_wikipedia(pathname):
+def count_wikipedia(pathname, surface=False):
     path = Path(pathname)
-    builder = WordCountBuilder()
+    if surface == True:
+        tokenizer = rosette_surface_tokenizer
+    else:
+        tokenizer = rosette_tokenizer
+    builder = WordCountBuilder(tokenizer=tokenizer)
     builder.count_wikipedia(path)
     builder.save_wordlist(path / 'counts.csv')
@@ -14,6 +18,7 @@ def count_wikipedia(pathname):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('dir', help='directory containing extracted Wikipedia text')
+    parser.add_argument('-s', '--surface', action='store_true', help='Use surface text instead of stems')
     args = parser.parse_args()
-    count_wikipedia(args.dir)
+    count_wikipedia(args.dir, surface=args.surface)

View File

@@ -7,12 +7,18 @@ ROSETTE = RosetteReader()
 def rosette_tokenizer(text):
     analysis, lang = ROSETTE.rosette.analyze(text)
+    # I'm aware this doesn't do the right things with multi-word stems.
+    # Wordfreq doesn't either. And wordfreq isn't designed to look up
+    # multiple words anyway.
     return [stem + '|' + lang for (stem, pos, span) in analysis]
 
+def rosette_surface_tokenizer(text):
+    analysis, lang = ROSETTE.rosette.analyze(text)
+    return [text[span[0]:span[1]] + '|' + lang for (stem, pos, span) in analysis]
+
-def treebank_tokenizer(text):
+def treebank_surface_tokenizer(text):
     """
     This is a simplified version of the Treebank tokenizer in NLTK.
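Note: rosette_surface_tokenizer keeps the surface form by slicing the original text with the character spans Rosette returns, rather than using the stem. A minimal stand-alone illustration of that slicing, using a hand-built analysis in the assumed (stem, pos, span) shape (Rosette itself is not called here, and the language tag is made up):

    text = "Cats purred"
    # Hypothetical analysis tuples in the (stem, pos, span) shape assumed above.
    analysis = [('cat', 'NOUN', (0, 4)), ('purr', 'VERB', (5, 11))]
    lang = 'en'

    stems = [stem + '|' + lang for (stem, pos, span) in analysis]
    surfaces = [text[span[0]:span[1]] + '|' + lang for (stem, pos, span) in analysis]

    print(stems)     # ['cat|en', 'purr|en']
    print(surfaces)  # ['Cats|en', 'purred|en']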

View File

@@ -1,9 +1,10 @@
-from wordfreq_builder.tokenizers import treebank_tokenizer
+from wordfreq_builder.tokenizers import treebank_surface_tokenizer
 from collections import defaultdict
 from operator import itemgetter
 from pathlib import Path
 from unicodedata import normalize
 import csv
+import sys
 
 
 def read_counts(path):
@@ -11,7 +12,7 @@ def read_counts(path):
     with path.open(encoding='utf-8', newline='') as infile:
         reader = csv.reader(infile)
         for key, strval in reader:
-            val = int(strval)
+            val = float(strval)
             # Use += so that, if we give the reader concatenated files with
             # duplicates, it does the right thing
             counts[key] += val
@@ -27,11 +28,14 @@ def count_languages(counts):
     return langcounts
 
 
-def merge_counts(count_dicts):
-    merged = defaultdict(int)
+def merge_counts(count_dicts, balance=False):
+    merged = defaultdict(float)
     for counts in count_dicts:
+        weight = 1
+        if balance:
+            weight = 1e9 / max(counts.values()) / len(count_dicts)
         for key, val in counts.items():
-            merged[key] += val
+            merged[key] += val * weight
     return merged
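Note: with balance=True, each input list is rescaled so that its largest count becomes 1e9 divided by the number of lists, which puts unequally sampled sources on a comparable scale before they are summed (and is why read_counts now parses floats). A small worked example with made-up numbers, assuming wordfreq_builder is importable:

    from wordfreq_builder.word_counts import merge_counts

    # Two word lists sampled at very different scales (toy values).
    big = {'the': 1000000, 'cat': 10000}
    small = {'the': 50, 'cat': 2}

    # Each list's weight is 1e9 / max(counts) / 2, so both lists map their
    # most frequent word to 5e8 before summing.
    balanced = merge_counts([big, small], balance=True)
    print(balanced['the'])   # 1e9   (5e8 from each list)
    print(balanced['cat'])   # 2.5e7 (5e8 * 0.01 + 5e8 * 0.04)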
@@ -52,7 +56,7 @@ class WordCountBuilder:
         self.counts = defaultdict(int)
         self.unique_docs = unique_docs
         if tokenizer is None:
-            self.tokenizer = treebank_tokenizer
+            self.tokenizer = treebank_surface_tokenizer
         else:
             self.tokenizer = tokenizer
@@ -60,8 +64,9 @@ class WordCountBuilder:
         text = normalize('NFKC', text).lower()
         try:
             tokens = self.tokenizer(text)
+            # print(' '.join(tokens))
         except Exception as e:
-            print("Couldn't tokenize due to %r: %s" % (e, text))
+            print("Couldn't tokenize due to %r: %s" % (e, text), file=sys.stderr)
             return
         if self.unique_docs:
             tokens = set(tokens)
@@ -69,6 +74,11 @@ class WordCountBuilder:
             self.counts[tok] += 1
 
     def count_wikipedia(self, path, glob='*/*'):
+        """
+        Read a directory of extracted Wikipedia articles. The articles can be
+        grouped together into files, in which case they should be separated by
+        lines beginning with ##.
+        """
         for filepath in sorted(path.glob(glob)):
             print(filepath)
             with filepath.open(encoding='utf-8') as file:
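Note: the new docstring describes input files that hold several articles separated by lines starting with ##. A stand-alone sketch of reading that format (hypothetical sample text; this only re-implements the splitting, not the builder itself):

    sample = (
        "## Article: Cats\n"
        "Cats are small carnivorous mammals.\n"
        "## Article: Dogs\n"
        "Dogs are domesticated wolves.\n"
    )

    articles, buf = [], []
    for line in sample.splitlines():
        if line.startswith('##'):
            if buf:
                articles.append(' '.join(buf))
            buf = []
        else:
            buf.append(line.strip())
    if buf:
        articles.append(' '.join(buf))

    print(articles)  # one string per '##' section, two in this sample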
@@ -82,6 +92,10 @@ class WordCountBuilder:
                         buf.append(line)
                 self.try_wiki_article(' '.join(buf))
 
+    def try_wiki_article(self, text):
+        if len(text) > 1000:
+            self.add_text(text)
+
     def count_twitter(self, path, offset, nsplit):
         with path.open(encoding='utf-8') as file:
             for i, line in enumerate(file):
@@ -90,9 +104,5 @@ class WordCountBuilder:
                     text = line.split('\t')[-1]
                     self.add_text(text)
 
-    def try_wiki_article(self, text):
-        if len(text) > 1000:
-            self.add_text(text)
-
     def save_wordlist(self, path):
         write_counts(self.counts, path)