mirror of https://github.com/rspeer/wordfreq.git
synced 2024-12-23 17:31:41 +00:00

commit bc780c63c8 (parent 07e61be7e3)
enable wordlist balancing, surface form counting
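In summary, this diff adds a -b/--balance option to the wordlist-merging script, which rescales each input wordlist so that unequally-sampled sources contribute comparable weight to the merged counts, and a -s/--surface option to the Twitter and Wikipedia counting scripts, which counts surface forms (the text as written) instead of Rosette stems via a new rosette_surface_tokenizer.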
@@ -3,18 +3,19 @@ from pathlib import Path
 import argparse
 
 
-def merge_lists(input_names, output_name):
+def merge_lists(input_names, output_name, balance=False):
     count_dicts = []
     for input_name in input_names:
         count_dicts.append(read_counts(Path(input_name)))
-    merged = merge_counts(count_dicts)
+    merged = merge_counts(count_dicts, balance=balance)
     write_counts(merged, Path(output_name))
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
+    parser.add_argument('-b', '--balance', action='store_true', help='Automatically balance unequally-sampled word frequencies')
     parser.add_argument('inputs', help='names of input files to merge', nargs='+')
     args = parser.parse_args()
-    merge_lists(args.inputs, args.output)
+    merge_lists(args.inputs, args.output, balance=args.balance)
 
@@ -1,12 +1,16 @@
 from wordfreq_builder.word_counts import WordCountBuilder
-from wordfreq_builder.tokenizers import rosette_tokenizer
+from wordfreq_builder.tokenizers import rosette_tokenizer, rosette_surface_tokenizer
 from pathlib import Path
 import argparse
 
 
-def count_twitter(pathname, offset=0, nsplit=1):
+def count_twitter(pathname, offset=0, nsplit=1, surface=False):
     path = Path(pathname)
-    builder = WordCountBuilder(tokenizer=rosette_tokenizer)
+    if surface == True:
+        tokenizer = rosette_surface_tokenizer
+    else:
+        tokenizer = rosette_tokenizer
+    builder = WordCountBuilder(tokenizer=tokenizer)
     save_filename = 'twitter-counts-%d.csv' % offset
     save_pathname = path.parent / save_filename
     builder.count_twitter(path, offset, nsplit)
@@ -18,6 +22,7 @@ if __name__ == '__main__':
     parser.add_argument('filename', help='filename of input file containing one tweet per line')
     parser.add_argument('offset', type=int)
     parser.add_argument('nsplit', type=int)
+    parser.add_argument('-s', '--surface', action='store_true', help='Use surface text instead of stems')
     args = parser.parse_args()
-    count_twitter(args.filename, args.offset, args.nsplit)
+    count_twitter(args.filename, args.offset, args.nsplit, surface=args.surface)
 
@@ -1,12 +1,16 @@
 from wordfreq_builder.word_counts import WordCountBuilder
-from wordfreq_builder.tokenizers import rosette_tokenizer
+from wordfreq_builder.tokenizers import rosette_tokenizer, rosette_surface_tokenizer
 from pathlib import Path
 import argparse
 
 
-def count_wikipedia(pathname):
+def count_wikipedia(pathname, surface=False):
     path = Path(pathname)
-    builder = WordCountBuilder()
+    if surface == True:
+        tokenizer = rosette_surface_tokenizer
+    else:
+        tokenizer = rosette_tokenizer
+    builder = WordCountBuilder(tokenizer=tokenizer)
     builder.count_wikipedia(path)
     builder.save_wordlist(path / 'counts.csv')
 
@@ -14,6 +18,7 @@ def count_wikipedia(pathname):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('dir', help='directory containing extracted Wikipedia text')
+    parser.add_argument('-s', '--surface', action='store_true', help='Use surface text instead of stems')
     args = parser.parse_args()
-    count_wikipedia(args.dir)
+    count_wikipedia(args.dir, surface=args.surface)
 
@@ -7,12 +7,18 @@ ROSETTE = RosetteReader()
 
 def rosette_tokenizer(text):
     analysis, lang = ROSETTE.rosette.analyze(text)
+    # I'm aware this doesn't do the right things with multi-word stems.
+    # Wordfreq doesn't either. And wordfreq isn't designed to look up
+    # multiple words anyway.
     return [stem + '|' + lang for (stem, pos, span) in analysis]
 
 
+def rosette_surface_tokenizer(text):
+    analysis, lang = ROSETTE.rosette.analyze(text)
+    return [text[span[0]:span[1]] + '|' + lang for (stem, pos, span) in analysis]
+
+
-def treebank_tokenizer(text):
+def treebank_surface_tokenizer(text):
     """
     This is a simplified version of the Treebank tokenizer in NLTK.
 
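As an aside, here is a minimal standalone sketch (not part of this commit) of how the two Rosette tokenizers above differ. The (stem, pos, span) tuples and the language code are mocked up to match the shape that ROSETTE.rosette.analyze() is assumed to return, based only on how the diff uses it:

# Sketch only: the analysis below is hand-written, not real Rosette output.
text = "The cats slept"
lang = 'en'
analysis = [('the', 'DET', (0, 3)), ('cat', 'NOUN', (4, 8)), ('sleep', 'VERB', (9, 14))]

# Stemmed tokens, as rosette_tokenizer produces them.
stems = [stem + '|' + lang for (stem, pos, span) in analysis]
# Surface tokens, as rosette_surface_tokenizer produces them: the original
# text sliced by each token's character span.
surface = [text[span[0]:span[1]] + '|' + lang for (stem, pos, span) in analysis]

print(stems)    # ['the|en', 'cat|en', 'sleep|en']
print(surface)  # ['The|en', 'cats|en', 'slept|en']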
@@ -1,9 +1,10 @@
-from wordfreq_builder.tokenizers import treebank_tokenizer
+from wordfreq_builder.tokenizers import treebank_surface_tokenizer
 from collections import defaultdict
 from operator import itemgetter
 from pathlib import Path
 from unicodedata import normalize
 import csv
+import sys
 
 
 def read_counts(path):
@@ -11,7 +12,7 @@ def read_counts(path):
     with path.open(encoding='utf-8', newline='') as infile:
         reader = csv.reader(infile)
         for key, strval in reader:
-            val = int(strval)
+            val = float(strval)
             # Use += so that, if we give the reader concatenated files with
             # duplicates, it does the right thing
             counts[key] += val
@@ -27,11 +28,14 @@ def count_languages(counts):
     return langcounts
 
 
-def merge_counts(count_dicts):
-    merged = defaultdict(int)
+def merge_counts(count_dicts, balance=False):
+    merged = defaultdict(float)
     for counts in count_dicts:
+        weight = 1
+        if balance:
+            weight = 1e9 / max(counts.values()) / len(count_dicts)
         for key, val in counts.items():
-            merged[key] += val
+            merged[key] += val * weight
     return merged
 
 
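A minimal, self-contained sketch (not part of the commit) of how the new balance weight behaves: each source is rescaled so that its largest count maps to 1e9 divided by the number of sources, which keeps a huge corpus from swamping a small one. The sample counts are hypothetical:

from collections import defaultdict

def merge_counts(count_dicts, balance=False):
    # Same logic as the diff above, reproduced so the example runs standalone.
    merged = defaultdict(float)
    for counts in count_dicts:
        weight = 1
        if balance:
            weight = 1e9 / max(counts.values()) / len(count_dicts)
        for key, val in counts.items():
            merged[key] += val * weight
    return merged

big = {'the': 5_000_000, 'cat': 20_000}   # hypothetical large corpus
small = {'the': 700, 'cat': 3}            # hypothetical small corpus
merged = merge_counts([big, small], balance=True)
print(merged['the'])   # 1e9: both sources peak at 5e8, so neither dominates
print(merged['cat'])   # ~2e6 + ~2.1e6: the small corpus still contributes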
@@ -52,7 +56,7 @@ class WordCountBuilder:
         self.counts = defaultdict(int)
         self.unique_docs = unique_docs
         if tokenizer is None:
-            self.tokenizer = treebank_tokenizer
+            self.tokenizer = treebank_surface_tokenizer
         else:
             self.tokenizer = tokenizer
 
@@ -60,8 +64,9 @@ class WordCountBuilder:
         text = normalize('NFKC', text).lower()
         try:
             tokens = self.tokenizer(text)
+            # print(' '.join(tokens))
         except Exception as e:
-            print("Couldn't tokenize due to %r: %s" % (e, text))
+            print("Couldn't tokenize due to %r: %s" % (e, text), file=sys.stderr)
             return
         if self.unique_docs:
             tokens = set(tokens)
@@ -69,6 +74,11 @@ class WordCountBuilder:
             self.counts[tok] += 1
 
     def count_wikipedia(self, path, glob='*/*'):
+        """
+        Read a directory of extracted Wikipedia articles. The articles can be
+        grouped together into files, in which case they should be separated by
+        lines beginning with ##.
+        """
         for filepath in sorted(path.glob(glob)):
             print(filepath)
             with filepath.open(encoding='utf-8') as file:
@@ -82,6 +92,10 @@ class WordCountBuilder:
                         buf.append(line)
                 self.try_wiki_article(' '.join(buf))
 
+    def try_wiki_article(self, text):
+        if len(text) > 1000:
+            self.add_text(text)
+
     def count_twitter(self, path, offset, nsplit):
         with path.open(encoding='utf-8') as file:
             for i, line in enumerate(file):
@@ -90,9 +104,5 @@ class WordCountBuilder:
                     text = line.split('\t')[-1]
                     self.add_text(text)
 
-    def try_wiki_article(self, text):
-        if len(text) > 1000:
-            self.add_text(text)
-
     def save_wordlist(self, path):
         write_counts(self.counts, path)