command-line entry points

Robyn Speer 2015-02-10 12:28:29 -05:00
parent 8b322ce534
commit 23bd5ba76c
4 changed files with 66 additions and 13 deletions

View File

@@ -0,0 +1,23 @@
from wordfreq_builder.word_counts import WordCountBuilder
from wordfreq_builder.tokenizers import rosette_tokenizer
from pathlib import Path
import argparse


def count_twitter(pathname, offset=0, nsplit=1):
    path = Path(pathname)
    builder = WordCountBuilder(tokenizer=rosette_tokenizer)
    save_filename = 'twitter-counts-%d.csv' % offset
    save_pathname = path.parent / save_filename
    builder.count_twitter(path, offset, nsplit)
    builder.save_wordlist(save_pathname)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help='filename of input file containing one tweet per line')
    parser.add_argument('offset', type=int)
    parser.add_argument('nsplit', type=int)
    args = parser.parse_args()
    count_twitter(args.filename, args.offset, args.nsplit)

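The offset and nsplit arguments shard the tweet file across parallel runs: a process only counts the lines whose index satisfies i % nsplit == offset, and each run saves its own twitter-counts-&lt;offset&gt;.csv next to the input file. A minimal standalone sketch of that round-robin split (illustrative only, not part of the commit; the file name is made up):

# Yield the tweet text for every nsplit-th line starting at `offset`,
# mirroring the loop that WordCountBuilder.count_twitter performs.
def shard_tweets(path, offset, nsplit):
    with open(path, encoding='utf-8') as file:
        for i, line in enumerate(file):
            if i % nsplit == offset:
                # The tweet text is the last tab-separated field on the line.
                yield line.strip().split('\t')[-1]

# Hypothetical usage: four workers, each launched with a different offset 0..3.
for text in shard_tweets('tweets.tsv', offset=0, nsplit=4):
    print(text)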
View File

@@ -0,0 +1,19 @@
from wordfreq_builder.word_counts import WordCountBuilder
from wordfreq_builder.tokenizers import rosette_tokenizer
from pathlib import Path
import argparse


def count_wikipedia(pathname):
    path = Path(pathname)
    builder = WordCountBuilder()
    builder.count_wikipedia(path)
    builder.save_wordlist(path / 'counts.csv')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('dir', help='directory containing extracted Wikipedia text')
    args = parser.parse_args()
    count_wikipedia(args.dir)

View File

@@ -1,6 +1,17 @@
from lumi_science.text_readers.rosette_readers import RosetteReader
import re

ROSETTE = RosetteReader()


def rosette_tokenizer(text):
    analysis, lang = ROSETTE.rosette.analyze(text)
    return [stem + '|' + lang for (stem, pos, span) in analysis]


def treebank_tokenizer(text):
    """
    This is a simplified version of the Treebank tokenizer in NLTK.

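rosette_tokenizer returns one string per stem, tagging it with the language Rosette detected for the text, so the same surface word is counted separately per language. A rough sketch of the resulting token shape, using an invented analysis result instead of a real Rosette call:

# Stand-in for the (stem, pos, span) triples a Rosette analysis returns
# (the values below are made up for illustration).
fake_analysis = [('cat', 'NOUN', (0, 4)), ('sit', 'VERB', (5, 8))]
lang = 'en'
tokens = [stem + '|' + lang for (stem, pos, span) in fake_analysis]
print(tokens)  # ['cat|en', 'sit|en']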
View File

@@ -1,4 +1,4 @@
-from wordfreq_builder.tokenize import treebank_tokenizer
+from wordfreq_builder.tokenizers import treebank_tokenizer
 from collections import defaultdict
 from operator import itemgetter
 from pathlib import Path
@@ -17,7 +17,11 @@ class WordCountBuilder:
     def add_text(self, text):
         text = normalize('NFKC', text).lower()
-        tokens = self.tokenizer(text)
+        try:
+            tokens = self.tokenizer(text)
+        except Exception as e:
+            print("Couldn't tokenize due to %r: %s" % (e, text))
+            return
         if self.unique_docs:
             tokens = set(tokens)
         for tok in tokens:
@@ -37,8 +41,13 @@ class WordCountBuilder:
                 buf.append(line)
             self.try_wiki_article(' '.join(buf))
 
-    #def count_twitter(self, path):
-    #    with path.open(encoding='utf-8') as file:
+    def count_twitter(self, path, offset, nsplit):
+        with path.open(encoding='utf-8') as file:
+            for i, line in enumerate(file):
+                if i % nsplit == offset:
+                    line = line.strip()
+                    text = line.split('\t')[-1]
+                    self.add_text(text)
 
     def try_wiki_article(self, text):
         if len(text) > 1000:
@@ -55,12 +64,3 @@ class WordCountBuilder:
             writer.writerow([word, count])
-
-
-def count_wikipedia(pathname):
-    path = Path(pathname)
-    builder = WordCountBuilder()
-    builder.count_wikipedia(path)
-    builder.save_wordlist(path / 'counts.csv')
-
-
-if __name__ == '__main__':
-    count_wikipedia('/hd/data/wikipedia/wikipedia-extractor/fr.wikipedia.org')
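
The try/except added around the tokenizer call in add_text keeps one malformed line (for example, text the Rosette reader refuses to analyze) from aborting an entire corpus pass; the failure is reported and the line is skipped. A self-contained sketch of that pattern with a stand-in tokenizer, purely illustrative rather than the commit's code:

def fragile_tokenizer(text):
    # Stand-in for a tokenizer that can fail on some inputs.
    if not text:
        raise ValueError('empty text')
    return text.split()

def count_tokens(lines):
    counts = {}
    for line in lines:
        try:
            tokens = fragile_tokenizer(line)
        except Exception as e:
            # Report the failure and move on rather than stopping the count.
            print("Couldn't tokenize due to %r: %s" % (e, line))
            continue
        for tok in tokens:
            counts[tok] = counts.get(tok, 0) + 1
    return counts

print(count_tokens(['a b a', '', 'b c']))  # {'a': 2, 'b': 2, 'c': 1}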