mirror of
https://github.com/rspeer/wordfreq.git
synced 2025-01-14 13:15:59 +00:00
command-line entry points
This commit is contained in:
parent
8b322ce534
commit
23bd5ba76c
wordfreq_builder/wordfreq_builder
23
wordfreq_builder/wordfreq_builder/cmd_count_twitter.py
Normal file
23
wordfreq_builder/wordfreq_builder/cmd_count_twitter.py
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
from wordfreq_builder.word_counts import WordCountBuilder
|
||||||
|
from wordfreq_builder.tokenizers import rosette_tokenizer
|
||||||
|
from pathlib import Path
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
|
def count_twitter(pathname, offset=0, nsplit=1):
    """
    Count the words in one slice of a file of tweets and save the counts.

    The input file is expected to contain one tweet per line. The slicing
    itself is delegated to `WordCountBuilder.count_twitter`, which receives
    `offset` and `nsplit` unchanged. The resulting word list is saved as
    `twitter-counts-<offset>.csv` in the same directory as the input file.

    pathname: path of the input file, as a string or path-like object.
    offset: which slice to count; must satisfy 0 <= offset < nsplit.
    nsplit: the total number of slices; must be at least 1.

    Raises ValueError if `offset` and `nsplit` do not describe a valid slice.
    """
    # Reject impossible slices up front. Without this check, nsplit == 0
    # fails later with a ZeroDivisionError, and an out-of-range offset matches
    # no lines at all, so an empty word list would be saved silently.
    if nsplit < 1:
        raise ValueError("nsplit must be at least 1, got %r" % (nsplit,))
    if not 0 <= offset < nsplit:
        raise ValueError(
            "offset must be in the range [0, %d), got %r" % (nsplit, offset)
        )

    path = Path(pathname)
    builder = WordCountBuilder(tokenizer=rosette_tokenizer)
    save_filename = 'twitter-counts-%d.csv' % offset
    save_pathname = path.parent / save_filename
    builder.count_twitter(path, offset, nsplit)
    builder.save_wordlist(save_pathname)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command-line entry point: count the words in one slice of a tweet file.
    parser = argparse.ArgumentParser(
        description='Count the words in one slice of a file of tweets.'
    )
    parser.add_argument('filename', help='filename of input file containing one tweet per line')
    parser.add_argument(
        'offset', type=int,
        help='which slice to count: a line is counted when its zero-based '
             'line number modulo nsplit equals this value'
    )
    parser.add_argument(
        'nsplit', type=int,
        help='total number of slices the input file is divided into'
    )
    args = parser.parse_args()
    count_twitter(args.filename, args.offset, args.nsplit)
|
||||||
|
|
19
wordfreq_builder/wordfreq_builder/cmd_count_wikipedia.py
Normal file
19
wordfreq_builder/wordfreq_builder/cmd_count_wikipedia.py
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
from wordfreq_builder.word_counts import WordCountBuilder
|
||||||
|
from wordfreq_builder.tokenizers import rosette_tokenizer
|
||||||
|
from pathlib import Path
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
|
def count_wikipedia(pathname):
    """
    Count the words found under a directory of Wikipedia text.

    A `WordCountBuilder` with default settings does the counting, and the
    resulting word list is saved as `counts.csv` inside that same directory.

    pathname: the directory to read, as a string or path-like object.
    """
    wiki_dir = Path(pathname)
    output_file = wiki_dir / 'counts.csv'
    counter = WordCountBuilder()
    counter.count_wikipedia(wiki_dir)
    counter.save_wordlist(output_file)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command-line entry point: takes the directory to count as its only
    # argument and hands it to count_wikipedia.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        'dir',
        help='directory containing extracted Wikipedia text',
    )
    options = arg_parser.parse_args()
    count_wikipedia(options.dir)
|
||||||
|
|
@ -1,6 +1,17 @@
|
|||||||
|
from lumi_science.text_readers.rosette_readers import RosetteReader
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level RosetteReader instance, constructed once when this module is
# imported; rosette_tokenizer calls its `rosette.analyze` method.
ROSETTE = RosetteReader()
|
||||||
|
|
||||||
|
|
||||||
|
def rosette_tokenizer(text):
    """
    Tokenize `text` with the shared Rosette reader, tagging each token with
    the language reported by the analysis.

    `ROSETTE.rosette.analyze` returns a pair: a sequence of three-item
    entries (unpacked here as stem, part of speech, and span) and a language
    value. The result is a list of strings of the form `stem|lang`, one per
    entry, in the order the analysis produced them.
    """
    analysis, lang = ROSETTE.rosette.analyze(text)
    tagged_stems = []
    # Only the stem of each entry is kept; the other two items are unused.
    for stem, _pos, _span in analysis:
        tagged_stems.append(stem + '|' + lang)
    return tagged_stems
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def treebank_tokenizer(text):
|
def treebank_tokenizer(text):
|
||||||
"""
|
"""
|
||||||
This is a simplified version of the Treebank tokenizer in NLTK.
|
This is a simplified version of the Treebank tokenizer in NLTK.
|
@ -1,4 +1,4 @@
|
|||||||
from wordfreq_builder.tokenize import treebank_tokenizer
|
from wordfreq_builder.tokenizers import treebank_tokenizer
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@ -17,7 +17,11 @@ class WordCountBuilder:
|
|||||||
|
|
||||||
def add_text(self, text):
|
def add_text(self, text):
|
||||||
text = normalize('NFKC', text).lower()
|
text = normalize('NFKC', text).lower()
|
||||||
|
try:
|
||||||
tokens = self.tokenizer(text)
|
tokens = self.tokenizer(text)
|
||||||
|
except Exception as e:
|
||||||
|
print("Couldn't tokenize due to %r: %s" % (e, text))
|
||||||
|
return
|
||||||
if self.unique_docs:
|
if self.unique_docs:
|
||||||
tokens = set(tokens)
|
tokens = set(tokens)
|
||||||
for tok in tokens:
|
for tok in tokens:
|
||||||
@ -37,8 +41,13 @@ class WordCountBuilder:
|
|||||||
buf.append(line)
|
buf.append(line)
|
||||||
self.try_wiki_article(' '.join(buf))
|
self.try_wiki_article(' '.join(buf))
|
||||||
|
|
||||||
#def count_twitter(self, path):
|
def count_twitter(self, path, offset, nsplit):
    """
    Add the text of every `nsplit`-th tweet in a file to the word counts.

    The file at `path` is read as UTF-8, one tweet per line. A line is used
    only when its zero-based line number modulo `nsplit` equals `offset`.
    Each selected line is stripped of surrounding whitespace, and the text
    after its last tab character (or the whole line, if it has no tab) is
    passed to `self.add_text`.
    """
    with path.open(encoding='utf-8') as tweets:
        for line_number, raw_line in enumerate(tweets):
            # Skip lines that belong to a different slice of the file.
            if line_number % nsplit != offset:
                continue
            fields = raw_line.strip().split('\t')
            self.add_text(fields[-1])
|
||||||
|
|
||||||
def try_wiki_article(self, text):
|
def try_wiki_article(self, text):
|
||||||
if len(text) > 1000:
|
if len(text) > 1000:
|
||||||
@ -55,12 +64,3 @@ class WordCountBuilder:
|
|||||||
writer.writerow([word, count])
|
writer.writerow([word, count])
|
||||||
|
|
||||||
|
|
||||||
def count_wikipedia(pathname):
|
|
||||||
path = Path(pathname)
|
|
||||||
builder = WordCountBuilder()
|
|
||||||
builder.count_wikipedia(path)
|
|
||||||
builder.save_wordlist(path / 'counts.csv')
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
count_wikipedia('/hd/data/wikipedia/wikipedia-extractor/fr.wikipedia.org')
|
|
||||||
|
Loading…
Reference in New Issue
Block a user