commit 8b322ce534899a68455546e0433012331a30b1a7
Author: Robyn Speer
Date:   Wed Feb 4 20:19:36 2015 -0500

    Initial commit

diff --git a/wordfreq_builder/.gitignore b/wordfreq_builder/.gitignore
new file mode 100644
index 0000000..46c58ff
--- /dev/null
+++ b/wordfreq_builder/.gitignore
@@ -0,0 +1,8 @@
+*.pyc
+__pycache__
+.coverage
+.idea
+dist
+*.egg-info
+build
+_build
diff --git a/wordfreq_builder/setup.py b/wordfreq_builder/setup.py
new file mode 100755
index 0000000..e57c58e
--- /dev/null
+++ b/wordfreq_builder/setup.py
@@ -0,0 +1,12 @@
+from setuptools import setup
+
+setup(
+    name="wordfreq_builder",
+    version='0.1',
+    maintainer='Luminoso Technologies, Inc.',
+    maintainer_email='info@luminoso.com',
+    url='http://github.com/LuminosoInsight/wordfreq_builder',
+    platforms=["any"],
+    description="Turns raw data into word frequency lists",
+    packages=['wordfreq_builder'],
+)
diff --git a/wordfreq_builder/wordfreq_builder/__init__.py b/wordfreq_builder/wordfreq_builder/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/wordfreq_builder/wordfreq_builder/tests/test_tokenizer.py b/wordfreq_builder/wordfreq_builder/tests/test_tokenizer.py
new file mode 100644
index 0000000..7715831
--- /dev/null
+++ b/wordfreq_builder/wordfreq_builder/tests/test_tokenizer.py
@@ -0,0 +1,21 @@
+from wordfreq_builder.tokenize import treebank_tokenizer
+from nose.tools import eq_
+
+
+def test_tokenizer_1():
+    text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
+    tokens = [
+        "``", 'This', 'is', 'a', 'test', ',', "''", 'she', 'said', ',',
+        "``", 'and', 'I', "'ll", 'bet', "y'all", '$', '3.50', 'that',
+        'it', 'wo', "n't", 'fail', '.', "''"
+    ]
+    eq_(treebank_tokenizer(text), tokens)
+
+
+def test_tokenizer_2():
+    text = "i use punctuation informally...see?like this."
+    tokens = [
+        'i', 'use', 'punctuation', 'informally', '...', 'see', '?',
+        'like', 'this', '.'
+    ]
+    eq_(treebank_tokenizer(text), tokens)
diff --git a/wordfreq_builder/wordfreq_builder/tokenize.py b/wordfreq_builder/wordfreq_builder/tokenize.py
new file mode 100644
index 0000000..608f37f
--- /dev/null
+++ b/wordfreq_builder/wordfreq_builder/tokenize.py
@@ -0,0 +1,57 @@
+import re
+
+
+def treebank_tokenizer(text):
+    """
+    This is a simplified version of the Treebank tokenizer in NLTK.
+
+    NLTK's version depends on the text first having been sentence-tokenized
+    using Punkt, which is a statistical model that we'd rather not implement
+    here. The main reason to use Punkt first is to disambiguate periods that
+    are sentence-ending from those that are part of abbreviations.
+
+    NLTK's tokenizer thus assumes that any periods that appear in the middle
+    of the text are meant to be there, and leaves them attached to words. We
+    can skip the complication of Punkt at the cost of altering abbreviations
+    such as "U.S.".
+
+    NLTK also splits contractions that lack apostrophes, giving pseudo-words
+    as a result -- for example, it splits "wanna" into "wan" and "na", which
+    are supposed to be considered unusual surface forms of "want" and "to".
+    We just leave it as the word "wanna".
+    """
+    # starting quotes
+    text = re.sub(r'^\"', r'``', text)
+    text = re.sub(r'(``)', r' \1 ', text)
+    text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)
+
+    # punctuation
+    text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
+    text = re.sub(r'\.\.\.', r' ... ', text)
+    text = re.sub(r'[;@#$%&]', r' \g<0> ', text)
+
+    # The following rule was modified from NLTK, which only separated periods
+    # at the end of the text. We simply made whitespace an alternative to the
+    # text-ending symbol $.
+    text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)(\s|$)', r'\1 \2\3 ', text)
+    text = re.sub(r'[?!]', r' \g<0> ', text)
+
+    text = re.sub(r"([^'])' ", r"\1 ' ", text)
+
+    # parens, brackets, etc.
+    text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
+    text = re.sub(r'--', r' -- ', text)
+
+    # add extra space to make things easier
+    text = " " + text + " "
+
+    # ending quotes
+    text = re.sub(r'"', " '' ", text)
+    text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)
+
+    # contractions
+    text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)
+    text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ",
+                  text)
+
+    return text.split()
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
new file mode 100644
index 0000000..cb70256
--- /dev/null
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -0,0 +1,66 @@
+from wordfreq_builder.tokenize import treebank_tokenizer
+from collections import defaultdict
+from operator import itemgetter
+from pathlib import Path
+from unicodedata import normalize
+import csv
+
+
+class WordCountBuilder:
+    def __init__(self, unique_docs=True, tokenizer=None):
+        self.counts = defaultdict(int)
+        self.unique_docs = unique_docs
+        if tokenizer is None:
+            self.tokenizer = treebank_tokenizer
+        else:
+            self.tokenizer = tokenizer
+
+    def add_text(self, text):
+        text = normalize('NFKC', text).lower()
+        tokens = self.tokenizer(text)
+        if self.unique_docs:
+            tokens = set(tokens)
+        for tok in tokens:
+            self.counts[tok] += 1
+
+    def count_wikipedia(self, path, glob='*/*'):
+        for filepath in sorted(path.glob(glob)):
+            print(filepath)
+            with filepath.open(encoding='utf-8') as file:
+                buf = []
+                for line in file:
+                    line = line.strip()
+                    if line.startswith('##'):
+                        self.try_wiki_article(' '.join(buf))
+                        buf = []
+                    else:
+                        buf.append(line)
+                self.try_wiki_article(' '.join(buf))
+
+    #def count_twitter(self, path):
+    #    with path.open(encoding='utf-8') as file:
+
+    def try_wiki_article(self, text):
+        if len(text) > 1000:
+            self.add_text(text)
+
+    def save_wordlist(self, path):
+        with path.open('w', encoding='utf-8', newline='') as outfile:
+            writer = csv.writer(outfile)
+            items = sorted(self.counts.items(), key=itemgetter(1), reverse=True)
+            for word, count in items:
+                if count <= 1:
+                    # Don't write all the terms that appeared only once
+                    break
+                writer.writerow([word, count])
+
+
+def count_wikipedia(pathname):
+    path = Path(pathname)
+    builder = WordCountBuilder()
+    builder.count_wikipedia(path)
+    builder.save_wordlist(path / 'counts.csv')
+
+
+if __name__ == '__main__':
+    count_wikipedia('/hd/data/wikipedia/wikipedia-extractor/fr.wikipedia.org')
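
For reference, a minimal usage sketch of the code this commit introduces (the
sketch itself is not part of the commit). It assumes the package has been
installed so that wordfreq_builder is importable; the sample texts and the
/tmp/counts.csv output path are invented for illustration.

    from pathlib import Path

    from wordfreq_builder.tokenize import treebank_tokenizer
    from wordfreq_builder.word_counts import WordCountBuilder

    # Tokenize a sentence; contractions split the way the tests expect.
    print(treebank_tokenizer("I won't pay $3.50 for this."))
    # ['I', 'wo', "n't", 'pay', '$', '3.50', 'for', 'this', '.']

    # Count tokens across a few (made-up) documents. With unique_docs=True,
    # each token is counted at most once per add_text() call, so the counts
    # are document frequencies rather than raw token frequencies.
    builder = WordCountBuilder(unique_docs=True)
    builder.add_text("the cat sat on the mat")
    builder.add_text("the dog sat on the log")

    print(builder.counts['the'])  # 2 -- once per document, not four times
    print(builder.counts['cat'])  # 1

    # save_wordlist() writes "word,count" rows, most frequent first, and
    # stops at words that appeared only once -- so 'cat' is not written.
    builder.save_wordlist(Path('/tmp/counts.csv'))  # hypothetical output path

Because the counts are document frequencies, a word that appears many times
in a single Wikipedia article is not over-weighted relative to a word that
appears once in each of many articles.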