Initial commit

Robyn Speer 2015-02-04 20:19:36 -05:00
commit 8b322ce534
6 changed files with 164 additions and 0 deletions

wordfreq_builder/.gitignore (new file, 8 additions)

@@ -0,0 +1,8 @@
*.pyc
__pycache__
.coverage
.idea
dist
*.egg-info
build
_build

wordfreq_builder/setup.py (new executable file, 12 additions)

@@ -0,0 +1,12 @@
from setuptools import setup

setup(
    name="wordfreq_builder",
    version='0.1',
    maintainer='Luminoso Technologies, Inc.',
    maintainer_email='info@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq_builder',
    platforms=["any"],
    description="Turns raw data into word frequency lists",
    packages=['wordfreq_builder'],
)
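As an aside, one quick way to confirm that the package metadata above was picked up after installing it (for example with pip install -e . from the wordfreq_builder directory) is to query the installed distribution. This is only a sketch using pkg_resources, which ships with setuptools; the install step itself is assumed:

import pkg_resources

# Hypothetical check, assuming the package defined above has been installed.
dist = pkg_resources.get_distribution('wordfreq_builder')
print(dist.project_name, dist.version)  # expected: wordfreq_builder 0.1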


@@ -0,0 +1,21 @@
from wordfreq_builder.tokenize import treebank_tokenizer
from nose.tools import eq_


def test_tokenizer_1():
    text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
    tokens = [
        "``", 'This', 'is', 'a', 'test', ',', "''", 'she', 'said', ',',
        "``", 'and', 'I', "'ll", 'bet', "y'all", '$', '3.50', 'that',
        'it', 'wo', "n't", 'fail', '.', "''"
    ]
    eq_(treebank_tokenizer(text), tokens)


def test_tokenizer_2():
    text = "i use punctuation informally...see?like this."
    tokens = [
        'i', 'use', 'punctuation', 'informally', '...', 'see', '?',
        'like', 'this', '.'
    ]
    eq_(treebank_tokenizer(text), tokens)


@@ -0,0 +1,57 @@
import re


def treebank_tokenizer(text):
    """
    This is a simplified version of the Treebank tokenizer in NLTK.

    NLTK's version depends on the text first having been sentence-tokenized
    using Punkt, which is a statistical model that we'd rather not implement
    here. The main reason to use Punkt first is to disambiguate periods that
    are sentence-ending from those that are part of abbreviations.

    NLTK's tokenizer thus assumes that any periods that appear in the middle
    of the text are meant to be there, and leaves them attached to words. We
    can skip the complication of Punkt at the cost of altering abbreviations
    such as "U.S.".

    NLTK also splits contractions that lack apostrophes, giving pseudo-words
    as a result -- for example, it splits "wanna" into "wan" and "na", which
    are supposed to be considered unusual surface forms of "want" and "to".
    We just leave it as the word "wanna".
    """
    # starting quotes
    text = re.sub(r'^\"', r'``', text)
    text = re.sub(r'(``)', r' \1 ', text)
    text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)

    # punctuation
    text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
    text = re.sub(r'\.\.\.', r' ... ', text)
    text = re.sub(r'[;@#$%&]', r' \g<0> ', text)

    # The following rule was modified from NLTK, which only separated periods
    # at the end of the text. We simply made whitespace an alternative to the
    # text-ending symbol $.
    text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)(\s|$)', r'\1 \2\3 ', text)
    text = re.sub(r'[?!]', r' \g<0> ', text)

    text = re.sub(r"([^'])' ", r"\1 ' ", text)

    # parens, brackets, etc.
    text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
    text = re.sub(r'--', r' -- ', text)

    # add extra space to make things easier
    text = " " + text + " "

    # ending quotes
    text = re.sub(r'"', " '' ", text)
    text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)

    # contractions
    text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)
    text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ",
                  text)

    return text.split()
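To illustrate the trade-offs described in the docstring, here is a small usage sketch. The import path matches the one used by the test file above; the sample sentence is just an example:

from wordfreq_builder.tokenize import treebank_tokenizer

# "wanna" is left as a single token, and the trailing period of the
# abbreviation "U.S." is split off, as the docstring warns.
print(treebank_tokenizer("I wanna see the U.S. today."))
# ['I', 'wanna', 'see', 'the', 'U.S', '.', 'today', '.']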


@@ -0,0 +1,66 @@
from wordfreq_builder.tokenize import treebank_tokenizer
from collections import defaultdict
from operator import itemgetter
from pathlib import Path
from unicodedata import normalize
import csv


class WordCountBuilder:
    def __init__(self, unique_docs=True, tokenizer=None):
        self.counts = defaultdict(int)
        self.unique_docs = unique_docs
        if tokenizer is None:
            self.tokenizer = treebank_tokenizer
        else:
            self.tokenizer = tokenizer

    def add_text(self, text):
        text = normalize('NFKC', text).lower()
        tokens = self.tokenizer(text)
        if self.unique_docs:
            tokens = set(tokens)
        for tok in tokens:
            self.counts[tok] += 1

    def count_wikipedia(self, path, glob='*/*'):
        for filepath in sorted(path.glob(glob)):
            print(filepath)
            with filepath.open(encoding='utf-8') as file:
                buf = []
                for line in file:
                    line = line.strip()
                    if line.startswith('##'):
                        self.try_wiki_article(' '.join(buf))
                        buf = []
                    else:
                        buf.append(line)
                self.try_wiki_article(' '.join(buf))

    # def count_twitter(self, path):
    #     with path.open(encoding='utf-8') as file:

    def try_wiki_article(self, text):
        if len(text) > 1000:
            self.add_text(text)

    def save_wordlist(self, path):
        with path.open('w', encoding='utf-8', newline='') as outfile:
            writer = csv.writer(outfile)
            items = sorted(self.counts.items(), key=itemgetter(1), reverse=True)
            for word, count in items:
                if count <= 1:
                    # Don't write all the terms that appeared only once
                    break
                writer.writerow([word, count])


def count_wikipedia(pathname):
    path = Path(pathname)
    builder = WordCountBuilder()
    builder.count_wikipedia(path)
    builder.save_wordlist(path / 'counts.csv')


if __name__ == '__main__':
    count_wikipedia('/hd/data/wikipedia/wikipedia-extractor/fr.wikipedia.org')
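For reference, a minimal sketch of using WordCountBuilder on in-memory text rather than a Wikipedia extraction directory. The module path wordfreq_builder.word_counts and the output location are assumptions; the commit view does not show the file name that defines this class:

from pathlib import Path

# Module path assumed for illustration only.
from wordfreq_builder.word_counts import WordCountBuilder

builder = WordCountBuilder(unique_docs=True)
builder.add_text("The cat sat on the mat.")
builder.add_text("The cat ate the fish.")

# With unique_docs=True, each token is counted at most once per document, so
# only "the", "cat", and "." reach a count of 2 here; everything else is
# dropped by the count <= 1 cutoff in save_wordlist.
builder.save_wordlist(Path('/tmp/counts.csv'))  # hypothetical output path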