Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 17:31:41 +00:00)

Initial commit
Commit 8b322ce534
wordfreq_builder/.gitignore (new file, vendored, 8 lines added)
@@ -0,0 +1,8 @@
*.pyc
__pycache__
.coverage
.idea
dist
*.egg-info
build
_build
wordfreq_builder/setup.py (new executable file, 12 lines added)
@@ -0,0 +1,12 @@
from setuptools import setup

setup(
    name="wordfreq_builder",
    version='0.1',
    maintainer='Luminoso Technologies, Inc.',
    maintainer_email='info@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq_builder',
    platforms=["any"],
    description="Turns raw data into word frequency lists",
    packages=['wordfreq_builder'],
)
wordfreq_builder/wordfreq_builder/__init__.py (new empty file, 0 lines)
wordfreq_builder/wordfreq_builder/tests/test_tokenizer.py (new file, 21 lines added)
@@ -0,0 +1,21 @@
from wordfreq_builder.tokenize import treebank_tokenizer
from nose.tools import eq_


def test_tokenizer_1():
    text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
    tokens = [
        "``", 'This', 'is', 'a', 'test', ',', "''", 'she', 'said', ',',
        "``", 'and', 'I', "'ll", 'bet', "y'all", '$', '3.50', 'that',
        'it', 'wo', "n't", 'fail', '.', "''"
    ]
    eq_(treebank_tokenizer(text), tokens)


def test_tokenizer_2():
    text = "i use punctuation informally...see?like this."
    tokens = [
        'i', 'use', 'punctuation', 'informally', '...', 'see', '?',
        'like', 'this', '.'
    ]
    eq_(treebank_tokenizer(text), tokens)
wordfreq_builder/wordfreq_builder/tokenize.py (new file, 57 lines added)
@@ -0,0 +1,57 @@
import re


def treebank_tokenizer(text):
    """
    This is a simplified version of the Treebank tokenizer in NLTK.

    NLTK's version depends on the text first having been sentence-tokenized
    using Punkt, which is a statistical model that we'd rather not implement
    here. The main reason to use Punkt first is to disambiguate periods that
    are sentence-ending from those that are part of abbreviations.

    NLTK's tokenizer thus assumes that any periods that appear in the middle
    of the text are meant to be there, and leaves them attached to words. We
    can skip the complication of Punkt at the cost of altering abbreviations
    such as "U.S.".

    NLTK also splits contractions that lack apostrophes, giving pseudo-words
    as a result -- for example, it splits "wanna" into "wan" and "na", which
    are supposed to be considered unusual surface forms of "want" and "to".
    We just leave it as the word "wanna".
    """
    #starting quotes
    text = re.sub(r'^\"', r'``', text)
    text = re.sub(r'(``)', r' \1 ', text)
    text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)

    #punctuation
    text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
    text = re.sub(r'\.\.\.', r' ... ', text)
    text = re.sub(r'[;@#$%&]', r' \g<0> ', text)

    # The following rule was modified from NLTK, which only separated periods
    # at the end of the text. We simply made whitespace an alternative to the
    # text-ending symbol $.
    text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)(\s|$)', r'\1 \2\3 ', text)
    text = re.sub(r'[?!]', r' \g<0> ', text)

    text = re.sub(r"([^'])' ", r"\1 ' ", text)

    #parens, brackets, etc.
    text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
    text = re.sub(r'--', r' -- ', text)

    #add extra space to make things easier
    text = " " + text + " "

    #ending quotes
    text = re.sub(r'"', " '' ", text)
    text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)

    #contractions
    text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)
    text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ",
                  text)

    return text.split()
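For illustration, here is a minimal sketch (not part of the commit) of the behavior the docstring above describes; the sample sentence is made up, and the expected output is traced by hand from the substitution rules:

from wordfreq_builder.tokenize import treebank_tokenizer

# The trailing period of "U.S." is followed by whitespace, so it is split
# off as its own token; the internal "U." stays attached because it is not
# followed by whitespace. "wanna" is left as a single token.
print(treebank_tokenizer("People in the U.S. wanna relax."))
# expected: ['People', 'in', 'the', 'U.S', '.', 'wanna', 'relax', '.']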
wordfreq_builder/wordfreq_builder/word_counts.py (new file, 66 lines added)
@@ -0,0 +1,66 @@
from wordfreq_builder.tokenize import treebank_tokenizer
from collections import defaultdict
from operator import itemgetter
from pathlib import Path
from unicodedata import normalize
import csv


class WordCountBuilder:
    def __init__(self, unique_docs=True, tokenizer=None):
        self.counts = defaultdict(int)
        self.unique_docs = unique_docs
        if tokenizer is None:
            self.tokenizer = treebank_tokenizer
        else:
            self.tokenizer = tokenizer

    def add_text(self, text):
        text = normalize('NFKC', text).lower()
        tokens = self.tokenizer(text)
        if self.unique_docs:
            tokens = set(tokens)
        for tok in tokens:
            self.counts[tok] += 1

    def count_wikipedia(self, path, glob='*/*'):
        for filepath in sorted(path.glob(glob)):
            print(filepath)
            with filepath.open(encoding='utf-8') as file:
                buf = []
                for line in file:
                    line = line.strip()
                    if line.startswith('##'):
                        self.try_wiki_article(' '.join(buf))
                        buf = []
                    else:
                        buf.append(line)
                self.try_wiki_article(' '.join(buf))

    #def count_twitter(self, path):
    #    with path.open(encoding='utf-8') as file:

    def try_wiki_article(self, text):
        if len(text) > 1000:
            self.add_text(text)

    def save_wordlist(self, path):
        with path.open('w', encoding='utf-8', newline='') as outfile:
            writer = csv.writer(outfile)
            items = sorted(self.counts.items(), key=itemgetter(1), reverse=True)
            for word, count in items:
                if count <= 1:
                    # Don't write all the terms that appeared only once
                    break
                writer.writerow([word, count])


def count_wikipedia(pathname):
    path = Path(pathname)
    builder = WordCountBuilder()
    builder.count_wikipedia(path)
    builder.save_wordlist(path / 'counts.csv')


if __name__ == '__main__':
    count_wikipedia('/hd/data/wikipedia/wikipedia-extractor/fr.wikipedia.org')
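As a usage illustration (not part of the commit, and assuming the wordfreq_builder package is importable), here is a minimal sketch of WordCountBuilder with made-up texts; because unique_docs defaults to True, each token is counted at most once per call to add_text:

from wordfreq_builder.word_counts import WordCountBuilder

builder = WordCountBuilder()
builder.add_text("This is a test. This test is only a test.")
builder.add_text("Another test, with more words.")

# Text is NFKC-normalized and lowercased before tokenizing.
print(builder.counts['test'])   # 2: 'test' occurs in both documents
print(builder.counts['this'])   # 1: repeats within one document count once

save_wordlist would then write a CSV sorted by descending count, keeping only the tokens that occurred more than once.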