Initial commit

Robyn Speer 2015-02-04 20:19:36 -05:00
commit 8b322ce534
6 changed files with 164 additions and 0 deletions

wordfreq_builder/.gitignore (new file, 8 additions)

@@ -0,0 +1,8 @@
*.pyc
__pycache__
.coverage
.idea
dist
*.egg-info
build
_build

wordfreq_builder/setup.py (new executable file, 12 additions)

@@ -0,0 +1,12 @@
from setuptools import setup

setup(
    name="wordfreq_builder",
    version='0.1',
    maintainer='Luminoso Technologies, Inc.',
    maintainer_email='info@luminoso.com',
    url='http://github.com/LuminosoInsight/wordfreq_builder',
    platforms=["any"],
    description="Turns raw data into word frequency lists",
    packages=['wordfreq_builder'],
)
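As an aside, one quick way to confirm that the package metadata above was picked up after installing it (for example with pip install -e . from the wordfreq_builder directory) is to query the installed distribution. This is only a sketch using pkg_resources, which ships with setuptools; the install step itself is assumed:

import pkg_resources

# Hypothetical check, assuming the package defined above has been installed.
dist = pkg_resources.get_distribution('wordfreq_builder')
print(dist.project_name, dist.version)  # expected: wordfreq_builder 0.1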


@@ -0,0 +1,21 @@
from wordfreq_builder.tokenize import treebank_tokenizer
from nose.tools import eq_


def test_tokenizer_1():
    text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."'
    tokens = [
        "``", 'This', 'is', 'a', 'test', ',', "''", 'she', 'said', ',',
        "``", 'and', 'I', "'ll", 'bet', "y'all", '$', '3.50', 'that',
        'it', 'wo', "n't", 'fail', '.', "''"
    ]
    eq_(treebank_tokenizer(text), tokens)


def test_tokenizer_2():
    text = "i use punctuation informally...see?like this."
    tokens = [
        'i', 'use', 'punctuation', 'informally', '...', 'see', '?',
        'like', 'this', '.'
    ]
    eq_(treebank_tokenizer(text), tokens)


@@ -0,0 +1,57 @@
import re


def treebank_tokenizer(text):
    """
    This is a simplified version of the Treebank tokenizer in NLTK.

    NLTK's version depends on the text first having been sentence-tokenized
    using Punkt, which is a statistical model that we'd rather not implement
    here. The main reason to use Punkt first is to disambiguate periods that
    are sentence-ending from those that are part of abbreviations.

    NLTK's tokenizer thus assumes that any periods that appear in the middle
    of the text are meant to be there, and leaves them attached to words. We
    can skip the complication of Punkt at the cost of altering abbreviations
    such as "U.S.".

    NLTK also splits contractions that lack apostrophes, giving pseudo-words
    as a result -- for example, it splits "wanna" into "wan" and "na", which
    are supposed to be considered unusual surface forms of "want" and "to".
    We just leave it as the word "wanna".
    """
    # starting quotes
    text = re.sub(r'^\"', r'``', text)
    text = re.sub(r'(``)', r' \1 ', text)
    text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)

    # punctuation
    text = re.sub(r'([:,])([^\d])', r' \1 \2', text)
    text = re.sub(r'\.\.\.', r' ... ', text)
    text = re.sub(r'[;@#$%&]', r' \g<0> ', text)

    # The following rule was modified from NLTK, which only separated periods
    # at the end of the text. We simply made whitespace an alternative to the
    # text-ending symbol $.
    text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)(\s|$)', r'\1 \2\3 ', text)
    text = re.sub(r'[?!]', r' \g<0> ', text)

    text = re.sub(r"([^'])' ", r"\1 ' ", text)

    # parens, brackets, etc.
    text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text)
    text = re.sub(r'--', r' -- ', text)

    # add extra space to make things easier
    text = " " + text + " "

    # ending quotes
    text = re.sub(r'"', " '' ", text)
    text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text)

    # contractions
    text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text)
    text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ",
                  text)

    return text.split()
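To illustrate the trade-offs described in the docstring, here is a small usage sketch. The import path matches the one used by the test file above; the sample sentence is just an example:

from wordfreq_builder.tokenize import treebank_tokenizer

# "wanna" is left as a single token, and the trailing period of the
# abbreviation "U.S." is split off, as the docstring warns.
print(treebank_tokenizer("I wanna see the U.S. today."))
# ['I', 'wanna', 'see', 'the', 'U.S', '.', 'today', '.']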


@@ -0,0 +1,66 @@
from wordfreq_builder.tokenize import treebank_tokenizer
from collections import defaultdict
from operator import itemgetter
from pathlib import Path
from unicodedata import normalize
import csv


class WordCountBuilder:
    def __init__(self, unique_docs=True, tokenizer=None):
        self.counts = defaultdict(int)
        self.unique_docs = unique_docs
        if tokenizer is None:
            self.tokenizer = treebank_tokenizer
        else:
            self.tokenizer = tokenizer

    def add_text(self, text):
        text = normalize('NFKC', text).lower()
        tokens = self.tokenizer(text)
        if self.unique_docs:
            tokens = set(tokens)
        for tok in tokens:
            self.counts[tok] += 1

    def count_wikipedia(self, path, glob='*/*'):
        for filepath in sorted(path.glob(glob)):
            print(filepath)
            with filepath.open(encoding='utf-8') as file:
                buf = []
                for line in file:
                    line = line.strip()
                    if line.startswith('##'):
                        self.try_wiki_article(' '.join(buf))
                        buf = []
                    else:
                        buf.append(line)
                self.try_wiki_article(' '.join(buf))

    # def count_twitter(self, path):
    #     with path.open(encoding='utf-8') as file:

    def try_wiki_article(self, text):
        if len(text) > 1000:
            self.add_text(text)

    def save_wordlist(self, path):
        with path.open('w', encoding='utf-8', newline='') as outfile:
            writer = csv.writer(outfile)
            items = sorted(self.counts.items(), key=itemgetter(1), reverse=True)
            for word, count in items:
                if count <= 1:
                    # Don't write all the terms that appeared only once
                    break
                writer.writerow([word, count])


def count_wikipedia(pathname):
    path = Path(pathname)
    builder = WordCountBuilder()
    builder.count_wikipedia(path)
    builder.save_wordlist(path / 'counts.csv')


if __name__ == '__main__':
    count_wikipedia('/hd/data/wikipedia/wikipedia-extractor/fr.wikipedia.org')
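For reference, a minimal sketch of using WordCountBuilder on in-memory text rather than a Wikipedia extraction directory. The module path wordfreq_builder.word_counts and the output location are assumptions; the commit view does not show the file name that defines this class:

from pathlib import Path

# Module path assumed for illustration only.
from wordfreq_builder.word_counts import WordCountBuilder

builder = WordCountBuilder(unique_docs=True)
builder.add_text("The cat sat on the mat.")
builder.add_text("The cat ate the fish.")

# With unique_docs=True, each token is counted at most once per document, so
# only "the", "cat", and "." reach a count of 2 here; everything else is
# dropped by the count <= 1 cutoff in save_wordlist.
builder.save_wordlist(Path('/tmp/counts.csv'))  # hypothetical output path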