command-line entry points

2024-12-23 09:21:37 +00:00 · 2015-02-10 12:28:29 -05:00 · 2015-02-10 12:28:29 -05:00 · 23bd5ba76c
commit 23bd5ba76c
parent 8b322ce534
4 changed files with 66 additions and 13 deletions
--- a/wordfreq_builder/wordfreq_builder/cmd_count_twitter.py
+++ b/wordfreq_builder/wordfreq_builder/cmd_count_twitter.py
@ -0,0 +1,23 @@
+from wordfreq_builder.word_counts import WordCountBuilder
+from wordfreq_builder.tokenizers import rosette_tokenizer
+from pathlib import Path
+import argparse
+
+
+def count_twitter(pathname, offset=0, nsplit=1):
+    """
+    Count the words in one slice of a file of tweets, tokenizing them with
+    `rosette_tokenizer`, and save the resulting word list.
+
+    `offset` and `nsplit` are passed through to
+    `WordCountBuilder.count_twitter` to select which lines are counted. The
+    output is written next to the input file, as 'twitter-counts-N.csv',
+    where N is `offset`.
+    """
+    tweets_path = Path(pathname)
+    counter = WordCountBuilder(tokenizer=rosette_tokenizer)
+    # The output file lives in the same directory as the input file.
+    output_path = tweets_path.parent / ('twitter-counts-%d.csv' % offset)
+    counter.count_twitter(tweets_path, offset, nsplit)
+    counter.save_wordlist(output_path)
+
+
+if __name__ == '__main__':
+    # Command-line entry point: takes the tweet file followed by two
+    # required integer arguments, `offset` and `nsplit`.
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument('filename', help='filename of input file containing one tweet per line')
+    for int_arg in ('offset', 'nsplit'):
+        arg_parser.add_argument(int_arg, type=int)
+    options = arg_parser.parse_args()
+    count_twitter(options.filename, options.offset, options.nsplit)
+
--- a/wordfreq_builder/wordfreq_builder/cmd_count_wikipedia.py
+++ b/wordfreq_builder/wordfreq_builder/cmd_count_wikipedia.py
@ -0,0 +1,19 @@
+from wordfreq_builder.word_counts import WordCountBuilder
+from wordfreq_builder.tokenizers import rosette_tokenizer
+from pathlib import Path
+import argparse
+
+
+def count_wikipedia(pathname):
+    """
+    Count the words in the Wikipedia text found under the directory
+    `pathname`, using a `WordCountBuilder` with its default settings, and
+    save the word list as 'counts.csv' inside that same directory.
+    """
+    wiki_dir = Path(pathname)
+    counter = WordCountBuilder()
+    counter.count_wikipedia(wiki_dir)
+    output_path = wiki_dir / 'counts.csv'
+    counter.save_wordlist(output_path)
+
+
+if __name__ == '__main__':
+    # Command-line entry point: the single argument is the directory to read.
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument('dir', help='directory containing extracted Wikipedia text')
+    options = arg_parser.parse_args()
+    wiki_dir = options.dir
+    count_wikipedia(wiki_dir)
+
--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@ -1,6 +1,17 @@
+from lumi_science.text_readers.rosette_readers import RosetteReader
 import re


+# A single RosetteReader, created once when this module is imported and
+# shared by every call to rosette_tokenizer().
+ROSETTE = RosetteReader()
+
+
+def rosette_tokenizer(text):
+    """
+    Analyze `text` with the shared Rosette reader and return a list of
+    strings of the form 'stem|lang': one per item in the analysis, each
+    tagged with the language value that the analysis returned for the text.
+    """
+    analysis, lang = ROSETTE.rosette.analyze(text)
+    tagged_stems = []
+    # Each analysis item is a (stem, pos, span) triple; only the stem is kept.
+    for stem, _pos, _span in analysis:
+        tagged_stems.append(stem + '|' + lang)
+    return tagged_stems
+
+
+
+
 def treebank_tokenizer(text):
    """
    This is a simplified version of the Treebank tokenizer in NLTK.
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@ -1,4 +1,4 @@
-from wordfreq_builder.tokenize import treebank_tokenizer
+from wordfreq_builder.tokenizers import treebank_tokenizer
 from collections import defaultdict
 from operator import itemgetter
 from pathlib import Path
@ -17,7 +17,11 @@ class WordCountBuilder:

    def add_text(self, text):
        text = normalize('NFKC', text).lower()
-        tokens = self.tokenizer(text)
+        try:
+            tokens = self.tokenizer(text)
+        except Exception as e:
+            print("Couldn't tokenize due to %r: %s" % (e, text))
+            return
        if self.unique_docs:
            tokens = set(tokens)
        for tok in tokens:
@ -37,8 +41,13 @@ class WordCountBuilder:
                        buf.append(line)
                self.try_wiki_article(' '.join(buf))

-    #def count_twitter(self, path):
-    #    with path.open(encoding='utf-8') as file:
+    def count_twitter(self, path, offset, nsplit):
+        """
+        Count the words in one slice of a UTF-8 file of tweets.
+
+        Only lines whose zero-based index `i` satisfies
+        `i % nsplit == offset` are counted. Each selected line is stripped of
+        surrounding whitespace, and the text after its last tab (or the whole
+        line, if it contains no tab) is passed to `add_text`.
+        """
+        with path.open(encoding='utf-8') as tweets:
+            for lineno, raw_line in enumerate(tweets):
+                if lineno % nsplit != offset:
+                    # This line belongs to a different slice.
+                    continue
+                fields = raw_line.strip().split('\t')
+                self.add_text(fields[-1])

    def try_wiki_article(self, text):
        if len(text) > 1000:
@ -55,12 +64,3 @@ class WordCountBuilder:
                writer.writerow([word, count])


-def count_wikipedia(pathname):
-    path = Path(pathname)
-    builder = WordCountBuilder()
-    builder.count_wikipedia(path)
-    builder.save_wordlist(path / 'counts.csv')
-
-
-if __name__ == '__main__':
-    count_wikipedia('/hd/data/wikipedia/wikipedia-extractor/fr.wikipedia.org')