enable wordlist balancing, surface form counting

2024-12-23 17:31:41 +00:00 · 2015-02-17 13:43:22 -05:00 · 2015-02-17 13:43:22 -05:00 · bc780c63c8
commit bc780c63c8
parent 07e61be7e3
5 changed files with 50 additions and 23 deletions
--- a/wordfreq_builder/wordfreq_builder/cmd_combine_lists.py
+++ b/wordfreq_builder/wordfreq_builder/cmd_combine_lists.py
@@ -3,18 +3,19 @@ from pathlib import Path
 import argparse


-def merge_lists(input_names, output_name):
+def merge_lists(input_names, output_name, balance=False):
    count_dicts = []
    for input_name in input_names:
        count_dicts.append(read_counts(Path(input_name)))
-    merged = merge_counts(count_dicts)
+    merged = merge_counts(count_dicts, balance=balance)
    write_counts(merged, Path(output_name))


 if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
+    parser.add_argument('-b', '--balance', action='store_true', help='Automatically balance unequally-sampled word frequencies')
    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
    args = parser.parse_args()
-    merge_lists(args.inputs, args.output)
+    merge_lists(args.inputs, args.output, balance=args.balance)

--- a/wordfreq_builder/wordfreq_builder/cmd_count_twitter.py
+++ b/wordfreq_builder/wordfreq_builder/cmd_count_twitter.py
@@ -1,12 +1,16 @@
 from wordfreq_builder.word_counts import WordCountBuilder
-from wordfreq_builder.tokenizers import rosette_tokenizer
+from wordfreq_builder.tokenizers import rosette_tokenizer, rosette_surface_tokenizer
 from pathlib import Path
 import argparse


-def count_twitter(pathname, offset=0, nsplit=1):
+def count_twitter(pathname, offset=0, nsplit=1, surface=False):
    path = Path(pathname)
-    builder = WordCountBuilder(tokenizer=rosette_tokenizer)
+    if surface == True:
+        tokenizer = rosette_surface_tokenizer
+    else:
+        tokenizer = rosette_tokenizer
+    builder = WordCountBuilder(tokenizer=tokenizer)
    save_filename = 'twitter-counts-%d.csv' % offset
    save_pathname = path.parent / save_filename
    builder.count_twitter(path, offset, nsplit)
@@ -18,6 +22,7 @@ if __name__ == '__main__':
    parser.add_argument('filename', help='filename of input file containing one tweet per line')
    parser.add_argument('offset', type=int)
    parser.add_argument('nsplit', type=int)
+    parser.add_argument('-s', '--surface', action='store_true', help='Use surface text instead of stems')
    args = parser.parse_args()
-    count_twitter(args.filename, args.offset, args.nsplit)
+    count_twitter(args.filename, args.offset, args.nsplit, surface=args.surface)

--- a/wordfreq_builder/wordfreq_builder/cmd_count_wikipedia.py
+++ b/wordfreq_builder/wordfreq_builder/cmd_count_wikipedia.py
@@ -1,12 +1,16 @@
 from wordfreq_builder.word_counts import WordCountBuilder
-from wordfreq_builder.tokenizers import rosette_tokenizer
+from wordfreq_builder.tokenizers import rosette_tokenizer, rosette_surface_tokenizer
 from pathlib import Path
 import argparse


-def count_wikipedia(pathname):
+def count_wikipedia(pathname, surface=False):
    path = Path(pathname)
-    builder = WordCountBuilder()
+    if surface == True:
+        tokenizer = rosette_surface_tokenizer
+    else:
+        tokenizer = rosette_tokenizer
+    builder = WordCountBuilder(tokenizer=tokenizer)
    builder.count_wikipedia(path)
    builder.save_wordlist(path / 'counts.csv')

@@ -14,6 +18,7 @@ def count_wikipedia(pathname):
 if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('dir', help='directory containing extracted Wikipedia text')
+    parser.add_argument('-s', '--surface', action='store_true', help='Use surface text instead of stems')
    args = parser.parse_args()
-    count_wikipedia(args.dir)
+    count_wikipedia(args.dir, surface=args.surface)

--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@@ -7,12 +7,18 @@ ROSETTE = RosetteReader()

 def rosette_tokenizer(text):
    analysis, lang = ROSETTE.rosette.analyze(text)
+    # I'm aware this doesn't do the right things with multi-word stems.
+    # Wordfreq doesn't either. And wordfreq isn't designed to look up
+    # multiple words anyway.
    return [stem + '|' + lang for (stem, pos, span) in analysis]


+def rosette_surface_tokenizer(text):
+    analysis, lang = ROSETTE.rosette.analyze(text)
+    return [text[span[0]:span[1]] + '|' + lang for (stem, pos, span) in analysis]


-def treebank_tokenizer(text):
+def treebank_surface_tokenizer(text):
    """
    This is a simplified version of the Treebank tokenizer in NLTK.

--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -1,9 +1,10 @@
-from wordfreq_builder.tokenizers import treebank_tokenizer
+from wordfreq_builder.tokenizers import treebank_surface_tokenizer
 from collections import defaultdict
 from operator import itemgetter
 from pathlib import Path
 from unicodedata import normalize
 import csv
+import sys


 def read_counts(path):
@@ -11,7 +12,7 @@ def read_counts(path):
    with path.open(encoding='utf-8', newline='') as infile:
        reader = csv.reader(infile)
        for key, strval in reader:
-            val = int(strval)
+            val = float(strval)
            # Use += so that, if we give the reader concatenated files with
            # duplicates, it does the right thing
            counts[key] += val
@@ -27,11 +28,14 @@ def count_languages(counts):
    return langcounts


-def merge_counts(count_dicts):
-    merged = defaultdict(int)
+def merge_counts(count_dicts, balance=False):
+    merged = defaultdict(float)
    for counts in count_dicts:
+        weight = 1
+        if balance:
+            weight = 1e9 / max(counts.values()) / len(count_dicts)
        for key, val in counts.items():
-            merged[key] += val
+            merged[key] += val * weight
    return merged


@@ -52,7 +56,7 @@ class WordCountBuilder:
        self.counts = defaultdict(int)
        self.unique_docs = unique_docs
        if tokenizer is None:
-            self.tokenizer = treebank_tokenizer
+            self.tokenizer = treebank_surface_tokenizer
        else:
            self.tokenizer = tokenizer

@@ -60,8 +64,9 @@ class WordCountBuilder:
        text = normalize('NFKC', text).lower()
        try:
            tokens = self.tokenizer(text)
+            # print(' '.join(tokens))
        except Exception as e:
-            print("Couldn't tokenize due to %r: %s" % (e, text))
+            print("Couldn't tokenize due to %r: %s" % (e, text), file=sys.stderr)
            return
        if self.unique_docs:
            tokens = set(tokens)
@@ -69,6 +74,11 @@ class WordCountBuilder:
            self.counts[tok] += 1

    def count_wikipedia(self, path, glob='*/*'):
+        """
+        Read a directory of extracted Wikipedia articles. The articles can be
+        grouped together into files, in which case they should be separated by
+        lines beginning with ##.
+        """
        for filepath in sorted(path.glob(glob)):
            print(filepath)
            with filepath.open(encoding='utf-8') as file:
@@ -82,6 +92,10 @@ class WordCountBuilder:
                        buf.append(line)
                self.try_wiki_article(' '.join(buf))

+    def try_wiki_article(self, text):
+        if len(text) > 1000:
+            self.add_text(text)
+
    def count_twitter(self, path, offset, nsplit):
        with path.open(encoding='utf-8') as file:
            for i, line in enumerate(file):
@@ -90,9 +104,5 @@ class WordCountBuilder:
                    text = line.split('\t')[-1]
                    self.add_text(text)

-    def try_wiki_article(self, text):
-        if len(text) > 1000:
-            self.add_text(text)
-
    def save_wordlist(self, path):
        write_counts(self.counts, path)