diff --git a/scripts/ninja2dot.py b/scripts/ninja2dot.py
index 42b5362..f73131c 100644
--- a/scripts/ninja2dot.py
+++ b/scripts/ninja2dot.py
@@ -1,30 +1,39 @@
 """ This file generates a graph of the dependencies for the ninja build."""
 import sys
+import re
 
 
 def ninja_to_dot():
-    def last_component(path):
-        return path.split('/')[-1]
+    def simplified_filename(path):
+        component = path.split('/')[-1]
+        return re.sub(
+            r'[0-9]+-of', 'NN-of',
+            re.sub(r'part[0-9]+', 'partNN', component)
+        )
 
     print("digraph G {")
     print('rankdir="LR";')
+    seen_edges = set()
     for line in sys.stdin:
         line = line.rstrip()
         if line.startswith('build'):
             # the output file is the first argument; strip off the colon that
             # comes from ninja syntax
             output_text, input_text = line.split(':')
-            outfiles = [last_component(part) for part in output_text.split(' ')[1:]]
+            outfiles = [simplified_filename(part) for part in output_text.split(' ')[1:]]
             inputs = input_text.strip().split(' ')
-            infiles = [last_component(part) for part in inputs[1:]]
+            infiles = [simplified_filename(part) for part in inputs[1:]]
             operation = inputs[0]
             for infile in infiles:
                 if infile == '|':
                     # external dependencies start here; let's not graph those
                     break
                 for outfile in outfiles:
-                    print('"%s" -> "%s" [label="%s"]' % (infile, outfile, operation))
+                    edge = '"%s" -> "%s" [label="%s"]' % (infile, outfile, operation)
+                    if edge not in seen_edges:
+                        seen_edges.add(edge)
+                        print(edge)
     print("}")
diff --git a/wordfreq_builder/build.png b/wordfreq_builder/build.png
new file mode 100644
index 0000000..15635c6
Binary files /dev/null and b/wordfreq_builder/build.png differ
diff --git a/wordfreq_builder/build.png.REMOVED.git-id b/wordfreq_builder/build.png.REMOVED.git-id
deleted file mode 100644
index 9fe6754..0000000
--- a/wordfreq_builder/build.png.REMOVED.git-id
+++ /dev/null
@@ -1 +0,0 @@
-ef54b21e931c530f5b75c1cd87c5841cc4691e43
\ No newline at end of file
diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja
index b708533..f06e5f2 100644
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -56,6 +56,11 @@ rule convert_leeds
 rule convert_opensubtitles
     command = tr ' ' ',' < $in > $out
 
+# To convert SUBTLEX, we take the 1st and Nth columns, strip the header, convert
+# tabs to commas and commas to spaces, and remove lines with obvious mojibake.
+rule convert_subtlex
+    command = cut -f 1,$col $in | tail -n +2 | tr '\t,' ', ' | grep -v 'â,' > $out
+
 # Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
 # the input files, keep only the single words and their counts, and only keep
 # lines with counts of 100 or more.
@@ -71,7 +76,10 @@ rule count
     command = python -m wordfreq_builder.cli.count_tokens $in $out
 
 rule merge
-    command = python -m wordfreq_builder.cli.combine_lists -o $out $in
+    command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff $in
+
+rule merge_counts
+    command = python -m wordfreq_builder.cli.merge_counts -o $out $in
 
 rule freqs2cB
     command = python -m wordfreq_builder.cli.freqs_to_cB $lang $in $out
diff --git a/wordfreq_builder/wordfreq_builder/cli/combine_lists.py b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
similarity index 70%
rename from wordfreq_builder/wordfreq_builder/cli/combine_lists.py
rename to wordfreq_builder/wordfreq_builder/cli/merge_counts.py
index 61d4b1d..772b951 100644
--- a/wordfreq_builder/wordfreq_builder/cli/combine_lists.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
@@ -1,12 +1,12 @@
-from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist
+from wordfreq_builder.word_counts import read_freqs, merge_counts, write_wordlist
 import argparse
 
 
 def merge_lists(input_names, output_name):
-    freq_dicts = []
+    count_dicts = []
     for input_name in input_names:
-        freq_dicts.append(read_freqs(input_name, cutoff=2))
-    merged = merge_freqs(freq_dicts)
+        count_dicts.append(read_freqs(input_name, cutoff=0))
+    merged = merge_counts(count_dicts)
     write_wordlist(merged, output_name)
 
 
diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py b/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
new file mode 100644
index 0000000..0bbe1c1
--- /dev/null
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
@@ -0,0 +1,20 @@
+from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist
+import argparse
+
+
+def merge_lists(input_names, output_name, cutoff):
+    freq_dicts = []
+    for input_name in input_names:
+        freq_dicts.append(read_freqs(input_name, cutoff=cutoff))
+    merged = merge_freqs(freq_dicts)
+    write_wordlist(merged, output_name)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
+    parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
+    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
+    args = parser.parse_args()
+    merge_lists(args.inputs, args.output, args.cutoff)
+
diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py
index a80c327..7c523fb 100644
--- a/wordfreq_builder/wordfreq_builder/config.py
+++ b/wordfreq_builder/wordfreq_builder/config.py
@@ -11,12 +11,12 @@ CONFIG = {
         'twitter': [
             'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
             'pt', 'ru',
-            # can be added later: 'th', 'tr'
+            # can be added later: 'el', 'tr'
         ],
         'wikipedia': [
             'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
             'pt', 'ru'
-            # many more can be added
+            # consider adding 'el' and 'tr'
         ],
         'opensubtitles': [
             # All languages where the most common word in OpenSubtitles
@@ -33,14 +33,19 @@ CONFIG = {
             'en',
             # Using the 2012 data, we could get French, German, Italian,
             # Russian, Spanish, and (Simplified) Chinese.
-        ]
+        ],
+        'subtlex-en': ['en'],
+        'subtlex-zh': ['zh'],
     },
+    # Subtlex languages that need to be pre-processed
     'wordlist_paths': {
         'twitter': 'generated/twitter/tweets-2014.{lang}.{ext}',
         'wikipedia': 'generated/wikipedia/wikipedia_{lang}.{ext}',
         'opensubtitles': 'generated/opensubtitles/opensubtitles_{lang}.{ext}',
         'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}',
         'google-books': 'generated/google-books/google_books_{lang}.{ext}',
+        'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
+        'subtlex-zh': 'generated/subtlex/subtlex_{lang}.{ext}',
         'combined': 'generated/combined/combined_{lang}.{ext}',
         'combined-dist': 'dist/combined_{lang}.{ext}',
         'twitter-dist': 'dist/twitter_{lang}.{ext}'
diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py
index 84c1818..65773d6 100644
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -5,7 +5,8 @@ import sys
 import pathlib
 
 HEADER = """# This file is automatically generated. Do not edit it.
-# You can regenerate it using the 'wordfreq-build-deps' command.
+# You can change its behavior by editing wordfreq_builder/ninja.py,
+# and regenerate it by running 'make'.
 """
 TMPDIR = data_filename('tmp')
 
@@ -76,6 +77,18 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
             CONFIG['sources']['opensubtitles']
         )
     )
+    lines.extend(
+        subtlex_en_deps(
+            data_filename('source-lists/subtlex'),
+            CONFIG['sources']['subtlex-en']
+        )
+    )
+    lines.extend(
+        subtlex_zh_deps(
+            data_filename('source-lists/subtlex'),
+            CONFIG['sources']['subtlex-zh']
+        )
+    )
     lines.extend(combine_lists(all_languages()))
 
     print('\n'.join(lines), file=out)
@@ -188,12 +201,53 @@ def opensubtitles_deps(dirname_in, languages):
             prefix=dirname_in, lang=language
        )
         reformatted_file = wordlist_filename(
-            'opensubtitles', language, 'counts.txt')
+            'opensubtitles', language, 'counts.txt'
+        )
         add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file)
     return lines
 
 
+def subtlex_en_deps(dirname_in, languages):
+    lines = []
+    assert languages == ['en']
+    regions = ['en-US', 'en-GB']
+    processed_files = []
+    for region in regions:
+        input_file = '{prefix}/subtlex.{region}.txt'.format(
+            prefix=dirname_in, region=region
+        )
+        processed_file = wordlist_filename('subtlex-en', region, 'processed.txt')
+        processed_files.append(processed_file)
+        add_dep(
+            lines, 'convert_subtlex', input_file, processed_file,
+            params={'col': 2}
+        )
+
+    output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt')
+    add_dep(lines, 'merge_counts', processed_files, output_file)
+
+    return lines
+
+
+def subtlex_zh_deps(dirname_in, languages):
+    lines = []
+    for language in languages:
+        input_file = '{prefix}/subtlex.{lang}.txt'.format(
+            prefix=dirname_in, lang=language
+        )
+        processed_file = wordlist_filename('subtlex-zh', language, 'processed.txt')
+        output_file = wordlist_filename('subtlex-zh', language, 'counts.txt')
+        add_dep(
+            lines, 'convert_subtlex', input_file, processed_file,
+            params={'col': 5}
+        )
+        add_dep(
+            lines, 'merge_counts', processed_file, output_file
+        )
+    return lines
+
+
 def combine_lists(languages):
     lines = []
     for language in languages:
@@ -204,7 +258,8 @@
         ]
         output_file = wordlist_filename('combined', language)
         add_dep(lines, 'merge', input_files, output_file,
-                extra='wordfreq_builder/word_counts.py')
+                extra='wordfreq_builder/word_counts.py',
+                params={'cutoff': 2})
 
         output_cBpack = wordlist_filename(
             'combined-dist', language, 'msgpack.gz')
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index 9da95a3..63d1980 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -49,13 +49,14 @@ def read_freqs(filename, cutoff=0, lang=None):
     with open(filename, encoding='utf-8', newline='') as infile:
         for key, strval in csv.reader(infile):
             val = float(strval)
+            key = fix_text(key)
             if val < cutoff:
                 break
             tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
             for token in tokens:
                 # Use += so that, if we give the reader concatenated files with
                 # duplicates, it does the right thing
-                raw_counts[fix_text(token)] += val
+                raw_counts[token] += val
                 total += val
 
     for word in raw_counts:
@@ -96,6 +97,17 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
         msgpack.dump(cBpack_data, outfile)
 
 
+def merge_counts(count_dicts):
+    """
+    Merge multiple dictionaries of counts by adding their entries.
+    """
+    merged = defaultdict(int)
+    for count_dict in count_dicts:
+        for term, count in count_dict.items():
+            merged[term] += count
+    return merged
+
+
 def merge_freqs(freq_dicts):
     """
     Merge multiple dictionaries of frequencies, representing each word with
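Note on the semantics of the new merge_counts helper: unlike merge_freqs, which
operates on normalized frequencies, merge_counts simply sums raw counts across
sources, which is why the subtlex-en pipeline can feed it the two regional files.
A minimal runnable sketch (the function body is copied from the diff above; the
words and counts are invented for illustration):

    from collections import defaultdict


    def merge_counts(count_dicts):
        """
        Merge multiple dictionaries of counts by adding their entries.
        """
        merged = defaultdict(int)
        for count_dict in count_dicts:
            for term, count in count_dict.items():
                merged[term] += count
        return merged


    # Hypothetical counts standing in for the two regional SUBTLEX files
    # (subtlex.en-US.txt and subtlex.en-GB.txt after convert_subtlex):
    us_counts = {'the': 1501908, 'color': 22702, 'colour': 886}
    gb_counts = {'the': 1339811, 'color': 317, 'colour': 4894}

    print(dict(merge_counts([us_counts, gb_counts])))
    # {'the': 2841719, 'color': 23019, 'colour': 5780}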