diff --git a/wordfreq_builder/README.md b/wordfreq_builder/README.md
index 021bc0f..f2fdfb9 100644
--- a/wordfreq_builder/README.md
+++ b/wordfreq_builder/README.md
@@ -174,14 +174,23 @@ following reasonable conditions:
 
 `data/source-lists/subtlex` contains the following files:
 
-- `subtlex.en-US.txt`, which was downloaded from [here][subtlex-us],
+- `subtlex.de.txt`, which was downloaded as [SUBTLEX-DE raw file.xlsx][subtlex-de],
+  and exported from Excel format to tab-separated UTF-8 using LibreOffice
+- `subtlex.el.txt`, which was downloaded as [SUBTLEX-GR\_CD.txt][subtlex-gr]
+- `subtlex.en-US.txt`, which was downloaded as [subtlexus5.zip][subtlex-us],
   extracted, and converted from ISO-8859-1 to UTF-8
-- `subtlex.en-GB.txt`, which was exported as tab-separated UTF-8
-  from [this Excel file][subtlex-uk]
-- `subtlex.zh.txt`, which was downloaded and extracted from
-  [here][subtlex-ch]
+- `subtlex.en-GB.txt`, which was downloaded as
+  [SUBTLEX-UK\_all.xlsx][subtlex-uk], and exported from Excel format to
+  tab-separated UTF-8 using LibreOffice
+- `subtlex.nl.txt`, which was downloaded as
+  [SUBTLEX-NL.cd-above2.txt.zip][subtlex-nl] and extracted
+- `subtlex.zh.txt`, which was downloaded as
+  [subtlexch131210.zip][subtlex-ch] and extracted
 
+[subtlex-de]: http://crr.ugent.be/SUBTLEX-DE/SUBTLEX-DE%20raw%20file.xlsx
+[subtlex-gr]: http://www.bcbl.eu/bcbl-corporativa/wp-content/uploads/2013/01/SUBTLEX-GR_CD.txt
 [subtlex-us]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexus/subtlexus5.zip
 [subtlex-uk]: http://crr.ugent.be/papers/SUBTLEX-UK_all.xlsx
+[subtlex-nl]: http://crr.ugent.be/subtlex-nl/SUBTLEX-NL.cd-above2.txt.zip
 [subtlex-ch]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexch/subtlexch131210.zip

diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja
index e4d95e0..986678c 100644
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -56,10 +56,11 @@ rule convert_leeds
 rule convert_opensubtitles
     command = tr ' ' ',' < $in > $out
 
-# To convert SUBTLEX, we take the 1st and Nth columns, strip the header, convert
-# tabs to commas and commas to nothing, and remove obvious mojibake.
+# To convert SUBTLEX, we take the 1st and Nth columns, strip the header,
+# run it through ftfy, convert tabs to commas and spurious CSV formatting to
+# spaces, and remove lines with unfixable half-mojibake.
 rule convert_subtlex
-    command = cut -f 1,$col $in | tail -n +2 | tr ' ,' ', ' | grep -v 'â,' > $out
+    command = cut -f $textcol,$freqcol $in | tail -n +$startrow | ftfy | tr ' ",' ', ' | grep -v 'â,' > $out
 
 # Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
 # the input files, keep only the single words and their counts, and only keep
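The new `convert_subtlex` rule is easiest to read as a per-line pipeline. Here is a rough Python sketch of what it does, under a few stated assumptions: the `TEXTCOL`/`FREQCOL`/`STARTROW` constants stand in for the ninja variables `$textcol`, `$freqcol`, and `$startrow`; the first character of the `tr` set is presumably a literal tab in the real file (so tabs become commas), which the sketch approximates by joining the two columns with a comma; and `ftfy.fix_text` is the same fixer the `ftfy` command-line tool applies.

```python
import ftfy

# Stand-ins for the ninja variables; these values match the English config
TEXTCOL, FREQCOL, STARTROW = 1, 2, 2

def convert_subtlex(in_path, out_path):
    """Approximates: cut -f ... | tail -n +... | ftfy | tr ... | grep -v 'â,'"""
    with open(in_path, encoding='utf-8') as infile, \
         open(out_path, 'w', encoding='utf-8') as outfile:
        for lineno, line in enumerate(infile, 1):
            if lineno < STARTROW:         # tail -n +$startrow: skip header rows
                continue
            cols = line.rstrip('\n').split('\t')
            word, freq = cols[TEXTCOL - 1], cols[FREQCOL - 1]
            word = ftfy.fix_text(word)    # repair the mojibake that is fixable
            # tr: spurious quotes and stray commas become spaces
            word = word.replace('"', ' ').replace(',', ' ')
            out_line = '{},{}'.format(word, freq)
            # grep -v 'â,': drop entries that are still half-mojibake
            if 'â,' in out_line:
                continue
            outfile.write(out_line + '\n')
```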
diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
index 772b951..5e3de69 100644
--- a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
@@ -1,11 +1,12 @@
-from wordfreq_builder.word_counts import read_freqs, merge_counts, write_wordlist
+from wordfreq_builder.word_counts import read_values, merge_counts, write_wordlist
 import argparse
 
 
 def merge_lists(input_names, output_name):
     count_dicts = []
     for input_name in input_names:
-        count_dicts.append(read_freqs(input_name, cutoff=0))
+        values, total = read_values(input_name, cutoff=0)
+        count_dicts.append(values)
     merged = merge_counts(count_dicts)
     write_wordlist(merged, output_name)
 
diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py
index 044f987..87c575b 100644
--- a/wordfreq_builder/wordfreq_builder/config.py
+++ b/wordfreq_builder/wordfreq_builder/config.py
@@ -13,6 +13,7 @@ CONFIG = {
     #   'th' when we get tokenization for it
     #   'hi' when we stop messing up its tokenization
     #   'tl' because it's probably ready right now
+    #   'pl' because we have 3 sources for it
     'twitter': [
         'ar', 'de', 'el', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms',
         'nl', 'pt', 'ru', 'tr'
@@ -38,7 +39,7 @@ CONFIG = {
             # Russian, Spanish, and (Simplified) Chinese.
         ],
         'subtlex-en': ['en'],
-        'subtlex-zh': ['zh'],
+        'subtlex-other': ['de', 'el', 'nl', 'zh'],
     },
     # Subtlex languages that need to be pre-processed
     'wordlist_paths': {
@@ -48,7 +49,7 @@ CONFIG = {
        'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}',
        'google-books': 'generated/google-books/google_books_{lang}.{ext}',
        'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
-       'subtlex-zh': 'generated/subtlex/subtlex_{lang}.{ext}',
+       'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}',
        'combined': 'generated/combined/combined_{lang}.{ext}',
        'combined-dist': 'dist/combined_{lang}.{ext}',
        'twitter-dist': 'dist/twitter_{lang}.{ext}'
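The `read_values`/`merge_counts` split that `merge_counts.py` now relies on looks like this in use. The file names below are hypothetical; any CSV of word,count pairs works:

```python
from wordfreq_builder.word_counts import read_values, merge_counts, write_wordlist

# Hypothetical inputs: per-region count files produced by convert_subtlex
values_us, total_us = read_values('subtlex_en-US.processed.txt', cutoff=0)
values_gb, total_gb = read_values('subtlex_en-GB.processed.txt', cutoff=0)

# merge_counts sums the values word by word; because these are raw counts
# rather than normalized frequencies, the totals are simply discarded here
merged = merge_counts([values_us, values_gb])
write_wordlist(merged, 'subtlex_en.counts.txt')
```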
diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py
index 65773d6..2ae66c4 100644
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -84,9 +84,9 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
         )
     )
     lines.extend(
-        subtlex_zh_deps(
+        subtlex_other_deps(
             data_filename('source-lists/subtlex'),
-            CONFIG['sources']['subtlex-zh']
+            CONFIG['sources']['subtlex-other']
         )
     )
     lines.extend(combine_lists(all_languages()))
@@ -208,6 +208,17 @@ def opensubtitles_deps(dirname_in, languages):
     return lines
 
 
+# Which columns of the SUBTLEX data files do the word and its frequency appear
+# in?
+SUBTLEX_COLUMN_MAP = {
+    'de': (1, 3),
+    'el': (2, 3),
+    'en': (1, 2),
+    'nl': (1, 2),
+    'zh': (1, 5)
+}
+
+
 def subtlex_en_deps(dirname_in, languages):
     lines = []
     assert languages == ['en']
@@ -217,11 +228,12 @@ def subtlex_en_deps(dirname_in, languages):
         input_file = '{prefix}/subtlex.{region}.txt'.format(
             prefix=dirname_in, region=region
         )
+        textcol, freqcol = SUBTLEX_COLUMN_MAP['en']
         processed_file = wordlist_filename('subtlex-en', region, 'processed.txt')
         processed_files.append(processed_file)
         add_dep(
             lines, 'convert_subtlex', input_file, processed_file,
-            params={'col': 2}
+            params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2}
         )
 
     output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt')
@@ -230,17 +242,25 @@ def subtlex_en_deps(dirname_in, languages):
-def subtlex_zh_deps(dirname_in, languages):
+def subtlex_other_deps(dirname_in, languages):
     lines = []
     for language in languages:
         input_file = '{prefix}/subtlex.{lang}.txt'.format(
             prefix=dirname_in, lang=language
         )
-        processed_file = wordlist_filename('subtlex-zh', language, 'processed.txt')
-        output_file = wordlist_filename('subtlex-zh', language, 'counts.txt')
+        processed_file = wordlist_filename('subtlex-other', language, 'processed.txt')
+        output_file = wordlist_filename('subtlex-other', language, 'counts.txt')
+        textcol, freqcol = SUBTLEX_COLUMN_MAP[language]
+
+        # Greek has three extra header lines for no reason
+        if language == 'el':
+            startrow = 5
+        else:
+            startrow = 2
+
         add_dep(
             lines, 'convert_subtlex', input_file, processed_file,
-            params={'col': 5}
+            params={'textcol': textcol, 'freqcol': freqcol, 'startrow': startrow}
         )
         add_dep(
             lines, 'merge_counts', processed_file, output_file
         )
 
     return lines
@@ -259,7 +279,7 @@ def combine_lists(languages):
         output_file = wordlist_filename('combined', language)
         add_dep(lines, 'merge', input_files, output_file,
                 extra='wordfreq_builder/word_counts.py',
-                params={'cutoff': 2})
+                params={'cutoff': 0})
 
         output_cBpack = wordlist_filename(
             'combined-dist', language, 'msgpack.gz')
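Since `SUBTLEX_COLUMN_MAP` hard-codes 1-based column positions per language, a throwaway check along these lines (not part of the build; the path layout assumed here follows `data/source-lists/subtlex/subtlex.<lang>.txt` from the README) can confirm each entry really points at a word column and a numeric frequency column:

```python
import os

SUBTLEX_DIR = 'data/source-lists/subtlex'    # assumed location of the raw files

def check_column_map(column_map):
    for lang, (textcol, freqcol) in sorted(column_map.items()):
        path = os.path.join(SUBTLEX_DIR, 'subtlex.{}.txt'.format(lang))
        startrow = 5 if lang == 'el' else 2  # Greek's extra header lines
        with open(path, encoding='utf-8') as infile:
            for lineno, line in enumerate(infile, 1):
                if lineno < startrow:
                    continue
                cols = line.rstrip('\n').split('\t')
                float(cols[freqcol - 1])     # raises ValueError if not numeric
                print(lang, repr(cols[textcol - 1]))
                break
```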
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index 63d1980..1933295 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -32,19 +32,20 @@ def count_tokens(filename):
     return counts
 
 
-def read_freqs(filename, cutoff=0, lang=None):
+def read_values(filename, cutoff=0, lang=None):
     """
-    Read words and their frequencies from a CSV file.
+    Read words and their frequency or count values from a CSV file. Returns
+    a dictionary of values and the total of all values.
 
-    Only words with a frequency greater than or equal to `cutoff` are returned.
+    Only words with a value greater than or equal to `cutoff` are returned.
 
-    If `cutoff` is greater than 0, the csv file must be sorted by frequency
+    If `cutoff` is greater than 0, the csv file must be sorted by value
     in descending order.
 
-    If lang is given, read_freqs will apply language specific preprocessing
+    If lang is given, it will apply language-specific preprocessing
     operations.
     """
-    raw_counts = defaultdict(float)
+    values = defaultdict(float)
     total = 0.
     with open(filename, encoding='utf-8', newline='') as infile:
         for key, strval in csv.reader(infile):
@@ -56,13 +57,29 @@ def read_freqs(filename, cutoff=0, lang=None):
             for token in tokens:
                 # Use += so that, if we give the reader concatenated files with
                 # duplicates, it does the right thing
-                raw_counts[token] += val
+                values[token] += val
                 total += val
+    return values, total
 
-    for word in raw_counts:
-        raw_counts[word] /= total
 
-    return raw_counts
+def read_freqs(filename, cutoff=0, lang=None):
+    """
+    Read words and their frequencies from a CSV file, normalizing the
+    frequencies to add up to 1.
+
+    Only words with a frequency greater than or equal to `cutoff` are returned.
+
+    If `cutoff` is greater than 0, the csv file must be sorted by frequency
+    in descending order.
+
+    If lang is given, read_freqs will apply language-specific preprocessing
+    operations.
+    """
+    values, total = read_values(filename, cutoff, lang)
+    for word in values:
+        values[word] /= total
+
+    return values
 
 
 def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
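The division of labor after this refactor: `read_values` returns raw values plus their total, and `read_freqs` does the normalizing. A small worked example, with made-up file contents and assuming the default tokenization leaves these words intact:

```python
from wordfreq_builder.word_counts import read_values, read_freqs

# Suppose tiny.counts.txt contains:
#   the,6
#   of,3
#   and,1
values, total = read_values('tiny.counts.txt')
# values == {'the': 6.0, 'of': 3.0, 'and': 1.0}; total == 10.0

freqs = read_freqs('tiny.counts.txt')
# Each value is divided by the total, so the frequencies sum to 1:
# freqs == {'the': 0.6, 'of': 0.3, 'and': 0.1}
```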