add more SUBTLEX and fix its build rules

Former-commit-id: 34474939f2
Robyn Speer 2015-09-04 12:37:35 -04:00
parent 8035df998a
commit d0ada70355
6 changed files with 79 additions and 30 deletions

View File

@@ -174,14 +174,23 @@ following reasonable conditions:

 `data/source-lists/subtlex` contains the following files:

-- `subtlex.en-US.txt`, which was downloaded from [here][subtlex-us],
+- `subtlex.de.txt`, which was downloaded as [SUBTLEX-DE raw file.xlsx][subtlex-de],
+  and exported from Excel format to tab-separated UTF-8 using LibreOffice
+- `subtlex.el.txt`, which was downloaded as [SUBTLEX-GR\_CD.txt][subtlex-gr]
+- `subtlex.en-US.txt`, which was downloaded as [subtlexus5.zip][subtlex-us],
   extracted, and converted from ISO-8859-1 to UTF-8
-- `subtlex.en-GB.txt`, which was exported as tab-separated UTF-8
-  from [this Excel file][subtlex-uk]
-- `subtlex.zh.txt`, which was downloaded and extracted from
-  [here][subtlex-ch]
+- `subtlex.en-GB.txt`, which was downloaded as
+  [SUBTLEX-UK\_all.xlsx][subtlex-uk], and exported from Excel format to
+  tab-separated UTF-8 using LibreOffice
+- `subtlex.nl.txt`, which was downloaded as
+  [SUBTLEX-NL.cd-above2.txt.zip][subtlex-nl] and extracted
+- `subtlex.zh.txt`, which was downloaded as
+  [subtlexch131210.zip][subtlex-ch] and extracted

+[subtlex-de]: http://crr.ugent.be/SUBTLEX-DE/SUBTLEX-DE%20raw%20file.xlsx
+[subtlex-gr]: http://www.bcbl.eu/bcbl-corporativa/wp-content/uploads/2013/01/SUBTLEX-GR_CD.txt
 [subtlex-us]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexus/subtlexus5.zip
 [subtlex-uk]: http://crr.ugent.be/papers/SUBTLEX-UK_all.xlsx
+[subtlex-nl]: http://crr.ugent.be/subtlex-nl/SUBTLEX-NL.cd-above2.txt.zip
 [subtlex-ch]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexch/subtlexch131210.zip

View File

@@ -56,10 +56,11 @@ rule convert_leeds
 rule convert_opensubtitles
   command = tr ' ' ',' < $in > $out

-# To convert SUBTLEX, we take the 1st and Nth columns, strip the header, convert
-# tabs to commas and commas to nothing, and remove obvious mojibake.
+# To convert SUBTLEX, we take the 1st and Nth columns, strip the header,
+# run it through ftfy, convert tabs to commas and spurious CSV formatting to
+# spaces, and remove lines with unfixable half-mojibake.
 rule convert_subtlex
-  command = cut -f 1,$col $in | tail -n +2 | tr ' ,' ', ' | grep -v 'â,' > $out
+  command = cut -f $textcol,$freqcol $in | tail -n +$startrow | ftfy | tr ' ",' ', ' | grep -v 'â,' > $out

 # Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
 # the input files, keep only the single words and their counts, and only keep
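
As a reading aid, here is a rough standalone Python equivalent of the new `convert_subtlex` rule. It is a sketch under the assumption of a tab-separated input with 1-based column numbers (matching the `cut` convention), and it approximates the `tr`/`grep` steps; it is not part of the build:

```python
from ftfy import fix_text

def convert_subtlex(in_path, out_path, textcol, freqcol, startrow):
    """Approximate the pipeline: cut | tail -n +startrow | ftfy | tr | grep -v."""
    with open(in_path, encoding='utf-8') as infile, \
         open(out_path, 'w', encoding='utf-8') as outfile:
        for lineno, line in enumerate(infile, start=1):
            if lineno < startrow:
                continue  # strip the header row(s), like `tail -n +$startrow`
            cells = line.rstrip('\n').split('\t')
            word = fix_text(cells[textcol - 1])
            # like `tr`: turn quotes and stray commas into spaces
            word = word.replace('"', ' ').replace(',', ' ').strip()
            if 'â' in word:
                continue  # drop unfixable half-mojibake, like `grep -v 'â,'`
            outfile.write('{},{}\n'.format(word, cells[freqcol - 1]))
```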

View File

@@ -1,11 +1,12 @@
-from wordfreq_builder.word_counts import read_freqs, merge_counts, write_wordlist
+from wordfreq_builder.word_counts import read_values, merge_counts, write_wordlist
 import argparse


 def merge_lists(input_names, output_name):
     count_dicts = []
     for input_name in input_names:
-        count_dicts.append(read_freqs(input_name, cutoff=0))
+        values, total = read_values(input_name, cutoff=0)
+        count_dicts.append(values)
     merged = merge_counts(count_dicts)
     write_wordlist(merged, output_name)
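
To make the change concrete: `read_values` now returns unnormalized values plus their total, and merging sums the values per word across lists. A tiny sketch with hypothetical numbers, assuming `merge_counts` sums per-word values (which is how it is used here):

```python
counts_us = {'the': 1500000.0, 'you': 980000.0}  # hypothetical data
counts_gb = {'the': 880000.0, 'colour': 4400.0}

merged = {}
for counts in (counts_us, counts_gb):
    for word, value in counts.items():
        merged[word] = merged.get(word, 0.0) + value

# merged == {'the': 2380000.0, 'you': 980000.0, 'colour': 4400.0}
```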

View File

@@ -13,6 +13,7 @@ CONFIG = {
         # 'th' when we get tokenization for it
         # 'hi' when we stop messing up its tokenization
         # 'tl' because it's probably ready right now
+        # 'pl' because we have 3 sources for it
         'twitter': [
             'ar', 'de', 'el', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
             'pt', 'ru', 'tr'
@@ -38,7 +39,7 @@ CONFIG = {
             # Russian, Spanish, and (Simplified) Chinese.
         ],
         'subtlex-en': ['en'],
-        'subtlex-zh': ['zh'],
+        'subtlex-other': ['de', 'el', 'nl', 'zh'],
     },
     # Subtlex languages that need to be pre-processed
     'wordlist_paths': {
@@ -48,7 +49,7 @@ CONFIG = {
         'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}',
         'google-books': 'generated/google-books/google_books_{lang}.{ext}',
         'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
-        'subtlex-zh': 'generated/subtlex/subtlex_{lang}.{ext}',
+        'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}',
         'combined': 'generated/combined/combined_{lang}.{ext}',
         'combined-dist': 'dist/combined_{lang}.{ext}',
         'twitter-dist': 'dist/twitter_{lang}.{ext}'
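
The path templates above are filled in per source, language, and extension. A minimal sketch of the expansion, assuming `wordlist_filename` formats the configured pattern (as its call sites below suggest):

```python
template = 'generated/subtlex/subtlex_{lang}.{ext}'  # the 'subtlex-other' entry
path = template.format(lang='el', ext='counts.txt')
assert path == 'generated/subtlex/subtlex_el.counts.txt'
```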

View File

@@ -84,9 +84,9 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
         )
     )
     lines.extend(
-        subtlex_zh_deps(
+        subtlex_other_deps(
             data_filename('source-lists/subtlex'),
-            CONFIG['sources']['subtlex-zh']
+            CONFIG['sources']['subtlex-other']
         )
     )
     lines.extend(combine_lists(all_languages()))
@@ -208,6 +208,17 @@ def opensubtitles_deps(dirname_in, languages):
     return lines


+# Which columns of the SUBTLEX data files do the word and its frequency appear
+# in?
+SUBTLEX_COLUMN_MAP = {
+    'de': (1, 3),
+    'el': (2, 3),
+    'en': (1, 2),
+    'nl': (1, 2),
+    'zh': (1, 5)
+}
+
+
 def subtlex_en_deps(dirname_in, languages):
     lines = []
     assert languages == ['en']
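
For concreteness, here is one entry of `SUBTLEX_COLUMN_MAP` plugged into the `convert_subtlex` rule above (a sketch of the resulting command, not output copied from the build):

```python
textcol, freqcol = SUBTLEX_COLUMN_MAP['zh']  # (1, 5): word in col 1, freq in col 5
# The generated build step then effectively runs:
#   cut -f 1,5 subtlex.zh.txt | tail -n +2 | ftfy | ...
```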
@@ -217,11 +228,12 @@ def subtlex_en_deps(dirname_in, languages):
         input_file = '{prefix}/subtlex.{region}.txt'.format(
             prefix=dirname_in, region=region
         )
+        textcol, freqcol = SUBTLEX_COLUMN_MAP['en']
         processed_file = wordlist_filename('subtlex-en', region, 'processed.txt')
         processed_files.append(processed_file)
         add_dep(
             lines, 'convert_subtlex', input_file, processed_file,
-            params={'col': 2}
+            params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2}
         )

     output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt')
@@ -230,17 +242,25 @@ def subtlex_en_deps(dirname_in, languages):
     return lines


-def subtlex_zh_deps(dirname_in, languages):
+def subtlex_other_deps(dirname_in, languages):
     lines = []
     for language in languages:
         input_file = '{prefix}/subtlex.{lang}.txt'.format(
             prefix=dirname_in, lang=language
         )
-        processed_file = wordlist_filename('subtlex-zh', language, 'processed.txt')
-        output_file = wordlist_filename('subtlex-zh', language, 'counts.txt')
+        processed_file = wordlist_filename('subtlex-other', language, 'processed.txt')
+        output_file = wordlist_filename('subtlex-other', language, 'counts.txt')
+        textcol, freqcol = SUBTLEX_COLUMN_MAP[language]
+
+        # Greek has three extra header lines for no reason
+        if language == 'el':
+            startrow = 5
+        else:
+            startrow = 2
+
         add_dep(
             lines, 'convert_subtlex', input_file, processed_file,
-            params={'col': 5}
+            params={'textcol': textcol, 'freqcol': freqcol, 'startrow': startrow}
         )
         add_dep(
             lines, 'merge_counts', processed_file, output_file
@@ -259,7 +279,7 @@ def combine_lists(languages):
         output_file = wordlist_filename('combined', language)
         add_dep(lines, 'merge', input_files, output_file,
                 extra='wordfreq_builder/word_counts.py',
-                params={'cutoff': 2})
+                params={'cutoff': 0})

         output_cBpack = wordlist_filename(
             'combined-dist', language, 'msgpack.gz')

View File

@@ -32,19 +32,20 @@ def count_tokens(filename):
     return counts


-def read_freqs(filename, cutoff=0, lang=None):
+def read_values(filename, cutoff=0, lang=None):
     """
-    Read words and their frequencies from a CSV file.
+    Read words and their frequency or count values from a CSV file. Returns
+    a dictionary of values and the total of all values.

-    Only words with a frequency greater than or equal to `cutoff` are returned.
+    Only words with a value greater than or equal to `cutoff` are returned.

-    If `cutoff` is greater than 0, the csv file must be sorted by frequency
+    If `cutoff` is greater than 0, the csv file must be sorted by value
     in descending order.

-    If lang is given, read_freqs will apply language specific preprocessing
+    If lang is given, it will apply language specific preprocessing
     operations.
     """
-    raw_counts = defaultdict(float)
+    values = defaultdict(float)
     total = 0.
     with open(filename, encoding='utf-8', newline='') as infile:
         for key, strval in csv.reader(infile):
@@ -56,13 +57,29 @@ def read_freqs(filename, cutoff=0, lang=None):
             for token in tokens:
                 # Use += so that, if we give the reader concatenated files with
                 # duplicates, it does the right thing
-                raw_counts[token] += val
+                values[token] += val
                 total += val

-    for word in raw_counts:
-        raw_counts[word] /= total
-
-    return raw_counts
+    return values, total
+
+
+def read_freqs(filename, cutoff=0, lang=None):
+    """
+    Read words and their frequencies from a CSV file, normalizing the
+    frequencies to add up to 1.
+
+    Only words with a frequency greater than or equal to `cutoff` are returned.
+
+    If `cutoff` is greater than 0, the csv file must be sorted by frequency
+    in descending order.
+
+    If lang is given, read_freqs will apply language specific preprocessing
+    operations.
+    """
+    values, total = read_values(filename, cutoff, lang)
+    for word in values:
+        values[word] /= total
+    return values


 def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
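
A minimal standalone illustration of the new `read_values` / `read_freqs` split, using a hypothetical two-word input:

```python
# Suppose the CSV contains the rows:  the,200  and  word,50
values, total = {'the': 200.0, 'word': 50.0}, 250.0  # what read_values returns

# read_freqs then divides by the total so the frequencies sum to 1:
freqs = {word: value / total for word, value in values.items()}
assert freqs == {'the': 0.8, 'word': 0.2}
```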