mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-24 01:41:39 +00:00)

Commit 34474939f2 (parent c11e3b7a9d): add more SUBTLEX and fix its build rules
@@ -174,14 +174,23 @@ following reasonable conditions:

 `data/source-lists/subtlex` contains the following files:

-- `subtlex.en-US.txt`, which was downloaded from [here][subtlex-us],
+- `subtlex.de.txt`, which was downloaded as [SUBTLEX-DE raw file.xlsx][subtlex-de],
+  and exported from Excel format to tab-separated UTF-8 using LibreOffice
+- `subtlex.el.txt`, which was downloaded as [SUBTLEX-GR\_CD.txt][subtlex-gr]
+- `subtlex.en-US.txt`, which was downloaded as [subtlexus5.zip][subtlex-us],
   extracted, and converted from ISO-8859-1 to UTF-8
-- `subtlex.en-GB.txt`, which was exported as tab-separated UTF-8
-  from [this Excel file][subtlex-uk]
-- `subtlex.zh.txt`, which was downloaded and extracted from
-  [here][subtlex-ch]
+- `subtlex.en-GB.txt`, which was downloaded as
+  [SUBTLEX-UK\_all.xlsx][subtlex-uk], and exported from Excel format to
+  tab-separated UTF-8 using LibreOffice
+- `subtlex.nl.txt`, which was downloaded as
+  [SUBTLEX-NL.cd-above2.txt.zip][subtlex-nl] and extracted
+- `subtlex.zh.txt`, which was downloaded as
+  [subtlexch131210.zip][subtlex-ch] and extracted

+[subtlex-de]: http://crr.ugent.be/SUBTLEX-DE/SUBTLEX-DE%20raw%20file.xlsx
+[subtlex-gr]: http://www.bcbl.eu/bcbl-corporativa/wp-content/uploads/2013/01/SUBTLEX-GR_CD.txt
 [subtlex-us]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexus/subtlexus5.zip
 [subtlex-uk]: http://crr.ugent.be/papers/SUBTLEX-UK_all.xlsx
+[subtlex-nl]: http://crr.ugent.be/subtlex-nl/SUBTLEX-NL.cd-above2.txt.zip
 [subtlex-ch]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexch/subtlexch131210.zip
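
The Excel-to-TSV exports described above were done by hand in LibreOffice. A scripted equivalent would look roughly like this; an illustrative sketch only, assuming the openpyxl package is available and using the en-GB workbook named above:

import csv
from openpyxl import load_workbook

workbook = load_workbook('SUBTLEX-UK_all.xlsx', read_only=True)
sheet = workbook.active
with open('subtlex.en-GB.txt', 'w', encoding='utf-8', newline='') as out:
    writer = csv.writer(out, delimiter='\t')
    for row in sheet.iter_rows(values_only=True):
        # Empty spreadsheet cells come back as None; write them as ''
        writer.writerow(['' if cell is None else cell for cell in row])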
@@ -56,10 +56,11 @@ rule convert_leeds
 rule convert_opensubtitles
   command = tr ' ' ',' < $in > $out

-# To convert SUBTLEX, we take the 1st and Nth columns, strip the header, convert
-# tabs to commas and commas to nothing, and remove obvious mojibake.
+# To convert SUBTLEX, we take the 1st and Nth columns, strip the header,
+# run it through ftfy, convert tabs to commas and spurious CSV formatting to
+# spaces, and remove lines with unfixable half-mojibake.
 rule convert_subtlex
-  command = cut -f 1,$col $in | tail -n +2 | tr ' ,' ', ' | grep -v 'â,' > $out
+  command = cut -f $textcol,$freqcol $in | tail -n +$startrow | ftfy | tr ' ",' ', ' | grep -v 'â,' > $out

 # Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
 # the input files, keep only the single words and their counts, and only keep
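
A rough Python equivalent of the new convert_subtlex pipeline, to make each stage explicit. This is an illustrative sketch, not the builder's code, and it assumes the tr step maps tabs to commas and quote/comma characters to spaces, as the rule's comment describes:

import ftfy  # ftfy.fix_text is the fixer the ftfy command-line filter applies

def convert_subtlex(in_path, out_path, textcol, freqcol, startrow):
    with open(in_path, encoding='utf-8') as infile, \
         open(out_path, 'w', encoding='utf-8') as outfile:
        for lineno, line in enumerate(infile, start=1):
            if lineno < startrow:        # tail -n +$startrow: skip header rows
                continue
            fields = line.rstrip('\n').split('\t')
            row = fields[textcol - 1] + '\t' + fields[freqcol - 1]  # cut -f
            row = ftfy.fix_text(row)     # repair recoverable mojibake
            # tab -> comma, quotes and stray commas -> spaces (the tr step)
            row = row.translate(str.maketrans('\t",', ',  '))
            if 'â,' in row:              # grep -v 'â,': drop half-mojibake lines
                continue
            outfile.write(row + '\n')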
@@ -1,11 +1,12 @@
-from wordfreq_builder.word_counts import read_freqs, merge_counts, write_wordlist
+from wordfreq_builder.word_counts import read_values, merge_counts, write_wordlist
 import argparse


 def merge_lists(input_names, output_name):
     count_dicts = []
     for input_name in input_names:
-        count_dicts.append(read_freqs(input_name, cutoff=0))
+        values, total = read_values(input_name, cutoff=0)
+        count_dicts.append(values)
     merged = merge_counts(count_dicts)
     write_wordlist(merged, output_name)
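
The switch from read_freqs to read_values means the merge now sums raw counts rather than per-source normalized frequencies, so larger sources carry proportionally more weight. A toy model of the merge behavior this relies on (the real merge_counts lives in wordfreq_builder.word_counts; this sketch only shows the assumed semantics):

from collections import defaultdict

def merge_counts_sketch(count_dicts):
    # Sum each word's counts across all of the input dictionaries.
    merged = defaultdict(float)
    for counts in count_dicts:
        for word, count in counts.items():
            merged[word] += count
    return dict(merged)

print(merge_counts_sketch([{'the': 100, 'cat': 3}, {'the': 80, 'dog': 5}]))
# {'the': 180.0, 'cat': 3.0, 'dog': 5.0}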
@@ -13,6 +13,7 @@ CONFIG = {
         # 'th' when we get tokenization for it
         # 'hi' when we stop messing up its tokenization
         # 'tl' because it's probably ready right now
+        # 'pl' because we have 3 sources for it
         'twitter': [
             'ar', 'de', 'el', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
             'pt', 'ru', 'tr'
@@ -38,7 +39,7 @@ CONFIG = {
             # Russian, Spanish, and (Simplified) Chinese.
         ],
         'subtlex-en': ['en'],
-        'subtlex-zh': ['zh'],
+        'subtlex-other': ['de', 'el', 'nl', 'zh'],
     },
     # Subtlex languages that need to be pre-processed
     'wordlist_paths': {
@@ -48,7 +49,7 @@ CONFIG = {
         'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}',
         'google-books': 'generated/google-books/google_books_{lang}.{ext}',
         'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
-        'subtlex-zh': 'generated/subtlex/subtlex_{lang}.{ext}',
+        'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}',
         'combined': 'generated/combined/combined_{lang}.{ext}',
         'combined-dist': 'dist/combined_{lang}.{ext}',
         'twitter-dist': 'dist/twitter_{lang}.{ext}'
@@ -84,9 +84,9 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
         )
     )
     lines.extend(
-        subtlex_zh_deps(
+        subtlex_other_deps(
             data_filename('source-lists/subtlex'),
-            CONFIG['sources']['subtlex-zh']
+            CONFIG['sources']['subtlex-other']
         )
     )
     lines.extend(combine_lists(all_languages()))
@@ -208,6 +208,17 @@ def opensubtitles_deps(dirname_in, languages):
     return lines


+# Which columns of the SUBTLEX data files do the word and its frequency appear
+# in?
+SUBTLEX_COLUMN_MAP = {
+    'de': (1, 3),
+    'el': (2, 3),
+    'en': (1, 2),
+    'nl': (1, 2),
+    'zh': (1, 5)
+}
+
+
 def subtlex_en_deps(dirname_in, languages):
     lines = []
     assert languages == ['en']
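
The pairs in SUBTLEX_COLUMN_MAP are 1-based (word column, frequency column) positions, because they are passed straight through to `cut -f`, which numbers fields from 1. For example:

textcol, freqcol = SUBTLEX_COLUMN_MAP['zh']  # (1, 5)
# ...which reaches the convert_subtlex rule as: cut -f 1,5 subtlex.zh.txt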
@@ -217,11 +228,12 @@ def subtlex_en_deps(dirname_in, languages):
         input_file = '{prefix}/subtlex.{region}.txt'.format(
             prefix=dirname_in, region=region
         )
+        textcol, freqcol = SUBTLEX_COLUMN_MAP['en']
         processed_file = wordlist_filename('subtlex-en', region, 'processed.txt')
         processed_files.append(processed_file)
         add_dep(
             lines, 'convert_subtlex', input_file, processed_file,
-            params={'col': 2}
+            params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2}
         )

     output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt')
@@ -230,17 +242,25 @@ def subtlex_en_deps(dirname_in, languages):
     return lines


-def subtlex_zh_deps(dirname_in, languages):
+def subtlex_other_deps(dirname_in, languages):
     lines = []
     for language in languages:
         input_file = '{prefix}/subtlex.{lang}.txt'.format(
             prefix=dirname_in, lang=language
         )
-        processed_file = wordlist_filename('subtlex-zh', language, 'processed.txt')
-        output_file = wordlist_filename('subtlex-zh', language, 'counts.txt')
+        processed_file = wordlist_filename('subtlex-other', language, 'processed.txt')
+        output_file = wordlist_filename('subtlex-other', language, 'counts.txt')
+        textcol, freqcol = SUBTLEX_COLUMN_MAP[language]
+
+        # Greek has three extra header lines for no reason
+        if language == 'el':
+            startrow = 5
+        else:
+            startrow = 2
+
         add_dep(
             lines, 'convert_subtlex', input_file, processed_file,
-            params={'col': 5}
+            params={'textcol': textcol, 'freqcol': freqcol, 'startrow': startrow}
         )
         add_dep(
             lines, 'merge_counts', processed_file, output_file
@@ -259,7 +279,7 @@ def combine_lists(languages):
         output_file = wordlist_filename('combined', language)
         add_dep(lines, 'merge', input_files, output_file,
                 extra='wordfreq_builder/word_counts.py',
-                params={'cutoff': 2})
+                params={'cutoff': 0})

         output_cBpack = wordlist_filename(
             'combined-dist', language, 'msgpack.gz')
@@ -32,19 +32,20 @@ def count_tokens(filename):
     return counts


-def read_freqs(filename, cutoff=0, lang=None):
+def read_values(filename, cutoff=0, lang=None):
     """
-    Read words and their frequencies from a CSV file.
+    Read words and their frequency or count values from a CSV file. Returns
+    a dictionary of values and the total of all values.

-    Only words with a frequency greater than or equal to `cutoff` are returned.
+    Only words with a value greater than or equal to `cutoff` are returned.

-    If `cutoff` is greater than 0, the csv file must be sorted by frequency
+    If `cutoff` is greater than 0, the csv file must be sorted by value
     in descending order.

-    If lang is given, read_freqs will apply language specific preprocessing
+    If lang is given, it will apply language specific preprocessing
     operations.
     """
-    raw_counts = defaultdict(float)
+    values = defaultdict(float)
     total = 0.
     with open(filename, encoding='utf-8', newline='') as infile:
         for key, strval in csv.reader(infile):
@@ -56,13 +57,29 @@ def read_freqs(filename, cutoff=0, lang=None):
             for token in tokens:
                 # Use += so that, if we give the reader concatenated files with
                 # duplicates, it does the right thing
-                raw_counts[token] += val
+                values[token] += val
                 total += val
+    return values, total

-    for word in raw_counts:
-        raw_counts[word] /= total

-    return raw_counts
+def read_freqs(filename, cutoff=0, lang=None):
+    """
+    Read words and their frequencies from a CSV file, normalizing the
+    frequencies to add up to 1.
+
+    Only words with a frequency greater than or equal to `cutoff` are returned.
+
+    If `cutoff` is greater than 0, the csv file must be sorted by frequency
+    in descending order.
+
+    If lang is given, read_freqs will apply language specific preprocessing
+    operations.
+    """
+    values, total = read_values(filename, cutoff, lang)
+    for word in values:
+        values[word] /= total
+
+    return values


 def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
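
A toy run of the new split, assuming a hypothetical file words.csv containing the two rows "the,100" and "cat,150": read_values returns the raw values plus their total, while read_freqs normalizes the same values to sum to 1.

values, total = read_values('words.csv', cutoff=0)
print(values['the'], total)   # 100.0 250.0

freqs = read_freqs('words.csv', cutoff=0)
print(freqs['the'])           # 0.4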