Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-23 09:21:37 +00:00)
parent 8035df998a · commit d0ada70355
@@ -174,14 +174,23 @@ following reasonable conditions:

 `data/source-lists/subtlex` contains the following files:

-- `subtlex.en-US.txt`, which was downloaded from [here][subtlex-us],
+- `subtlex.de.txt`, which was downloaded as [SUBTLEX-DE raw file.xlsx][subtlex-de],
+  and exported from Excel format to tab-separated UTF-8 using LibreOffice
+- `subtlex.el.txt`, which was downloaded as [SUBTLEX-GR\_CD.txt][subtlex-gr]
+- `subtlex.en-US.txt`, which was downloaded as [subtlexus5.zip][subtlex-us],
   extracted, and converted from ISO-8859-1 to UTF-8
-- `subtlex.en-GB.txt`, which was exported as tab-separated UTF-8
-  from [this Excel file][subtlex-uk]
-- `subtlex.zh.txt`, which was downloaded and extracted from
-  [here][subtlex-ch]
+- `subtlex.en-GB.txt`, which was downloaded as
+  [SUBTLEX-UK\_all.xlsx][subtlex-uk], and exported from Excel format to
+  tab-separated UTF-8 using LibreOffice
+- `subtlex.nl.txt`, which was downloaded as
+  [SUBTLEX-NL.cd-above2.txt.zip][subtlex-nl] and extracted
+- `subtlex.zh.txt`, which was downloaded as
+  [subtlexch131210.zip][subtlex-ch] and extracted

+[subtlex-de]: http://crr.ugent.be/SUBTLEX-DE/SUBTLEX-DE%20raw%20file.xlsx
+[subtlex-gr]: http://www.bcbl.eu/bcbl-corporativa/wp-content/uploads/2013/01/SUBTLEX-GR_CD.txt
 [subtlex-us]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexus/subtlexus5.zip
 [subtlex-uk]: http://crr.ugent.be/papers/SUBTLEX-UK_all.xlsx
+[subtlex-nl]: http://crr.ugent.be/subtlex-nl/SUBTLEX-NL.cd-above2.txt.zip
 [subtlex-ch]: http://www.ugent.be/pp/experimentele-psychologie/en/research/documents/subtlexch/subtlexch131210.zip
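Note: the "converted from ISO-8859-1 to UTF-8" step above was done by hand, outside the build. A minimal sketch of that re-encoding, assuming a plain text file (the helper name is made up, not part of wordfreq_builder):

```python
# Hypothetical re-encoding helper, mirroring the manual step described above.
def reencode(in_path, out_path, source_encoding='iso-8859-1'):
    with open(in_path, encoding=source_encoding) as infile, \
         open(out_path, 'w', encoding='utf-8') as outfile:
        for line in infile:
            outfile.write(line)
```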
@@ -56,10 +56,11 @@ rule convert_leeds

 rule convert_opensubtitles
   command = tr ' ' ',' < $in > $out

-# To convert SUBTLEX, we take the 1st and Nth columns, strip the header, convert
-# tabs to commas and commas to nothing, and remove obvious mojibake.
+# To convert SUBTLEX, we take the 1st and Nth columns, strip the header,
+# run it through ftfy, convert tabs to commas and spurious CSV formatting to
+# spaces, and remove lines with unfixable half-mojibake.
 rule convert_subtlex
-  command = cut -f 1,$col $in | tail -n +2 | tr ' ,' ', ' | grep -v 'â,' > $out
+  command = cut -f $textcol,$freqcol $in | tail -n +$startrow | ftfy | tr ' ",' ', ' | grep -v 'â,' > $out

 # Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
 # the input files, keep only the single words and their counts, and only keep
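For readers who don't parse shell pipelines on sight, here is an approximate Python rendering of the new `convert_subtlex` rule. It is a sketch, not code from the repository, and it assumes the `ftfy` library is installed (the rule itself pipes through ftfy's command-line tool):

```python
import ftfy

def convert_subtlex(in_path, out_path, textcol, freqcol, startrow):
    # Roughly: cut -f $textcol,$freqcol | tail -n +$startrow | ftfy
    #          | tr '<tab>",' ', ' | grep -v 'â,'
    with open(in_path, encoding='utf-8') as infile, \
         open(out_path, 'w', encoding='utf-8') as outfile:
        for lineno, line in enumerate(infile, start=1):
            if lineno < startrow:
                continue                        # strip the header row(s)
            fields = line.rstrip('\n').split('\t')
            word = fields[textcol - 1]          # cut's columns are 1-based
            freq = fields[freqcol - 1]
            # ftfy repairs mojibake; quotes and stray commas become spaces
            word = ftfy.fix_text(word).replace('"', ' ').replace(',', ' ')
            row = word + ',' + freq
            if 'â,' in row:
                continue                        # unfixable half-mojibake
            outfile.write(row + '\n')
```

The `grep -v 'â,'` heuristic drops lines where the word still ends in the telltale 'â' of half-decoded UTF-8 even after ftfy has had its chance.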
@@ -1,11 +1,12 @@
-from wordfreq_builder.word_counts import read_freqs, merge_counts, write_wordlist
+from wordfreq_builder.word_counts import read_values, merge_counts, write_wordlist
 import argparse


 def merge_lists(input_names, output_name):
     count_dicts = []
     for input_name in input_names:
-        count_dicts.append(read_freqs(input_name, cutoff=0))
+        values, total = read_values(input_name, cutoff=0)
+        count_dicts.append(values)
     merged = merge_counts(count_dicts)
     write_wordlist(merged, output_name)
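With this change the merge script operates on raw values rather than normalized frequencies; the `total` returned by `read_values` is simply discarded here. For context, `merge_counts` (from wordfreq_builder.word_counts) presumably reduces to a per-word summation like the following; a guess consistent with its call site, not the actual implementation:

```python
from collections import defaultdict

def merge_counts_sketch(count_dicts):
    # Sum each word's value across all the input dictionaries.
    merged = defaultdict(float)
    for counts in count_dicts:
        for word, value in counts.items():
            merged[word] += value
    return merged
```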
@@ -13,6 +13,7 @@ CONFIG = {
         # 'th' when we get tokenization for it
         # 'hi' when we stop messing up its tokenization
         # 'tl' because it's probably ready right now
+        # 'pl' because we have 3 sources for it
         'twitter': [
             'ar', 'de', 'el', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
             'pt', 'ru', 'tr'
@@ -38,7 +39,7 @@ CONFIG = {
             # Russian, Spanish, and (Simplified) Chinese.
         ],
         'subtlex-en': ['en'],
-        'subtlex-zh': ['zh'],
+        'subtlex-other': ['de', 'el', 'nl', 'zh'],
     },
     # Subtlex languages that need to be pre-processed
     'wordlist_paths': {
@@ -48,7 +49,7 @@ CONFIG = {
         'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}',
         'google-books': 'generated/google-books/google_books_{lang}.{ext}',
         'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
-        'subtlex-zh': 'generated/subtlex/subtlex_{lang}.{ext}',
+        'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}',
         'combined': 'generated/combined/combined_{lang}.{ext}',
         'combined-dist': 'dist/combined_{lang}.{ext}',
         'twitter-dist': 'dist/twitter_{lang}.{ext}'
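Note that 'subtlex-en' and 'subtlex-other' deliberately share one path template, so every SUBTLEX wordlist lands in the same generated/subtlex directory. A hypothetical illustration of how these templates presumably get filled in by `wordlist_filename`:

```python
template = 'generated/subtlex/subtlex_{lang}.{ext}'
print(template.format(lang='nl', ext='counts.txt'))
# generated/subtlex/subtlex_nl.counts.txt
```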
|
@ -84,9 +84,9 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
|
||||
)
|
||||
)
|
||||
lines.extend(
|
||||
subtlex_zh_deps(
|
||||
subtlex_other_deps(
|
||||
data_filename('source-lists/subtlex'),
|
||||
CONFIG['sources']['subtlex-zh']
|
||||
CONFIG['sources']['subtlex-other']
|
||||
)
|
||||
)
|
||||
lines.extend(combine_lists(all_languages()))
|
||||
@@ -208,6 +208,17 @@ def opensubtitles_deps(dirname_in, languages):
     return lines


+# Which columns of the SUBTLEX data files do the word and its frequency appear
+# in?
+SUBTLEX_COLUMN_MAP = {
+    'de': (1, 3),
+    'el': (2, 3),
+    'en': (1, 2),
+    'nl': (1, 2),
+    'zh': (1, 5)
+}
+
+
 def subtlex_en_deps(dirname_in, languages):
     lines = []
     assert languages == ['en']
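The pairs in SUBTLEX_COLUMN_MAP are 1-based (word column, frequency column) indexes: they are substituted straight into the `cut -f` invocation of the convert_subtlex rule, not used as Python list indexes. For example:

```python
SUBTLEX_COLUMN_MAP = {'de': (1, 3), 'el': (2, 3), 'en': (1, 2),
                      'nl': (1, 2), 'zh': (1, 5)}
textcol, freqcol = SUBTLEX_COLUMN_MAP['zh']
# textcol == 1, freqcol == 5: subtlex.zh.txt keeps the word in column 1
# and its frequency count in column 5, so the rule runs `cut -f 1,5`.
```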
@@ -217,11 +228,12 @@ def subtlex_en_deps(dirname_in, languages):
         input_file = '{prefix}/subtlex.{region}.txt'.format(
             prefix=dirname_in, region=region
         )
+        textcol, freqcol = SUBTLEX_COLUMN_MAP['en']
         processed_file = wordlist_filename('subtlex-en', region, 'processed.txt')
         processed_files.append(processed_file)
         add_dep(
             lines, 'convert_subtlex', input_file, processed_file,
-            params={'col': 2}
+            params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2}
         )

     output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt')
@@ -230,17 +242,25 @@ def subtlex_en_deps(dirname_in, languages):
     return lines


-def subtlex_zh_deps(dirname_in, languages):
+def subtlex_other_deps(dirname_in, languages):
     lines = []
     for language in languages:
         input_file = '{prefix}/subtlex.{lang}.txt'.format(
             prefix=dirname_in, lang=language
         )
-        processed_file = wordlist_filename('subtlex-zh', language, 'processed.txt')
-        output_file = wordlist_filename('subtlex-zh', language, 'counts.txt')
+        processed_file = wordlist_filename('subtlex-other', language, 'processed.txt')
+        output_file = wordlist_filename('subtlex-other', language, 'counts.txt')
+        textcol, freqcol = SUBTLEX_COLUMN_MAP[language]
+
+        # Greek has three extra header lines for no reason
+        if language == 'el':
+            startrow = 5
+        else:
+            startrow = 2
+
         add_dep(
             lines, 'convert_subtlex', input_file, processed_file,
-            params={'col': 5}
+            params={'textcol': textcol, 'freqcol': freqcol, 'startrow': startrow}
         )
         add_dep(
             lines, 'merge_counts', processed_file, output_file
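The `startrow` parameter feeds `tail -n +$startrow`, which prints from line `startrow` onward: startrow=2 drops the single header line, while startrow=5 drops the Greek file's four (the usual header plus its three extras). The same selection in Python, with made-up data:

```python
lines = ['header', 'extra 1', 'extra 2', 'extra 3', 'first real row']
startrow = 5
print(lines[startrow - 1:])   # equivalent to `tail -n +5`
# ['first real row']
```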
@@ -259,7 +279,7 @@ def combine_lists(languages):
         output_file = wordlist_filename('combined', language)
         add_dep(lines, 'merge', input_files, output_file,
                 extra='wordfreq_builder/word_counts.py',
-                params={'cutoff': 2})
+                params={'cutoff': 0})

         output_cBpack = wordlist_filename(
             'combined-dist', language, 'msgpack.gz')
@@ -32,19 +32,20 @@ def count_tokens(filename):
     return counts


-def read_freqs(filename, cutoff=0, lang=None):
+def read_values(filename, cutoff=0, lang=None):
     """
-    Read words and their frequencies from a CSV file.
+    Read words and their frequency or count values from a CSV file. Returns
+    a dictionary of values and the total of all values.

-    Only words with a frequency greater than or equal to `cutoff` are returned.
+    Only words with a value greater than or equal to `cutoff` are returned.

-    If `cutoff` is greater than 0, the csv file must be sorted by frequency
+    If `cutoff` is greater than 0, the csv file must be sorted by value
     in descending order.

-    If lang is given, read_freqs will apply language specific preprocessing
+    If lang is given, it will apply language specific preprocessing
     operations.
     """
-    raw_counts = defaultdict(float)
+    values = defaultdict(float)
     total = 0.
     with open(filename, encoding='utf-8', newline='') as infile:
         for key, strval in csv.reader(infile):
@@ -56,13 +57,29 @@ def read_freqs(filename, cutoff=0, lang=None):
         for token in tokens:
             # Use += so that, if we give the reader concatenated files with
             # duplicates, it does the right thing
-            raw_counts[token] += val
+            values[token] += val
             total += val
+    return values, total

-    for word in raw_counts:
-        raw_counts[word] /= total
-
-    return raw_counts
+
+def read_freqs(filename, cutoff=0, lang=None):
+    """
+    Read words and their frequencies from a CSV file, normalizing the
+    frequencies to add up to 1.
+
+    Only words with a frequency greater than or equal to `cutoff` are returned.
+
+    If `cutoff` is greater than 0, the csv file must be sorted by frequency
+    in descending order.
+
+    If lang is given, read_freqs will apply language specific preprocessing
+    operations.
+    """
+    values, total = read_values(filename, cutoff, lang)
+    for word in values:
+        values[word] /= total
+
+    return values


 def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
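This refactoring separates reading from normalization: `read_values` returns the raw per-word values plus their total, and `read_freqs` divides through by that total. A quick usage sketch with a made-up two-line CSV:

```python
# counts.csv (made-up contents):
#   the,120
#   cat,80
values, total = read_values('counts.csv')
print(total)           # 200.0
freqs = read_freqs('counts.csv')
print(freqs['the'])    # 0.6 -- normalized so all frequencies sum to 1
```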