mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
Add SUBTLEX as a source of English and Chinese data
Meanwhile, fix up the dependency graph thingy. It's actually kind of
legible now.
Former-commit-id: 2d58ba94f2
This commit is contained in:
parent
4e8c15cb71
commit
f66d03b1b9
@ -1,30 +1,39 @@
|
|||||||
""" This file generates a graph of the dependencies for the ninja build."""
|
""" This file generates a graph of the dependencies for the ninja build."""
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
def ninja_to_dot():
|
def ninja_to_dot():
|
||||||
def last_component(path):
|
def simplified_filename(path):
|
||||||
return path.split('/')[-1]
|
component = path.split('/')[-1]
|
||||||
|
return re.sub(
|
||||||
|
r'[0-9]+-of', 'NN-of',
|
||||||
|
re.sub(r'part[0-9]+', 'partNN', component)
|
||||||
|
)
|
||||||
|
|
||||||
print("digraph G {")
|
print("digraph G {")
|
||||||
print('rankdir="LR";')
|
print('rankdir="LR";')
|
||||||
|
seen_edges = set()
|
||||||
for line in sys.stdin:
|
for line in sys.stdin:
|
||||||
line = line.rstrip()
|
line = line.rstrip()
|
||||||
if line.startswith('build'):
|
if line.startswith('build'):
|
||||||
# the output file is the first argument; strip off the colon that
|
# the output file is the first argument; strip off the colon that
|
||||||
# comes from ninja syntax
|
# comes from ninja syntax
|
||||||
output_text, input_text = line.split(':')
|
output_text, input_text = line.split(':')
|
||||||
outfiles = [last_component(part) for part in output_text.split(' ')[1:]]
|
outfiles = [simplified_filename(part) for part in output_text.split(' ')[1:]]
|
||||||
inputs = input_text.strip().split(' ')
|
inputs = input_text.strip().split(' ')
|
||||||
infiles = [last_component(part) for part in inputs[1:]]
|
infiles = [simplified_filename(part) for part in inputs[1:]]
|
||||||
operation = inputs[0]
|
operation = inputs[0]
|
||||||
for infile in infiles:
|
for infile in infiles:
|
||||||
if infile == '|':
|
if infile == '|':
|
||||||
# external dependencies start here; let's not graph those
|
# external dependencies start here; let's not graph those
|
||||||
break
|
break
|
||||||
for outfile in outfiles:
|
for outfile in outfiles:
|
||||||
print('"%s" -> "%s" [label="%s"]' % (infile, outfile, operation))
|
edge = '"%s" -> "%s" [label="%s"]' % (infile, outfile, operation)
|
||||||
|
if edge not in seen_edges:
|
||||||
|
seen_edges.add(edge)
|
||||||
|
print(edge)
|
||||||
print("}")
|
print("}")
|
||||||
|
|
||||||
|
|
||||||
|
BIN
wordfreq_builder/build.png
Normal file
BIN
wordfreq_builder/build.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.9 MiB |
@ -1 +0,0 @@
|
|||||||
ef54b21e931c530f5b75c1cd87c5841cc4691e43
|
|
@ -56,6 +56,11 @@ rule convert_leeds
|
|||||||
rule convert_opensubtitles
|
rule convert_opensubtitles
|
||||||
command = tr ' ' ',' < $in > $out
|
command = tr ' ' ',' < $in > $out
|
||||||
|
|
||||||
|
# To convert SUBTLEX, we take the 1st and Nth columns, strip the header, convert
|
||||||
|
# tabs to commas and commas to spaces, and remove obvious mojibake.
|
||||||
|
rule convert_subtlex
|
||||||
|
command = cut -f 1,$col $in | tail -n +2 | tr ' ,' ', ' | grep -v 'â,' > $out
|
||||||
|
|
||||||
# Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
|
# Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
|
||||||
# the input files, keep only the single words and their counts, and only keep
|
# the input files, keep only the single words and their counts, and only keep
|
||||||
# lines with counts of 100 or more.
|
# lines with counts of 100 or more.
|
||||||
@ -71,7 +76,10 @@ rule count
|
|||||||
command = python -m wordfreq_builder.cli.count_tokens $in $out
|
command = python -m wordfreq_builder.cli.count_tokens $in $out
|
||||||
|
|
||||||
rule merge
|
rule merge
|
||||||
command = python -m wordfreq_builder.cli.combine_lists -o $out $in
|
command = python -m wordfreq_builder.cli.combine_lists -o $out -c $cutoff $in
|
||||||
|
|
||||||
|
rule merge_counts
|
||||||
|
command = python -m wordfreq_builder.cli.merge_counts -o $out $in
|
||||||
|
|
||||||
rule freqs2cB
|
rule freqs2cB
|
||||||
command = python -m wordfreq_builder.cli.freqs_to_cB $lang $in $out
|
command = python -m wordfreq_builder.cli.freqs_to_cB $lang $in $out
|
||||||
|
@ -1,12 +1,12 @@
|
|||||||
from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist
|
from wordfreq_builder.word_counts import read_freqs, merge_counts, write_wordlist
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
def merge_lists(input_names, output_name):
|
def merge_lists(input_names, output_name):
|
||||||
freq_dicts = []
|
count_dicts = []
|
||||||
for input_name in input_names:
|
for input_name in input_names:
|
||||||
freq_dicts.append(read_freqs(input_name, cutoff=2))
|
count_dicts.append(read_freqs(input_name, cutoff=0))
|
||||||
merged = merge_freqs(freq_dicts)
|
merged = merge_counts(count_dicts)
|
||||||
write_wordlist(merged, output_name)
|
write_wordlist(merged, output_name)
|
||||||
|
|
||||||
|
|
20
wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
Normal file
20
wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
|
def merge_lists(input_names, output_name, cutoff):
    """
    Merge several frequency lists into one wordlist file.

    Each file named in `input_names` is loaded with `read_freqs`, with
    `cutoff` passed through unchanged for every file. The per-file results
    are combined with `merge_freqs` and written to `output_name` with
    `write_wordlist`.
    """
    per_file_freqs = [
        read_freqs(name, cutoff=cutoff) for name in input_names
    ]
    combined = merge_freqs(per_file_freqs)
    write_wordlist(combined, output_name)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Command-line entry point: collect the output filename, the cutoff and
    # one or more input filenames, then hand them to merge_lists.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '-o', '--output',
        help='filename to write the output to',
        default='combined-freqs.csv'
    )
    arg_parser.add_argument(
        '-c', '--cutoff',
        type=int,
        help='stop after seeing a count below this',
        default=2
    )
    arg_parser.add_argument(
        'inputs',
        help='names of input files to merge',
        nargs='+'
    )
    options = arg_parser.parse_args()
    merge_lists(options.inputs, options.output, options.cutoff)
|
||||||
|
|
@ -11,12 +11,12 @@ CONFIG = {
|
|||||||
'twitter': [
|
'twitter': [
|
||||||
'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
|
'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
|
||||||
'pt', 'ru',
|
'pt', 'ru',
|
||||||
# can be added later: 'th', 'tr'
|
# can be added later: 'el', 'tr'
|
||||||
],
|
],
|
||||||
'wikipedia': [
|
'wikipedia': [
|
||||||
'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
|
'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
|
||||||
'pt', 'ru'
|
'pt', 'ru'
|
||||||
# many more can be added
|
# consider adding 'el' and 'tr'
|
||||||
],
|
],
|
||||||
'opensubtitles': [
|
'opensubtitles': [
|
||||||
# All languages where the most common word in OpenSubtitles
|
# All languages where the most common word in OpenSubtitles
|
||||||
@ -33,14 +33,19 @@ CONFIG = {
|
|||||||
'en',
|
'en',
|
||||||
# Using the 2012 data, we could get French, German, Italian,
|
# Using the 2012 data, we could get French, German, Italian,
|
||||||
# Russian, Spanish, and (Simplified) Chinese.
|
# Russian, Spanish, and (Simplified) Chinese.
|
||||||
]
|
],
|
||||||
|
'subtlex-en': ['en'],
|
||||||
|
'subtlex-zh': ['zh'],
|
||||||
},
|
},
|
||||||
|
# Subtlex languages that need to be pre-processed
|
||||||
'wordlist_paths': {
|
'wordlist_paths': {
|
||||||
'twitter': 'generated/twitter/tweets-2014.{lang}.{ext}',
|
'twitter': 'generated/twitter/tweets-2014.{lang}.{ext}',
|
||||||
'wikipedia': 'generated/wikipedia/wikipedia_{lang}.{ext}',
|
'wikipedia': 'generated/wikipedia/wikipedia_{lang}.{ext}',
|
||||||
'opensubtitles': 'generated/opensubtitles/opensubtitles_{lang}.{ext}',
|
'opensubtitles': 'generated/opensubtitles/opensubtitles_{lang}.{ext}',
|
||||||
'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}',
|
'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}',
|
||||||
'google-books': 'generated/google-books/google_books_{lang}.{ext}',
|
'google-books': 'generated/google-books/google_books_{lang}.{ext}',
|
||||||
|
'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
|
||||||
|
'subtlex-zh': 'generated/subtlex/subtlex_{lang}.{ext}',
|
||||||
'combined': 'generated/combined/combined_{lang}.{ext}',
|
'combined': 'generated/combined/combined_{lang}.{ext}',
|
||||||
'combined-dist': 'dist/combined_{lang}.{ext}',
|
'combined-dist': 'dist/combined_{lang}.{ext}',
|
||||||
'twitter-dist': 'dist/twitter_{lang}.{ext}'
|
'twitter-dist': 'dist/twitter_{lang}.{ext}'
|
||||||
|
@ -5,7 +5,8 @@ import sys
|
|||||||
import pathlib
|
import pathlib
|
||||||
|
|
||||||
HEADER = """# This file is automatically generated. Do not edit it.
|
HEADER = """# This file is automatically generated. Do not edit it.
|
||||||
# You can regenerate it using the 'wordfreq-build-deps' command.
|
# You can change its behavior by editing wordfreq_builder/ninja.py,
|
||||||
|
# and regenerate it by running 'make'.
|
||||||
"""
|
"""
|
||||||
TMPDIR = data_filename('tmp')
|
TMPDIR = data_filename('tmp')
|
||||||
|
|
||||||
@ -76,6 +77,18 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
|
|||||||
CONFIG['sources']['opensubtitles']
|
CONFIG['sources']['opensubtitles']
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
lines.extend(
|
||||||
|
subtlex_en_deps(
|
||||||
|
data_filename('source-lists/subtlex'),
|
||||||
|
CONFIG['sources']['subtlex-en']
|
||||||
|
)
|
||||||
|
)
|
||||||
|
lines.extend(
|
||||||
|
subtlex_zh_deps(
|
||||||
|
data_filename('source-lists/subtlex'),
|
||||||
|
CONFIG['sources']['subtlex-zh']
|
||||||
|
)
|
||||||
|
)
|
||||||
lines.extend(combine_lists(all_languages()))
|
lines.extend(combine_lists(all_languages()))
|
||||||
|
|
||||||
print('\n'.join(lines), file=out)
|
print('\n'.join(lines), file=out)
|
||||||
@ -188,12 +201,53 @@ def opensubtitles_deps(dirname_in, languages):
|
|||||||
prefix=dirname_in, lang=language
|
prefix=dirname_in, lang=language
|
||||||
)
|
)
|
||||||
reformatted_file = wordlist_filename(
|
reformatted_file = wordlist_filename(
|
||||||
'opensubtitles', language, 'counts.txt')
|
'opensubtitles', language, 'counts.txt'
|
||||||
|
)
|
||||||
add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file)
|
add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file)
|
||||||
|
|
||||||
return lines
|
return lines
|
||||||
|
|
||||||
|
|
||||||
|
def subtlex_en_deps(dirname_in, languages):
    """
    Describe the build steps for the English SUBTLEX data.

    For each of the regions 'en-US' and 'en-GB', a 'convert_subtlex' step is
    registered (with its `col` parameter set to 2) that turns
    `<dirname_in>/subtlex.<region>.txt` into a per-region processed file.
    A final 'merge_counts' step then takes both processed files as input and
    produces the single English counts file.

    `languages` must be exactly ['en']; anything else fails the assertion.
    Returns the `lines` list that was handed to every `add_dep` call.
    """
    assert languages == ['en']

    lines = []
    converted_files = []
    for region in ['en-US', 'en-GB']:
        source_file = '{prefix}/subtlex.{region}.txt'.format(
            prefix=dirname_in, region=region
        )
        # The region code stands in for the language when naming the
        # intermediate file.
        converted_file = wordlist_filename(
            'subtlex-en', region, 'processed.txt'
        )
        converted_files.append(converted_file)
        add_dep(
            lines, 'convert_subtlex', source_file, converted_file,
            params={'col': 2}
        )

    merged_file = wordlist_filename('subtlex-en', 'en', 'counts.txt')
    add_dep(lines, 'merge_counts', converted_files, merged_file)
    return lines
|
||||||
|
|
||||||
|
|
||||||
|
def subtlex_zh_deps(dirname_in, languages):
    """
    Describe the build steps for the Chinese SUBTLEX data.

    For every language in `languages`, two steps are registered through
    `add_dep`: a 'convert_subtlex' step (with its `col` parameter set to 5)
    that turns `<dirname_in>/subtlex.<lang>.txt` into a processed file, and a
    'merge_counts' step that turns that processed file into the counts file.

    Returns the `lines` list that was handed to every `add_dep` call.
    """
    lines = []
    for language in languages:
        source_file = '{prefix}/subtlex.{lang}.txt'.format(
            prefix=dirname_in, lang=language
        )
        converted_file = wordlist_filename(
            'subtlex-zh', language, 'processed.txt'
        )
        counts_file = wordlist_filename('subtlex-zh', language, 'counts.txt')

        add_dep(
            lines, 'convert_subtlex', source_file, converted_file,
            params={'col': 5}
        )
        # Only one input here, but the same rule as the English data is used
        # to go from the processed file to the counts file.
        add_dep(lines, 'merge_counts', converted_file, counts_file)
    return lines
|
||||||
|
|
||||||
|
|
||||||
def combine_lists(languages):
|
def combine_lists(languages):
|
||||||
lines = []
|
lines = []
|
||||||
for language in languages:
|
for language in languages:
|
||||||
@ -204,7 +258,8 @@ def combine_lists(languages):
|
|||||||
]
|
]
|
||||||
output_file = wordlist_filename('combined', language)
|
output_file = wordlist_filename('combined', language)
|
||||||
add_dep(lines, 'merge', input_files, output_file,
|
add_dep(lines, 'merge', input_files, output_file,
|
||||||
extra='wordfreq_builder/word_counts.py')
|
extra='wordfreq_builder/word_counts.py',
|
||||||
|
params={'cutoff': 2})
|
||||||
|
|
||||||
output_cBpack = wordlist_filename(
|
output_cBpack = wordlist_filename(
|
||||||
'combined-dist', language, 'msgpack.gz')
|
'combined-dist', language, 'msgpack.gz')
|
||||||
|
@ -49,13 +49,14 @@ def read_freqs(filename, cutoff=0, lang=None):
|
|||||||
with open(filename, encoding='utf-8', newline='') as infile:
|
with open(filename, encoding='utf-8', newline='') as infile:
|
||||||
for key, strval in csv.reader(infile):
|
for key, strval in csv.reader(infile):
|
||||||
val = float(strval)
|
val = float(strval)
|
||||||
|
key = fix_text(key)
|
||||||
if val < cutoff:
|
if val < cutoff:
|
||||||
break
|
break
|
||||||
tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
|
tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
|
||||||
for token in tokens:
|
for token in tokens:
|
||||||
# Use += so that, if we give the reader concatenated files with
|
# Use += so that, if we give the reader concatenated files with
|
||||||
# duplicates, it does the right thing
|
# duplicates, it does the right thing
|
||||||
raw_counts[fix_text(token)] += val
|
raw_counts[token] += val
|
||||||
total += val
|
total += val
|
||||||
|
|
||||||
for word in raw_counts:
|
for word in raw_counts:
|
||||||
@ -96,6 +97,17 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
|
|||||||
msgpack.dump(cBpack_data, outfile)
|
msgpack.dump(cBpack_data, outfile)
|
||||||
|
|
||||||
|
|
||||||
|
def merge_counts(count_dicts):
    """
    Combine several count dictionaries into one by summing, for every term,
    the counts it has across all of the given dictionaries.

    Returns a `defaultdict(int)` mapping each term to its total.
    """
    totals = defaultdict(int)
    for counts in count_dicts:
        for word, occurrences in counts.items():
            totals[word] = totals[word] + occurrences
    return totals
|
||||||
|
|
||||||
|
|
||||||
def merge_freqs(freq_dicts):
|
def merge_freqs(freq_dicts):
|
||||||
"""
|
"""
|
||||||
Merge multiple dictionaries of frequencies, representing each word with
|
Merge multiple dictionaries of frequencies, representing each word with
|
||||||
|
Loading…
Reference in New Issue
Block a user