Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-24 09:51:38 +00:00)
add rules to count wikipedia tokens
parent c55e44e486
commit 7c09fec692
@@ -35,7 +35,11 @@ rule tokenize_twitter
 # This rule uses command-line tools to take in a file with one token per line,
 # and output a comma-separated file with the token counts:
 #
-# * 'sort $in | uniq -c' does the actual counting.
+# * 'LANG=C' disables fancy Unicode sorting and instead just sorts by byte
+#   order, which is fine because we only need this order so we can run
+#   'uniq'.
+# * 'sort $in -T $tmp | uniq -c' does the actual counting. The possibly
+#   large amount of temporary output goes in $tmp.
 # * 'sort -nrk 1' sorts the result in reverse numeric order by the first field
 #   (the count).
 # * The 'sed' command rearranges the lines to be comma-separated values with
@@ -43,7 +47,7 @@ rule tokenize_twitter
 #   number at the start of the line.
 #
 rule count
-  command = sort $in | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)/\2,\1/' > $out
+  command = mkdir -p $tmp && mkdir -p $$(dirname $out) && LANG=C sort $in -T $tmp | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)/\2,\1/' > $out
 
 rule cat
   command = cat $in > $out
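For reference, here is a minimal Python sketch of what the 'count' rule's pipeline produces: one 'token,count' line per distinct token, ordered by descending count. This is only an illustration of the output format, not part of the build, and the file names are hypothetical.

from collections import Counter

# Hypothetical input/output paths, for illustration only.
# Counting distinct lines mirrors 'sort $in | uniq -c'.
with open('tokens.txt', encoding='utf-8') as infile:
    counts = Counter(line.rstrip('\n') for line in infile if line.strip())

# most_common() yields descending counts, mirroring 'sort -nrk 1';
# the 'token,count' layout matches the sed rearrangement '\2,\1'.
with open('counts.csv', 'w', encoding='utf-8') as outfile:
    for token, count in counts.most_common():
        print('{},{}'.format(token, count), file=outfile)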

@@ -5,6 +5,7 @@ import pathlib
 HEADER = """# This file is automatically generated. Do not edit it.
 # You can regenerate it using the 'wordfreq-build-deps' command.
 """
+TMPDIR = data_filename('tmp')
 
 
 def make_ninja_deps(rules_filename, out=sys.stdout):
@@ -17,13 +18,13 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
         print(rulesfile.read(), file=out)
 
     lines = (
-        language_detect_and_tokenize_deps(
+        twitter_deps(
             data_filename('raw-input/twitter/all-2014.txt'),
             slice_prefix=data_filename('slices/twitter/tweets-2014'),
-            combined_prefix=data_filename('generated/twitter/tweets-2014'),
+            combined_prefix=data_filename('intermediate/twitter/tweets-2014'),
             slices=40
         ) +
-        wiki_parse_deps(
+        wikipedia_deps(
             data_filename('raw-input/wikipedia'),
             data_filename('generated/wikipedia'),
             CONFIG['wp_languages']
@@ -32,7 +33,7 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
     print('\n'.join(lines), file=out)
 
 
-def wiki_parse_deps(dirname_in, dirname_out, languages):
+def wikipedia_deps(dirname_in, dirname_out, languages):
     lines = []
     path_in = pathlib.Path(dirname_in)
     path_out = pathlib.Path(dirname_out)
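As a rough sketch of how make_ninja_deps assembles the output file: the hand-written rules file is copied through first, then the 'build' statements returned by the *_deps functions are joined with newlines. The toy function and paths below are hypothetical; only the overall shape is taken from the code above.

import sys

def toy_deps():
    # Hypothetical stand-in for twitter_deps / wikipedia_deps: each returns
    # a list of ninja 'build' lines, plus indented per-build variables.
    return [
        "build counts/example.counts.txt: count tokens/example.tokens.txt",
        " tmp = data/tmp",
    ]

def toy_make_ninja_deps(rules_filename, out=sys.stdout):
    # Copy the static rules through, then append the generated build statements.
    with open(rules_filename, encoding='utf-8') as rulesfile:
        print(rulesfile.read(), file=out)
    print('\n'.join(toy_deps()), file=out)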
@@ -51,11 +52,19 @@ def wiki_parse_deps(dirname_in, dirname_out, languages):
             outs=output_file, ins=input_file
         )
         lines.append(build_rule)
+
+        token_file = output_file
+        output_file = path_out / 'wikipedia_{}.counts.txt'.format(language)
+        build_rule = "build {outs}: count {ins}".format(
+            outs=output_file, ins=token_file
+        )
+        lines.append(build_rule)
+        lines.append(" tmp = {}".format(TMPDIR))
     return lines
 
 
-def language_detect_and_tokenize_deps(input_filename, slice_prefix,
-                                      combined_prefix, slices):
+def twitter_deps(input_filename, slice_prefix,
+                 combined_prefix, slices):
     lines = []
     # split the input into slices
     slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num)
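To make the new wikipedia_deps additions concrete, here is a sketch of the two lines it appends per language, using the format strings from the hunk above. The token-file name, output directory, and TMPDIR value are hypothetical examples; the real values come from the parse step outside this hunk and from data_filename().

import pathlib

# Hypothetical example values, for illustration only.
language = 'en'
path_out = pathlib.Path('data/generated/wikipedia')
token_file = path_out / 'wikipedia_en.txt'   # assumed name of the parse-step output
TMPDIR = 'data/tmp'                          # assumed value of data_filename('tmp')

output_file = path_out / 'wikipedia_{}.counts.txt'.format(language)
build_rule = "build {outs}: count {ins}".format(outs=output_file, ins=token_file)

print(build_rule)
# -> build data/generated/wikipedia/wikipedia_en.counts.txt: count data/generated/wikipedia/wikipedia_en.txt
print(" tmp = {}".format(TMPDIR))
# -> ' tmp = data/tmp', the per-build variable the 'count' rule reads as $tmp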