add rules to count wikipedia tokens
commit 7c09fec692 (parent c55e44e486)
@@ -35,7 +35,11 @@ rule tokenize_twitter
 # This rule uses command-line tools to take in a file with one token per line,
 # and output a comma-separated file with the token counts:
 #
-# * 'sort $in | uniq -c' does the actual counting.
+# * 'LANG=C' disables fancy Unicode sorting and instead just sorts by byte
+#   order, which is fine because we only need this order so we can run
+#   'uniq'.
+# * 'sort $in -T $tmp | uniq -c' does the actual counting. The possibly
+#   large amount of temporary output goes in $tmp.
 # * 'sort -nrk 1' sorts the result in reverse numeric order by the first field
 #   (the count).
 # * The 'sed' command rearranges the lines to be comma-separated values with
@@ -43,7 +47,7 @@ rule tokenize_twitter
 #   number at the start of the line.
 #
 rule count
-  command = sort $in | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)/\2,\1/' > $out
+  command = mkdir -p $tmp && mkdir -p $$(dirname $out) && LANG=C sort $in -T $tmp | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)/\2,\1/' > $out

 rule cat
   command = cat $in > $out
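For a sense of what the updated 'count' rule produces, here is a rough Python stand-in for the sort/uniq/sed pipeline. It is a sketch only: the real build step runs the shell command above, and the function name and file paths below are invented for illustration.

    from collections import Counter

    def count_tokens(token_path, counts_path):
        """Rough Python stand-in for the 'count' rule: read one token per line,
        write 'token,count' rows in descending order of count."""
        with open(token_path, encoding='utf-8') as infile:
            counts = Counter(line.strip() for line in infile if line.strip())
        with open(counts_path, 'w', encoding='utf-8') as outfile:
            for token, freq in counts.most_common():
                print('{},{}'.format(token, freq), file=outfile)

    # Hypothetical filenames, for illustration only:
    # count_tokens('wikipedia_en.tokens.txt', 'wikipedia_en.counts.txt')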
@@ -5,6 +5,7 @@ import pathlib
 HEADER = """# This file is automatically generated. Do not edit it.
 # You can regenerate it using the 'wordfreq-build-deps' command.
 """
+TMPDIR = data_filename('tmp')


 def make_ninja_deps(rules_filename, out=sys.stdout):
@@ -17,13 +18,13 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
     print(rulesfile.read(), file=out)

     lines = (
-        language_detect_and_tokenize_deps(
+        twitter_deps(
             data_filename('raw-input/twitter/all-2014.txt'),
             slice_prefix=data_filename('slices/twitter/tweets-2014'),
-            combined_prefix=data_filename('generated/twitter/tweets-2014'),
+            combined_prefix=data_filename('intermediate/twitter/tweets-2014'),
             slices=40
         ) +
-        wiki_parse_deps(
+        wikipedia_deps(
             data_filename('raw-input/wikipedia'),
             data_filename('generated/wikipedia'),
             CONFIG['wp_languages']
@@ -32,7 +33,7 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
     print('\n'.join(lines), file=out)


-def wiki_parse_deps(dirname_in, dirname_out, languages):
+def wikipedia_deps(dirname_in, dirname_out, languages):
     lines = []
     path_in = pathlib.Path(dirname_in)
     path_out = pathlib.Path(dirname_out)
@@ -51,11 +52,19 @@ def wiki_parse_deps(dirname_in, dirname_out, languages):
             outs=output_file, ins=input_file
         )
         lines.append(build_rule)
+
+        token_file = output_file
+        output_file = path_out / 'wikipedia_{}.counts.txt'.format(language)
+        build_rule = "build {outs}: count {ins}".format(
+            outs=output_file, ins=token_file
+        )
+        lines.append(build_rule)
+        lines.append(" tmp = {}".format(TMPDIR))
     return lines


-def language_detect_and_tokenize_deps(input_filename, slice_prefix,
+def twitter_deps(input_filename, slice_prefix,
                  combined_prefix, slices):
     lines = []
     # split the input into slices
     slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num)
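To make the new wikipedia_deps() additions concrete, the sketch below renders the ninja text they emit for one language. The TMPDIR value, output directory, and tokens filename are invented placeholders; only the shape of the build statement and its 'tmp' binding comes from the format strings in the diff above.

    # Sketch: render one 'count' build statement the way the new
    # wikipedia_deps() lines do, for a hypothetical language and paths.
    import pathlib

    TMPDIR = '/data/tmp'                                  # stands in for data_filename('tmp')
    path_out = pathlib.Path('/data/generated/wikipedia')  # hypothetical output directory
    language = 'en'
    token_file = path_out / 'wikipedia_en.tokens.txt'     # hypothetical tokens file

    output_file = path_out / 'wikipedia_{}.counts.txt'.format(language)
    build_rule = "build {outs}: count {ins}".format(outs=output_file, ins=token_file)
    print(build_rule)
    print(" tmp = {}".format(TMPDIR))
    # Prints something like:
    #   build /data/generated/wikipedia/wikipedia_en.counts.txt: count /data/generated/wikipedia/wikipedia_en.tokens.txt
    #    tmp = /data/tmp

When ninja runs such a build statement, the per-build 'tmp' binding becomes $tmp inside the 'count' rule, which is where the LANG=C sort writes its temporary files.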