Mirror of https://github.com/rspeer/wordfreq.git (synced 2024-12-24 09:51:38 +00:00)
add rules to count wikipedia tokens
parent c55e44e486
commit 7c09fec692
@@ -35,7 +35,11 @@ rule tokenize_twitter
 # This rule uses command-line tools to take in a file with one token per line,
 # and output a comma-separated file with the token counts:
 #
-# * 'sort $in | uniq -c' does the actual counting.
+# * 'LANG=C' disables fancy Unicode sorting and instead just sorts by byte
+#   order, which is fine because we only need this order so we can run
+#   'uniq'.
+# * 'sort $in -T $tmp | uniq -c' does the actual counting. The possibly
+#   large amount of temporary output goes in $tmp.
 # * 'sort -nrk 1' sorts the result in reverse numeric order by the first field
 #   (the count).
 # * The 'sed' command rearranges the lines to be comma-separated values with
@@ -43,7 +47,7 @@ rule tokenize_twitter
 #   number at the start of the line.
 #
 rule count
-  command = sort $in | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)/\2,\1/' > $out
+  command = mkdir -p $tmp && mkdir -p $$(dirname $out) && LANG=C sort $in -T $tmp | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)/\2,\1/' > $out
 
 rule cat
   command = cat $in > $out
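For reference, here is a minimal Python sketch of what the 'count' rule's pipeline produces: one 'token,count' line per distinct token, ordered by descending count. This is only an illustration of the output format, not part of the build, and the file names are hypothetical.

from collections import Counter

# Hypothetical input/output paths, for illustration only.
# Counting distinct lines mirrors 'sort $in | uniq -c'.
with open('tokens.txt', encoding='utf-8') as infile:
    counts = Counter(line.rstrip('\n') for line in infile if line.strip())

# most_common() yields descending counts, mirroring 'sort -nrk 1';
# the 'token,count' layout matches the sed rearrangement '\2,\1'.
with open('counts.csv', 'w', encoding='utf-8') as outfile:
    for token, count in counts.most_common():
        print('{},{}'.format(token, count), file=outfile)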

@@ -5,6 +5,7 @@ import pathlib
 HEADER = """# This file is automatically generated. Do not edit it.
 # You can regenerate it using the 'wordfreq-build-deps' command.
 """
+TMPDIR = data_filename('tmp')
 
 
 def make_ninja_deps(rules_filename, out=sys.stdout):
@@ -17,13 +18,13 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
         print(rulesfile.read(), file=out)
 
     lines = (
-        language_detect_and_tokenize_deps(
+        twitter_deps(
             data_filename('raw-input/twitter/all-2014.txt'),
             slice_prefix=data_filename('slices/twitter/tweets-2014'),
-            combined_prefix=data_filename('generated/twitter/tweets-2014'),
+            combined_prefix=data_filename('intermediate/twitter/tweets-2014'),
             slices=40
         ) +
-        wiki_parse_deps(
+        wikipedia_deps(
             data_filename('raw-input/wikipedia'),
             data_filename('generated/wikipedia'),
             CONFIG['wp_languages']
@@ -32,7 +33,7 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
     print('\n'.join(lines), file=out)
 
 
-def wiki_parse_deps(dirname_in, dirname_out, languages):
+def wikipedia_deps(dirname_in, dirname_out, languages):
     lines = []
     path_in = pathlib.Path(dirname_in)
     path_out = pathlib.Path(dirname_out)
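As a rough sketch of how make_ninja_deps assembles the output file: the hand-written rules file is copied through first, then the 'build' statements returned by the *_deps functions are joined with newlines. The toy function and paths below are hypothetical; only the overall shape is taken from the code above.

import sys

def toy_deps():
    # Hypothetical stand-in for twitter_deps / wikipedia_deps: each returns
    # a list of ninja 'build' lines, plus indented per-build variables.
    return [
        "build counts/example.counts.txt: count tokens/example.tokens.txt",
        " tmp = data/tmp",
    ]

def toy_make_ninja_deps(rules_filename, out=sys.stdout):
    # Copy the static rules through, then append the generated build statements.
    with open(rules_filename, encoding='utf-8') as rulesfile:
        print(rulesfile.read(), file=out)
    print('\n'.join(toy_deps()), file=out)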
@@ -51,11 +52,19 @@ def wiki_parse_deps(dirname_in, dirname_out, languages):
             outs=output_file, ins=input_file
         )
         lines.append(build_rule)
+
+        token_file = output_file
+        output_file = path_out / 'wikipedia_{}.counts.txt'.format(language)
+        build_rule = "build {outs}: count {ins}".format(
+            outs=output_file, ins=token_file
+        )
+        lines.append(build_rule)
+        lines.append(" tmp = {}".format(TMPDIR))
     return lines
 
 
-def language_detect_and_tokenize_deps(input_filename, slice_prefix,
-                                      combined_prefix, slices):
+def twitter_deps(input_filename, slice_prefix,
+                 combined_prefix, slices):
     lines = []
     # split the input into slices
     slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num)
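To make the new wikipedia_deps additions concrete, here is a sketch of the two lines it appends per language, using the format strings from the hunk above. The token-file name, output directory, and TMPDIR value are hypothetical examples; the real values come from the parse step outside this hunk and from data_filename().

import pathlib

# Hypothetical example values, for illustration only.
language = 'en'
path_out = pathlib.Path('data/generated/wikipedia')
token_file = path_out / 'wikipedia_en.txt'   # assumed name of the parse-step output
TMPDIR = 'data/tmp'                          # assumed value of data_filename('tmp')

output_file = path_out / 'wikipedia_{}.counts.txt'.format(language)
build_rule = "build {outs}: count {ins}".format(outs=output_file, ins=token_file)

print(build_rule)
# -> build data/generated/wikipedia/wikipedia_en.counts.txt: count data/generated/wikipedia/wikipedia_en.txt
print(" tmp = {}".format(TMPDIR))
# -> ' tmp = data/tmp', the per-build variable the 'count' rule reads as $tmp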