add and adjust some build steps

- more build steps for Wikipedia
- rename 'tokenize_twitter' to 'pretokenize_twitter' to indicate that
  the results are preliminary
Robyn Speer 2015-05-05 13:59:21 -04:00
parent 33c5f78c07
commit 59409266ca
6 changed files with 101 additions and 14 deletions

View File

@@ -20,15 +20,30 @@ DATA = ./data
rule split
  command = mkdir -p $$(dirname $prefix) && split -d -n r/$slices $in $prefix
# wiki2text is a tool I wrote using the development version of Nim, which
# extracts plain text from Wikipedia dumps obtained from dumps.wikimedia.org.
# The code is at https://github.com/rspeer/wiki2text, but right now it'll
# take a bit of setup to get it to run.
# wiki2text is a tool I wrote using Nim 0.11, which extracts plain text from
# Wikipedia dumps obtained from dumps.wikimedia.org. The code is at
# https://github.com/rspeer/wiki2text.
rule wiki2text
  command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out
rule wiki2tokens
  command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text -t > $out
rule tokenize_twitter
  command = mkdir -p $$(dirname $prefix) && wordfreq-tokenize-twitter $in $prefix
# This rule uses command-line tools to take in a file with one token per line,
# and output a comma-separated file with the token counts:
#
# * 'sort $in | uniq -c' does the actual counting.
# * 'sort -nrk 1' sorts the result in reverse numeric order by the first field
# (the count).
# * The 'sed' command rearranges the lines to be comma-separated values with
# the count coming second, instead of the count being a right-justified
# number at the start of the line.
#
rule count
  command = sort $in | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)$/\2,\1/' > $out
rule cat
  command = cat $in > $out
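
For illustration, here is a rough Python equivalent of what the 'count' rule computes. This is only a sketch with placeholder file names; the build itself uses the shell pipeline above.

from collections import Counter

def count_tokens(in_filename, out_filename):
    # Count one token per input line, then write 'token,count' rows,
    # most frequent first, matching the output format of the 'count' rule.
    with open(in_filename, encoding='utf-8') as in_file:
        counts = Counter(line.strip() for line in in_file)
    with open(out_filename, 'w', encoding='utf-8') as out_file:
        for token, count in counts.most_common():
            print('%s,%s' % (token, count), file=out_file)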

View File

@@ -11,7 +11,8 @@ setup(
    packages=['wordfreq_builder'],
    entry_points={
        'console_scripts': [
            'wordfreq-tokenize-twitter = wordfreq_builder.cli.tokenize_twitter:main',
            'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main',
            'wordfreq-tokenize-wikipedia = wordfreq_builder.cli.tokenize_wikipedia:main',
            'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main'
        ]
    }

View File

@@ -1,10 +1,10 @@
from wordfreq_builder.tokenizers import rosette_surface_tokenizer, tokenize_file
from wordfreq_builder.tokenizers import rosette_surface_tokenizer, pretokenize_file
import argparse
def tokenize_twitter(in_filename, out_prefix):
    tokenize_file(in_filename, out_prefix,
                  tokenizer=rosette_surface_tokenizer)
def pretokenize_twitter(in_filename, out_prefix):
    pretokenize_file(in_filename, out_prefix,
                     tokenizer=rosette_surface_tokenizer)
def main():
@@ -12,7 +12,7 @@ def main():
    parser.add_argument('filename', help='filename of input file containing one tweet per line')
    parser.add_argument('outprefix', help='prefix of output filenames')
    args = parser.parse_args()
    tokenize_twitter(args.filename, args.outprefix)
    pretokenize_twitter(args.filename, args.outprefix)
if __name__ == '__main__':

View File

@@ -0,0 +1,30 @@
from wordfreq_builder.tokenizers import rosette_surface_tokenizer, monolingual_tokenize_file
import argparse


def tokenize_wikipedia(in_filename, out_filename, language, proportion):
    monolingual_tokenize_file(
        in_filename, out_filename,
        language=language,
        tokenizer=rosette_surface_tokenizer,
        line_reader=strip_headings,
        sample_proportion=proportion
    )


def strip_headings(text):
    return text.strip().strip('=')


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('in_filename', help='filename of input file')
    parser.add_argument('out_filename', help='filename of output file')
    parser.add_argument('language', help='the language code of the text')
    parser.add_argument('-p', '--proportion', help='process 1/n of the lines (default 100)', type=int, default=100)
    args = parser.parse_args()
    tokenize_wikipedia(args.in_filename, args.out_filename, args.language, args.proportion)


if __name__ == '__main__':
    main()
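
With the 'wordfreq-tokenize-wikipedia' entry point registered in setup.py, this script can be run on a plain-text Wikipedia extract. A sketch of the equivalent call from Python, with placeholder file names:

from wordfreq_builder.cli.tokenize_wikipedia import tokenize_wikipedia

# Same as: wordfreq-tokenize-wikipedia wikipedia_en.txt wikipedia_en.tokens.txt en -p 100
# (the file names here are placeholders, not paths from the repository)
tokenize_wikipedia('wikipedia_en.txt', 'wikipedia_en.tokens.txt',
                   language='en', proportion=100)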

View File

@@ -46,6 +46,11 @@ def wiki_parse_deps(dirname_in, dirname_out, languages):
            outs=output_file, ins=input_file
        )
        lines.append(build_rule)

        output_file = path_out / 'wikipedia_{}.tokens.txt'.format(language)
        build_rule = "build {outs}: wiki2tokens {ins}".format(
            outs=output_file, ins=input_file
        )
        lines.append(build_rule)

    return lines
@@ -69,7 +74,7 @@ def language_detect_and_tokenize_deps(input_filename, slice_prefix,
            '{prefix}.{lang}.txt'.format(prefix=slice_file, lang=language)
            for language in CONFIG['languages']
        ]
        build_rule = "build {outs}: tokenize_twitter {ins} | wordfreq_builder/tokenizers.py".format(
        build_rule = "build {outs}: tokenize_twitter {ins}".format(
            outs=' '.join(language_outputs), ins=slice_file
        )
        lines.append(build_rule)
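
As an illustration of what the new wiki2tokens rule generates, code like the above produces one ninja build statement per language. The paths below are made up for the example:

output_file = 'data/wikipedia/wikipedia_en.tokens.txt'   # placeholder path
input_file = 'data/raw/enwiki-pages-articles.xml.bz2'    # placeholder path
build_rule = "build {outs}: wiki2tokens {ins}".format(
    outs=output_file, ins=input_file
)
# build_rule is now:
# 'build data/wikipedia/wikipedia_en.tokens.txt: wiki2tokens data/raw/enwiki-pages-articles.xml.bz2'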

View File

@@ -1,5 +1,6 @@
from lumi_science.text_readers.rosette_readers import RosetteReader
import re
import unicodedata
ROSETTE = RosetteReader()
@@ -15,6 +16,9 @@ ROSETTE_LANG_MAP = {
}
NON_PUNCT_RE = re.compile('[0-9A-Za-z\xc0-\u1fff\u2070-\u2fff\u301f-\ufeff---\uff66-\U0002ffff]')
def last_tab(line):
    """
    Read lines by keeping only the last tab-separated value.
@@ -22,11 +26,26 @@ def last_tab(line):
    return line.split('\t')[-1].strip()
def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
def non_punct_filter(token):
    if NON_PUNCT_RE.search(token):
        return token.lower()
    else:
        return None
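# Annotation (not part of the commit): non_punct_filter keeps a token only if it
# contains at least one non-punctuation character, and lowercases it. For example:
#   non_punct_filter('Hello') -> 'hello'   (letters match NON_PUNCT_RE)
#   non_punct_filter('2015')  -> '2015'    (digits count as non-punctuation)
#   non_punct_filter('!!!')   -> None      (punctuation only, so it is dropped)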
def pretokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
    """
    Process a file by running it through the given tokenizer, sorting the
    results by the language of each line, and inserting spaces into lines
    to mark the token boundaries. This computes the 'hard part' of
    tokenization and allows the results to be saved, so that we can change
    the finer details of the output without re-running everything.
    """
    out_files = {}
    for line in open(in_filename, encoding='utf-8'):
        text = line_reader(line)
        tokenized, language = tokenizer(text)
        tokens, language = tokenizer(text)
        tokenized = ' '.join(tokens)
        if language is not None:
            out_filename = '%s.%s.txt' % (out_prefix, language)
            if out_filename in out_files:
@@ -39,6 +58,23 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
        out_file.close()
def monolingual_tokenize_file(in_filename, out_filename, language,
                              tokenizer, line_reader=last_tab,
                              token_filter=non_punct_filter,
                              sample_proportion=100):
    with open(in_filename, encoding='utf-8', errors='replace') as in_file:
        with open(out_filename, 'w', encoding='utf-8') as out_file:
            for i, line in enumerate(in_file):
                if i % sample_proportion == 0:
                    text = line_reader(line)
                    tokens, line_language = tokenizer(text)
                    if line_language == language:
                        filtered = [token_filter(t) for t in tokens]
                        filtered = [t for t in filtered if t is not None]
                        for token in filtered:
                            print(token, file=out_file)
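# Annotation (not part of the commit): 'i % sample_proportion == 0' keeps one line
# out of every sample_proportion lines (1/100 by default), and the surviving tokens
# are written one per line, which is the format the ninja 'count' rule expects.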
def rosette_surface_tokenizer(text):
    try:
        analysis, lang = ROSETTE.rosette.analyze(text)
@@ -50,7 +86,7 @@ def rosette_surface_tokenizer(text):
    for (stem, pos, span) in analysis:
        surface_text = text[span[0]:span[1]]
        tokens.append(surface_text)
    return ' '.join(tokens), language
    return tokens, language
def treebank_surface_tokenizer(text, language='en'):