From 815d393b74918cb6f700b05f2c508078754bcfbd Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Wed, 29 Apr 2015 15:22:04 -0400 Subject: [PATCH 01/16] move commands into cli/ directory --- wordfreq_builder/wordfreq_builder/cli/__init__.py | 0 .../{cmd_combine_lists.py => cli/combine_lists.py} | 0 .../{cmd_count_twitter.py => cli/count_twitter.py} | 0 .../{cmd_count_wikipedia.py => cli/count_wikipedia.py} | 0 4 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 wordfreq_builder/wordfreq_builder/cli/__init__.py rename wordfreq_builder/wordfreq_builder/{cmd_combine_lists.py => cli/combine_lists.py} (100%) rename wordfreq_builder/wordfreq_builder/{cmd_count_twitter.py => cli/count_twitter.py} (100%) rename wordfreq_builder/wordfreq_builder/{cmd_count_wikipedia.py => cli/count_wikipedia.py} (100%) diff --git a/wordfreq_builder/wordfreq_builder/cli/__init__.py b/wordfreq_builder/wordfreq_builder/cli/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/wordfreq_builder/wordfreq_builder/cmd_combine_lists.py b/wordfreq_builder/wordfreq_builder/cli/combine_lists.py similarity index 100% rename from wordfreq_builder/wordfreq_builder/cmd_combine_lists.py rename to wordfreq_builder/wordfreq_builder/cli/combine_lists.py diff --git a/wordfreq_builder/wordfreq_builder/cmd_count_twitter.py b/wordfreq_builder/wordfreq_builder/cli/count_twitter.py similarity index 100% rename from wordfreq_builder/wordfreq_builder/cmd_count_twitter.py rename to wordfreq_builder/wordfreq_builder/cli/count_twitter.py diff --git a/wordfreq_builder/wordfreq_builder/cmd_count_wikipedia.py b/wordfreq_builder/wordfreq_builder/cli/count_wikipedia.py similarity index 100% rename from wordfreq_builder/wordfreq_builder/cmd_count_wikipedia.py rename to wordfreq_builder/wordfreq_builder/cli/count_wikipedia.py From 14e445a937915ab801544f7542b1fbf7b7985649 Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Wed, 29 Apr 2015 15:59:06 -0400 Subject: [PATCH 02/16] WIP on Ninja build automation --- wordfreq_builder/rules.ninja | 28 ++++++++++++++++++++++++++++ wordfreq_builder/wordfreq.cfg | 4 ++++ 2 files changed, 32 insertions(+) create mode 100644 wordfreq_builder/rules.ninja create mode 100644 wordfreq_builder/wordfreq.cfg diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja new file mode 100644 index 0000000..ac8f386 --- /dev/null +++ b/wordfreq_builder/rules.ninja @@ -0,0 +1,28 @@ +# This defines the rules on how to build parts of the wordfreq lists, using the +# Ninja build system: +# +# http://martine.github.io/ninja/manual.html +# +# Ninja is available in the 'ninja-build' Ubuntu package. It's like make with +# better parallelism and the ability for build steps to produce multiple +# outputs. The tradeoff is that its rule syntax isn't full of magic for +# expanding wildcards and finding dependencies, so in general you have to +# write the dependencies using a script. +# +# This file will become the header of the larger build.ninja file, which also +# contains the programatically-defined dependency graph. + +# Variables +DATA = ./data + +# Splits the single file $in into $slices parts, whose names will be +# $prefix plus a two-digit numeric suffix. +rule split + command = split -d -n $slices $in $prefix + +# wiki2text is a tool I wrote using the development version of Nim, which +# extracts plain text from Wikipedia dumps obtained from dumps.wikimedia.org. +# The code is at https://github.com/rspeer/wiki2text, but right now it'll +# take a bit of setup to get it to run. 
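+# As an illustration only (hypothetical paths; the real statements are
+# generated programmatically into build.ninja, as described above), a
+# dependency that uses this rule could look like:
+#
+#   build data/generated/wikipedia/wikipedia_en.txt: wiki2text data/raw-input/wikipedia/enwiki-20150403-pages-articles.xml.bz2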
+rule wiki2text + command = bunzip2 -c $in | wiki2text > $out diff --git a/wordfreq_builder/wordfreq.cfg b/wordfreq_builder/wordfreq.cfg new file mode 100644 index 0000000..3146062 --- /dev/null +++ b/wordfreq_builder/wordfreq.cfg @@ -0,0 +1,4 @@ +[wordfreq] +version = 0.8 +data_dir = ./data +languages = en, es, fr, de, pt, nl, ru, it, ko, ja, zh-TW, zh-CN, ar, ms From 4dae2f8caff82dbb50f924e817a612704841a8a9 Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Wed, 29 Apr 2015 17:13:58 -0400 Subject: [PATCH 03/16] define some ninja rules --- wordfreq_builder/rules.ninja | 12 +++- wordfreq_builder/wordfreq_builder/config.py | 14 ++++ wordfreq_builder/wordfreq_builder/ninja.py | 69 +++++++++++++++++++ .../wordfreq_builder/tokenizers.py | 52 ++++++++++---- 4 files changed, 129 insertions(+), 18 deletions(-) create mode 100644 wordfreq_builder/wordfreq_builder/config.py create mode 100644 wordfreq_builder/wordfreq_builder/ninja.py diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja index ac8f386..2810f1e 100644 --- a/wordfreq_builder/rules.ninja +++ b/wordfreq_builder/rules.ninja @@ -2,7 +2,7 @@ # Ninja build system: # # http://martine.github.io/ninja/manual.html -# +# # Ninja is available in the 'ninja-build' Ubuntu package. It's like make with # better parallelism and the ability for build steps to produce multiple # outputs. The tradeoff is that its rule syntax isn't full of magic for @@ -18,11 +18,17 @@ DATA = ./data # Splits the single file $in into $slices parts, whose names will be # $prefix plus a two-digit numeric suffix. rule split - command = split -d -n $slices $in $prefix + command = mkdir -p $$(dirname $prefix) && split -d -n $slices $in $prefix # wiki2text is a tool I wrote using the development version of Nim, which # extracts plain text from Wikipedia dumps obtained from dumps.wikimedia.org. # The code is at https://github.com/rspeer/wiki2text, but right now it'll # take a bit of setup to get it to run. rule wiki2text - command = bunzip2 -c $in | wiki2text > $out + command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out + +rule tokenize_twitter + command = mkdir -p $$(dirname $prefix) && wordfreq-tokenize-twitter $in $prefix + +rule cat + command = cat $in > $out diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py new file mode 100644 index 0000000..829e07a --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/config.py @@ -0,0 +1,14 @@ +import os + +CONFIG = { + 'version': '0.8', + 'data_dir': './data', + 'languages': [ + 'en', 'es', 'fr', 'de', 'pt', 'nl', 'ru', 'it', 'ar', 'ms', 'id', + 'ja', 'ko', 'zh-TW', 'zh-CN', + ] +} + + +def data_filename(filename): + return os.path.join(CONFIG['data_dir'], filename) diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py new file mode 100644 index 0000000..21ff6c7 --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -0,0 +1,69 @@ +from wordfreq_builder.config import CONFIG, data_filename +import sys + +HEADER = """# This file is automatically generated. Do not edit it. +# You can regenerate it using the 'wordfreq-build-deps' command. +""" + + +def make_ninja_deps(rules_filename, out=sys.stdout): + """ + Output a complete Ninja file describing how to build the wordfreq data. 
+ """ + print(HEADER, file=out) + # Copy in the rules section + with open(rules_filename, encoding='utf-8') as rulesfile: + print(rulesfile.read(), file=out) + + language_detect_and_tokenize_deps( + data_filename('raw-input/twitter/all-2014.txt'), + slice_prefix='slices/twitter/tweets-2014', + combined_prefix='generated/twitter/tweets-2014', + out=out, slices=10 + ) + + +def language_detect_and_tokenize_deps(input_filename, slice_prefix, + combined_prefix, out, slices=10): + lines = [] + # split the input into slices + slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num) + for num in range(slices)] + build_rule = "build {outs}: split {ins}".format( + outs=' '.join(slice_files), ins=input_filename + ) + lines.append(build_rule) + lines.append(" prefix = {}".format(slice_prefix)) + lines.append(" slices = {}".format(slices)) + lines.append("") + + for slicenum in range(slices): + slice_file = slice_files[slicenum] + language_outputs = [ + '{prefix}.{lang}.txt'.format(prefix=slice_file, lang=language) + for language in CONFIG['languages'] + ] + build_rule = "build {outs}: tokenize_twitter {ins}".format( + outs=' '.join(language_outputs), ins=slice_file + ) + lines.append(build_rule) + lines.append(" prefix = {}".format(slice_file)) + lines.append("") + + for language in CONFIG['languages']: + combined_output = '{prefix}.{lang}.txt'.format(prefix=combined_prefix, lang=language) + language_inputs = [ + '{prefix}.{lang}'.format(prefix=slice_files[slicenum], lang=language) + for slicenum in range(slices) + ] + build_rule = "build {outs}: cat {ins}".format( + outs=combined_output, + ins=' '.join(language_inputs) + ) + lines.append(build_rule) + + print('\n'.join(lines), file=out) + + +if __name__ == '__main__': + make_ninja_deps('rules.ninja') diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py index e20c96f..4a1117f 100644 --- a/wordfreq_builder/wordfreq_builder/tokenizers.py +++ b/wordfreq_builder/wordfreq_builder/tokenizers.py @@ -5,29 +5,47 @@ import re ROSETTE = RosetteReader() -def rosette_tokenizer(text): - analysis, lang = ROSETTE.rosette.analyze(text) - # I'm aware this doesn't do the right things with multi-word stems. - # Wordfreq doesn't either. And wordfreq isn't designed to look up - # multiple words anyway. - tokens = [] - for (stem, pos, span) in analysis: - for subtoken in stem.split(' '): - tokens.append(subtoken + '|' + lang) - return tokens +# Rosette labels the orthographies of Chinese incorrectly +ROSETTE_LANG_MAP = { + 'zh_sc': 'zh-CN', + 'zh_tc': 'zh-TW' +} + + +def last_tab(line): + """ + Read lines by keeping only the last tab-separated value. 
+ """ + return line.split('\t')[-1].strip() + + +def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab): + out_files = {} + for line in open(in_filename, encoding='utf-8'): + text = line_reader(line) + tokenized, language = tokenizer(text) + out_filename = '%s.%s.txt' % (out_prefix, language) + if out_filename in out_files: + out_file = out_files[out_filename] + else: + out_file = open(out_filename, 'w', encoding='utf-8') + out_files[out_filename] = out_file + print(tokenized, file=out_file) + for out_file in out_files.values(): + out_file.close() def rosette_surface_tokenizer(text): analysis, lang = ROSETTE.rosette.analyze(text) + language = ROSETTE_LANG_MAP.get(lang, lang) tokens = [] for (stem, pos, span) in analysis: surface_text = text[span[0]:span[1]] - for subtoken in surface_text.split(' '): - tokens.append(subtoken + '|' + lang) - return tokens + tokens.append(surface_text) + return ' '.join(tokens), language -def treebank_surface_tokenizer(text): +def treebank_surface_tokenizer(text, language='en'): """ This is a simplified version of the Treebank tokenizer in NLTK. @@ -45,6 +63,10 @@ def treebank_surface_tokenizer(text): as a result -- for example, it splits "wanna" into "wan" and "na", which are supposed to be considered unusual surface forms of "want" and "to". We just leave it as the word "wanna". + + The language will just be returned, as this function isn't doing any + language detection. It defaults to 'en', as English is the language that + Treebank tokenization is designed for. """ #starting quotes text = re.sub(r'^\"', r'``', text) @@ -80,4 +102,4 @@ def treebank_surface_tokenizer(text): text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ", text) - return text.split() + return text.split(), language From 2a1b16b55c66b3f2693c6db3e5c53a0768602098 Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Thu, 30 Apr 2015 13:02:58 -0400 Subject: [PATCH 04/16] use script codes for Chinese --- wordfreq_builder/wordfreq_builder/config.py | 2 +- wordfreq_builder/wordfreq_builder/tokenizers.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py index 829e07a..527c4e9 100644 --- a/wordfreq_builder/wordfreq_builder/config.py +++ b/wordfreq_builder/wordfreq_builder/config.py @@ -5,7 +5,7 @@ CONFIG = { 'data_dir': './data', 'languages': [ 'en', 'es', 'fr', 'de', 'pt', 'nl', 'ru', 'it', 'ar', 'ms', 'id', - 'ja', 'ko', 'zh-TW', 'zh-CN', + 'ja', 'ko', 'zh-Hans', 'zh-Hant', ] } diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py index 4a1117f..9140cf4 100644 --- a/wordfreq_builder/wordfreq_builder/tokenizers.py +++ b/wordfreq_builder/wordfreq_builder/tokenizers.py @@ -7,8 +7,8 @@ ROSETTE = RosetteReader() # Rosette labels the orthographies of Chinese incorrectly ROSETTE_LANG_MAP = { - 'zh_sc': 'zh-CN', - 'zh_tc': 'zh-TW' + 'zh_sc': 'zh-Hans', + 'zh_tc': 'zh-Hant' } From 5437bb4e8598ce3a28c81a871052ebe657921133 Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Thu, 30 Apr 2015 16:24:28 -0400 Subject: [PATCH 05/16] WIP on new build system --- wordfreq_builder/.gitignore | 4 +++ wordfreq_builder/Makefile | 12 ++++++++ wordfreq_builder/rules.ninja | 2 +- wordfreq_builder/setup.py | 6 ++++ .../wordfreq_builder/cli/build_deps.py | 15 ++++++++++ .../wordfreq_builder/cli/tokenize_twitter.py | 19 +++++++++++++ wordfreq_builder/wordfreq_builder/config.py | 8 ++++-- wordfreq_builder/wordfreq_builder/ninja.py 
| 20 +++++++------ .../wordfreq_builder/tokenizers.py | 28 ++++++++++++------- 9 files changed, 92 insertions(+), 22 deletions(-) create mode 100644 wordfreq_builder/Makefile create mode 100644 wordfreq_builder/wordfreq_builder/cli/build_deps.py create mode 100644 wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py diff --git a/wordfreq_builder/.gitignore b/wordfreq_builder/.gitignore index 46c58ff..a1da2e9 100644 --- a/wordfreq_builder/.gitignore +++ b/wordfreq_builder/.gitignore @@ -6,3 +6,7 @@ dist *.egg-info build _build +build.ninja +data +.ninja_deps +.ninja_log diff --git a/wordfreq_builder/Makefile b/wordfreq_builder/Makefile new file mode 100644 index 0000000..0820c5c --- /dev/null +++ b/wordfreq_builder/Makefile @@ -0,0 +1,12 @@ +PYTHON = python + +all: build.ninja + +# make sure this package is in 'develop' mode and up to date +wordfreq_builder.egg-info/PKG-INFO: setup.py + $(PYTHON) setup.py develop + +# build the Ninja file that will take over the build process +build.ninja: rules.ninja wordfreq_builder/ninja.py wordfreq_builder/config.py wordfreq_builder.egg-info/PKG-INFO + wordfreq-build-deps rules.ninja > build.ninja + diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja index 2810f1e..0355a0f 100644 --- a/wordfreq_builder/rules.ninja +++ b/wordfreq_builder/rules.ninja @@ -18,7 +18,7 @@ DATA = ./data # Splits the single file $in into $slices parts, whose names will be # $prefix plus a two-digit numeric suffix. rule split - command = mkdir -p $$(dirname $prefix) && split -d -n $slices $in $prefix + command = mkdir -p $$(dirname $prefix) && split -d -n r/$slices $in $prefix # wiki2text is a tool I wrote using the development version of Nim, which # extracts plain text from Wikipedia dumps obtained from dumps.wikimedia.org. 
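A note on the change to the 'split' rule above: with GNU coreutils, '-n N'
splits the input into N chunks by byte count and can cut a line (here, a
tweet) in half, while '-n r/N' deals out whole lines round-robin, so no line
is ever split across slices. A minimal sketch of the two modes, with
hypothetical filenames:

    split -d -n 3   all-2014.txt tweets.part    # byte-sized chunks; may split a line
    split -d -n r/3 all-2014.txt tweets.part    # whole lines, distributed round-robin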
diff --git a/wordfreq_builder/setup.py b/wordfreq_builder/setup.py index e57c58e..1de97b5 100755 --- a/wordfreq_builder/setup.py +++ b/wordfreq_builder/setup.py @@ -9,4 +9,10 @@ setup( platforms=["any"], description="Turns raw data into word frequency lists", packages=['wordfreq_builder'], + entry_points={ + 'console_scripts': [ + 'wordfreq-tokenize-twitter = wordfreq_builder.cli.tokenize_twitter:main', + 'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main' + ] + } ) diff --git a/wordfreq_builder/wordfreq_builder/cli/build_deps.py b/wordfreq_builder/wordfreq_builder/cli/build_deps.py new file mode 100644 index 0000000..3fd74ad --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/build_deps.py @@ -0,0 +1,15 @@ +from wordfreq_builder.ninja import make_ninja_deps +import argparse + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('in_filename', help='filename of rules file') + args = parser.parse_args() + + # Make the complete ninja file and write it to standard out + make_ninja_deps(args.in_filename) + + +if __name__ == '__main__': + main() diff --git a/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py b/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py new file mode 100644 index 0000000..aa1e061 --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py @@ -0,0 +1,19 @@ +from wordfreq_builder.tokenizers import rosette_surface_tokenizer, tokenize_file +import argparse + + +def tokenize_twitter(in_filename, out_prefix): + tokenize_file(in_filename, out_prefix, + tokenizer=rosette_surface_tokenizer) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('filename', help='filename of input file containing one tweet per line') + parser.add_argument('outprefix', help='prefix of output filenames') + args = parser.parse_args() + tokenize_twitter(args.filename, args.outprefix) + + +if __name__ == '__main__': + main() diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py index 527c4e9..c3e5cff 100644 --- a/wordfreq_builder/wordfreq_builder/config.py +++ b/wordfreq_builder/wordfreq_builder/config.py @@ -1,12 +1,14 @@ import os CONFIG = { - 'version': '0.8', - 'data_dir': './data', + 'version': '0.9.0', + # data_dir is a relative or absolute path to where the wordlist data + # is stored + 'data_dir': 'data', 'languages': [ 'en', 'es', 'fr', 'de', 'pt', 'nl', 'ru', 'it', 'ar', 'ms', 'id', 'ja', 'ko', 'zh-Hans', 'zh-Hant', - ] + ], } diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index 21ff6c7..19d39d2 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -17,14 +17,14 @@ def make_ninja_deps(rules_filename, out=sys.stdout): language_detect_and_tokenize_deps( data_filename('raw-input/twitter/all-2014.txt'), - slice_prefix='slices/twitter/tweets-2014', - combined_prefix='generated/twitter/tweets-2014', - out=out, slices=10 + slice_prefix=data_filename('slices/twitter/tweets-2014'), + combined_prefix=data_filename('generated/twitter/tweets-2014'), + out=out, slices=40 ) def language_detect_and_tokenize_deps(input_filename, slice_prefix, - combined_prefix, out, slices=10): + combined_prefix, out, slices): lines = [] # split the input into slices slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num) @@ -33,7 +33,7 @@ def language_detect_and_tokenize_deps(input_filename, slice_prefix, outs=' '.join(slice_files), ins=input_filename ) 
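    # For illustration, with the defaults used in make_ninja_deps this builds a
    # line roughly like (paths abbreviated):
    #   build data/slices/twitter/tweets-2014.part00 ... .part39: split data/raw-input/twitter/all-2014.txt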
lines.append(build_rule) - lines.append(" prefix = {}".format(slice_prefix)) + lines.append(" prefix = {}.part".format(slice_prefix)) lines.append(" slices = {}".format(slices)) lines.append("") @@ -43,7 +43,7 @@ def language_detect_and_tokenize_deps(input_filename, slice_prefix, '{prefix}.{lang}.txt'.format(prefix=slice_file, lang=language) for language in CONFIG['languages'] ] - build_rule = "build {outs}: tokenize_twitter {ins}".format( + build_rule = "build {outs}: tokenize_twitter {ins} | wordfreq_builder/tokenizers.py".format( outs=' '.join(language_outputs), ins=slice_file ) lines.append(build_rule) @@ -53,7 +53,7 @@ def language_detect_and_tokenize_deps(input_filename, slice_prefix, for language in CONFIG['languages']: combined_output = '{prefix}.{lang}.txt'.format(prefix=combined_prefix, lang=language) language_inputs = [ - '{prefix}.{lang}'.format(prefix=slice_files[slicenum], lang=language) + '{prefix}.{lang}.txt'.format(prefix=slice_files[slicenum], lang=language) for slicenum in range(slices) ] build_rule = "build {outs}: cat {ins}".format( @@ -65,5 +65,9 @@ def language_detect_and_tokenize_deps(input_filename, slice_prefix, print('\n'.join(lines), file=out) -if __name__ == '__main__': +def main(): make_ninja_deps('rules.ninja') + + +if __name__ == '__main__': + main() diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py index 9140cf4..d8df0f7 100644 --- a/wordfreq_builder/wordfreq_builder/tokenizers.py +++ b/wordfreq_builder/wordfreq_builder/tokenizers.py @@ -5,10 +5,13 @@ import re ROSETTE = RosetteReader() -# Rosette labels the orthographies of Chinese incorrectly +# Some of Rosette's language codes are incorrect. For example, 'zh_sc' should +# mean "Chinese as used in Seychelles", which is kind of nonsense. What Rosette +# really means is "Simplified Chinese", whose code is 'zh-Hans'. ROSETTE_LANG_MAP = { 'zh_sc': 'zh-Hans', - 'zh_tc': 'zh-Hant' + 'zh_tc': 'zh-Hant', + 'en_uc': 'en', } @@ -24,19 +27,24 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab): for line in open(in_filename, encoding='utf-8'): text = line_reader(line) tokenized, language = tokenizer(text) - out_filename = '%s.%s.txt' % (out_prefix, language) - if out_filename in out_files: - out_file = out_files[out_filename] - else: - out_file = open(out_filename, 'w', encoding='utf-8') - out_files[out_filename] = out_file - print(tokenized, file=out_file) + if language is not None: + out_filename = '%s.%s.txt' % (out_prefix, language) + if out_filename in out_files: + out_file = out_files[out_filename] + else: + out_file = open(out_filename, 'w', encoding='utf-8') + out_files[out_filename] = out_file + print(tokenized, file=out_file) for out_file in out_files.values(): out_file.close() def rosette_surface_tokenizer(text): - analysis, lang = ROSETTE.rosette.analyze(text) + try: + analysis, lang = ROSETTE.rosette.analyze(text) + except (RuntimeError, UnicodeError) as e: + # Our Rosette interface throws errors given arbitrary data. 
:( + return text, None language = ROSETTE_LANG_MAP.get(lang, lang) tokens = [] for (stem, pos, span) in analysis: From 34400de35a57e5a7b73d99f3fde0aa4021f1a03d Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Thu, 30 Apr 2015 16:25:42 -0400 Subject: [PATCH 06/16] not using wordfreq.cfg anymore --- wordfreq_builder/wordfreq.cfg | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 wordfreq_builder/wordfreq.cfg diff --git a/wordfreq_builder/wordfreq.cfg b/wordfreq_builder/wordfreq.cfg deleted file mode 100644 index 3146062..0000000 --- a/wordfreq_builder/wordfreq.cfg +++ /dev/null @@ -1,4 +0,0 @@ -[wordfreq] -version = 0.8 -data_dir = ./data -languages = en, es, fr, de, pt, nl, ru, it, ko, ja, zh-TW, zh-CN, ar, ms From 61b9440e3d0960db04edd591ec977de331bb6f26 Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Mon, 4 May 2015 13:25:01 -0400 Subject: [PATCH 07/16] add wiki-parsing process --- wordfreq_builder/wordfreq_builder/config.py | 5 +++ wordfreq_builder/wordfreq_builder/ninja.py | 40 +++++++++++++++++---- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py index c3e5cff..bb19bdf 100644 --- a/wordfreq_builder/wordfreq_builder/config.py +++ b/wordfreq_builder/wordfreq_builder/config.py @@ -9,6 +9,11 @@ CONFIG = { 'en', 'es', 'fr', 'de', 'pt', 'nl', 'ru', 'it', 'ar', 'ms', 'id', 'ja', 'ko', 'zh-Hans', 'zh-Hant', ], + # Skip the Chinese Wikipedia until we know what to do with it + 'wp_languages': [ + 'en', 'es', 'fr', 'de', 'pt', 'nl', 'ru', 'it', 'ar', 'ms', 'id', + 'ja', 'ko' + ] } diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index 19d39d2..f3abb9c 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -1,5 +1,6 @@ from wordfreq_builder.config import CONFIG, data_filename import sys +import pathlib HEADER = """# This file is automatically generated. Do not edit it. # You can regenerate it using the 'wordfreq-build-deps' command. 
@@ -15,16 +16,41 @@ def make_ninja_deps(rules_filename, out=sys.stdout): with open(rules_filename, encoding='utf-8') as rulesfile: print(rulesfile.read(), file=out) - language_detect_and_tokenize_deps( - data_filename('raw-input/twitter/all-2014.txt'), - slice_prefix=data_filename('slices/twitter/tweets-2014'), - combined_prefix=data_filename('generated/twitter/tweets-2014'), - out=out, slices=40 + lines = ( + language_detect_and_tokenize_deps( + data_filename('raw-input/twitter/all-2014.txt'), + slice_prefix=data_filename('slices/twitter/tweets-2014'), + combined_prefix=data_filename('generated/twitter/tweets-2014'), + slices=40 + ) + + wiki_parse_deps( + data_filename('raw-input/wikipedia'), + data_filename('generated/wikipedia'), + CONFIG['wp_languages'] + ) ) + print('\n'.join(lines), file=out) + + +def wiki_parse_deps(dirname_in, dirname_out, languages): + lines = [] + path_in = pathlib.Path(dirname_in) + path_out = pathlib.Path(dirname_out) + for language in languages: + # Find the most recent file for this language + input_file = max(path_in.glob( + '{}wiki*.bz2'.format(language) + )) + output_file = path_out / 'wikipedia_{}.txt'.format(language) + build_rule = "build {outs}: wiki2text {ins}".format( + outs=output_file, ins=input_file + ) + lines.append(build_rule) + return lines def language_detect_and_tokenize_deps(input_filename, slice_prefix, - combined_prefix, out, slices): + combined_prefix, slices): lines = [] # split the input into slices slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num) @@ -62,7 +88,7 @@ def language_detect_and_tokenize_deps(input_filename, slice_prefix, ) lines.append(build_rule) - print('\n'.join(lines), file=out) + return lines def main(): From 5787b6bb73cda3458d270e06c0813a2e15c415eb Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Tue, 5 May 2015 13:59:21 -0400 Subject: [PATCH 08/16] add and adjust some build steps - more build steps for Wikipedia - rename 'tokenize_twitter' to 'pretokenize_twitter' to indicate that the results are preliminary --- wordfreq_builder/rules.ninja | 23 ++++++++-- wordfreq_builder/setup.py | 3 +- ...nize_twitter.py => pretokenize_twitter.py} | 10 ++--- .../cli/tokenize_wikipedia.py | 30 +++++++++++++ wordfreq_builder/wordfreq_builder/ninja.py | 7 +++- .../wordfreq_builder/tokenizers.py | 42 +++++++++++++++++-- 6 files changed, 101 insertions(+), 14 deletions(-) rename wordfreq_builder/wordfreq_builder/cli/{tokenize_twitter.py => pretokenize_twitter.py} (62%) create mode 100644 wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja index 0355a0f..9be5bd1 100644 --- a/wordfreq_builder/rules.ninja +++ b/wordfreq_builder/rules.ninja @@ -20,15 +20,30 @@ DATA = ./data rule split command = mkdir -p $$(dirname $prefix) && split -d -n r/$slices $in $prefix -# wiki2text is a tool I wrote using the development version of Nim, which -# extracts plain text from Wikipedia dumps obtained from dumps.wikimedia.org. -# The code is at https://github.com/rspeer/wiki2text, but right now it'll -# take a bit of setup to get it to run. +# wiki2text is a tool I wrote using Nim 0.11, which extracts plain text from +# Wikipedia dumps obtained from dumps.wikimedia.org. The code is at +# https://github.com/rspeer/wiki2text. 
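+#
+# (The 'wiki2tokens' rule added below passes '-t' to wiki2text; judging from
+# the 'count' rule further down, which expects one token per line, the '-t'
+# option asks wiki2text for tokenized output instead of running text.)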
rule wiki2text command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out +rule wiki2tokens + command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text -t > $out + rule tokenize_twitter command = mkdir -p $$(dirname $prefix) && wordfreq-tokenize-twitter $in $prefix +# This rule uses command-line tools to take in a file with one token per line, +# and output a comma-separated file with the token counts: +# +# * 'sort $in | uniq -c' does the actual counting. +# * 'sort -nrk 1' sorts the result in reverse numeric order by the first field +# (the count). +# * The 'sed' command rearranges the lines to be comma-separated values with +# the count coming second, instead of the count being a right-justified +# number at the start of the line. +# +rule count + command = sort $in | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)$/\2,\1/' > $out + rule cat command = cat $in > $out diff --git a/wordfreq_builder/setup.py b/wordfreq_builder/setup.py index 1de97b5..3f3b902 100755 --- a/wordfreq_builder/setup.py +++ b/wordfreq_builder/setup.py @@ -11,7 +11,8 @@ setup( packages=['wordfreq_builder'], entry_points={ 'console_scripts': [ - 'wordfreq-tokenize-twitter = wordfreq_builder.cli.tokenize_twitter:main', + 'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main', + 'wordfreq-tokenize-wikipedia = wordfreq_builder.cli.tokenize_wikipedia:main', 'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main' ] } diff --git a/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py b/wordfreq_builder/wordfreq_builder/cli/pretokenize_twitter.py similarity index 62% rename from wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py rename to wordfreq_builder/wordfreq_builder/cli/pretokenize_twitter.py index aa1e061..c179988 100644 --- a/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py +++ b/wordfreq_builder/wordfreq_builder/cli/pretokenize_twitter.py @@ -1,10 +1,10 @@ -from wordfreq_builder.tokenizers import rosette_surface_tokenizer, tokenize_file +from wordfreq_builder.tokenizers import rosette_surface_tokenizer, pretokenize_file import argparse -def tokenize_twitter(in_filename, out_prefix): - tokenize_file(in_filename, out_prefix, - tokenizer=rosette_surface_tokenizer) +def pretokenize_twitter(in_filename, out_prefix): + pretokenize_file(in_filename, out_prefix, + tokenizer=rosette_surface_tokenizer) def main(): @@ -12,7 +12,7 @@ def main(): parser.add_argument('filename', help='filename of input file containing one tweet per line') parser.add_argument('outprefix', help='prefix of output filenames') args = parser.parse_args() - tokenize_twitter(args.filename, args.outprefix) + pretokenize_twitter(args.filename, args.outprefix) if __name__ == '__main__': diff --git a/wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py b/wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py new file mode 100644 index 0000000..fa97543 --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py @@ -0,0 +1,30 @@ +from wordfreq_builder.tokenizers import rosette_surface_tokenizer, monolingual_tokenize_file +import argparse + + +def tokenize_wikipedia(in_filename, out_filename, language, proportion): + monolingual_tokenize_file( + in_filename, out_filename, + language=language, + tokenizer=rosette_surface_tokenizer, + line_reader=strip_headings, + sample_proportion=proportion + ) + + +def strip_headings(text): + return text.strip().strip('=') + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('in_filename', 
help='filename of input file') + parser.add_argument('out_filename', help='filename of output file') + parser.add_argument('language', help='the language code of the text') + parser.add_argument('-p', '--proportion', help='process 1/n of the lines (default 100)', type=int, default=100) + args = parser.parse_args() + tokenize_wikipedia(args.in_filename, args.out_filename, args.language, args.proportion) + + +if __name__ == '__main__': + main() diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index f3abb9c..ab74ec8 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -46,6 +46,11 @@ def wiki_parse_deps(dirname_in, dirname_out, languages): outs=output_file, ins=input_file ) lines.append(build_rule) + output_file = path_out / 'wikipedia_{}.tokens.txt'.format(language) + build_rule = "build {outs}: wiki2tokens {ins}".format( + outs=output_file, ins=input_file + ) + lines.append(build_rule) return lines @@ -69,7 +74,7 @@ def language_detect_and_tokenize_deps(input_filename, slice_prefix, '{prefix}.{lang}.txt'.format(prefix=slice_file, lang=language) for language in CONFIG['languages'] ] - build_rule = "build {outs}: tokenize_twitter {ins} | wordfreq_builder/tokenizers.py".format( + build_rule = "build {outs}: tokenize_twitter {ins}".format( outs=' '.join(language_outputs), ins=slice_file ) lines.append(build_rule) diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py index d8df0f7..29be251 100644 --- a/wordfreq_builder/wordfreq_builder/tokenizers.py +++ b/wordfreq_builder/wordfreq_builder/tokenizers.py @@ -1,5 +1,6 @@ from lumi_science.text_readers.rosette_readers import RosetteReader import re +import unicodedata ROSETTE = RosetteReader() @@ -15,6 +16,9 @@ ROSETTE_LANG_MAP = { } +NON_PUNCT_RE = re.compile('[0-9A-Za-z\xc0-\u1fff\u2070-\u2fff\u301f-\ufeff0-9A-Za-z\uff66-\U0002ffff]') + + def last_tab(line): """ Read lines by keeping only the last tab-separated value. @@ -22,11 +26,26 @@ def last_tab(line): return line.split('\t')[-1].strip() -def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab): +def non_punct_filter(token): + if NON_PUNCT_RE.search(token): + return token.lower() + else: + return None + + +def pretokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab): + """ + Process a file by running it through the given tokenizer, sorting the + results by the language of each line, and inserting spaces into lines + to mark the token boundaries. This computes the 'hard part' of + tokenization and allows the results to be saved, so that we can change + the finer details of the output without re-running everything. 
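+
+ The output for each detected language goes to a separate file named
+ '<out_prefix>.<language>.txt', as constructed for out_filename below.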
+ """ out_files = {} for line in open(in_filename, encoding='utf-8'): text = line_reader(line) - tokenized, language = tokenizer(text) + tokens, language = tokenizer(text) + tokenized = ' '.join(tokens) if language is not None: out_filename = '%s.%s.txt' % (out_prefix, language) if out_filename in out_files: @@ -39,6 +58,23 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab): out_file.close() +def monolingual_tokenize_file(in_filename, out_filename, language, + tokenizer, line_reader=last_tab, + token_filter=non_punct_filter, + sample_proportion=100): + with open(in_filename, encoding='utf-8', errors='replace') as in_file: + with open(out_filename, 'w', encoding='utf-8') as out_file: + for i, line in enumerate(in_file): + if i % sample_proportion == 0: + text = line_reader(line) + tokens, line_language = tokenizer(text) + if line_language == language: + filtered = [token_filter(t) for t in tokens] + filtered = [t for t in filtered if t is not None] + for token in filtered: + print(token, file=out_file) + + def rosette_surface_tokenizer(text): try: analysis, lang = ROSETTE.rosette.analyze(text) @@ -50,7 +86,7 @@ def rosette_surface_tokenizer(text): for (stem, pos, span) in analysis: surface_text = text[span[0]:span[1]] tokens.append(surface_text) - return ' '.join(tokens), language + return tokens, language def treebank_surface_tokenizer(text, language='en'): From bd579e231915b5b9834272aec4ce4982eb6f7466 Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Tue, 5 May 2015 14:06:13 -0400 Subject: [PATCH 09/16] fix the 'count' ninja rule --- wordfreq_builder/rules.ninja | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja index 9be5bd1..032d80a 100644 --- a/wordfreq_builder/rules.ninja +++ b/wordfreq_builder/rules.ninja @@ -43,7 +43,7 @@ rule tokenize_twitter # number at the start of the line. # rule count - command = sort $in | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)$/\2,\1/' > $out + command = sort $in | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)/\2,\1/' > $out rule cat command = cat $in > $out From 16928ed182c8dd7196ac83223abe9d1aacd1792e Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Tue, 5 May 2015 15:21:24 -0400 Subject: [PATCH 10/16] add rules to count wikipedia tokens --- wordfreq_builder/rules.ninja | 8 ++++++-- wordfreq_builder/wordfreq_builder/ninja.py | 21 +++++++++++++++------ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja index 032d80a..a1dc1c7 100644 --- a/wordfreq_builder/rules.ninja +++ b/wordfreq_builder/rules.ninja @@ -35,7 +35,11 @@ rule tokenize_twitter # This rule uses command-line tools to take in a file with one token per line, # and output a comma-separated file with the token counts: # -# * 'sort $in | uniq -c' does the actual counting. +# * 'LANG=C' disables fancy Unicode sorting and instead just sorts by byte +# order, which is fine because we only need this order so we can run +# 'uniq'. +# * 'sort $in -T $tmp | uniq -c' does the actual counting. The possibly +# large amount of temporary output goes in $tmp. # * 'sort -nrk 1' sorts the result in reverse numeric order by the first field # (the count). # * The 'sed' command rearranges the lines to be comma-separated values with @@ -43,7 +47,7 @@ rule tokenize_twitter # number at the start of the line. 
# rule count - command = sort $in | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)/\2,\1/' > $out + command = mkdir -p $tmp && mkdir -p $$(dirname $out) && LANG=C sort $in -T $tmp | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)/\2,\1/' > $out rule cat command = cat $in > $out diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index ab74ec8..1c58154 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -5,6 +5,7 @@ import pathlib HEADER = """# This file is automatically generated. Do not edit it. # You can regenerate it using the 'wordfreq-build-deps' command. """ +TMPDIR = data_filename('tmp') def make_ninja_deps(rules_filename, out=sys.stdout): @@ -17,13 +18,13 @@ def make_ninja_deps(rules_filename, out=sys.stdout): print(rulesfile.read(), file=out) lines = ( - language_detect_and_tokenize_deps( + twitter_deps( data_filename('raw-input/twitter/all-2014.txt'), slice_prefix=data_filename('slices/twitter/tweets-2014'), - combined_prefix=data_filename('generated/twitter/tweets-2014'), + combined_prefix=data_filename('intermediate/twitter/tweets-2014'), slices=40 ) + - wiki_parse_deps( + wikipedia_deps( data_filename('raw-input/wikipedia'), data_filename('generated/wikipedia'), CONFIG['wp_languages'] @@ -32,7 +33,7 @@ def make_ninja_deps(rules_filename, out=sys.stdout): print('\n'.join(lines), file=out) -def wiki_parse_deps(dirname_in, dirname_out, languages): +def wikipedia_deps(dirname_in, dirname_out, languages): lines = [] path_in = pathlib.Path(dirname_in) path_out = pathlib.Path(dirname_out) @@ -51,11 +52,19 @@ def wiki_parse_deps(dirname_in, dirname_out, languages): outs=output_file, ins=input_file ) lines.append(build_rule) + + token_file = output_file + output_file = path_out / 'wikipedia_{}.counts.txt'.format(language) + build_rule = "build {outs}: count {ins}".format( + outs=output_file, ins=token_file + ) + lines.append(build_rule) + lines.append(" tmp = {}".format(TMPDIR)) return lines -def language_detect_and_tokenize_deps(input_filename, slice_prefix, - combined_prefix, slices): +def twitter_deps(input_filename, slice_prefix, + combined_prefix, slices): lines = [] # split the input into slices slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num) From d2f9c607764c4ebdede2a1ef81ed62d9000367d7 Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Thu, 7 May 2015 16:49:53 -0400 Subject: [PATCH 11/16] WIP on more build steps --- wordfreq_builder/Makefile | 2 +- wordfreq_builder/rules.ninja | 38 +++-- wordfreq_builder/setup.py | 1 + .../wordfreq_builder/cli/combine_lists.py | 16 +- .../wordfreq_builder/cli/count_tokens.py | 16 ++ .../wordfreq_builder/cli/count_twitter.py | 28 ---- .../wordfreq_builder/cli/count_wikipedia.py | 24 --- .../wordfreq_builder/cli/format_twitter.py | 14 ++ wordfreq_builder/wordfreq_builder/config.py | 53 +++++-- wordfreq_builder/wordfreq_builder/ninja.py | 74 ++++++--- .../wordfreq_builder/tokenizers.py | 66 +++++++- .../wordfreq_builder/word_counts.py | 146 ++++++------------ 12 files changed, 268 insertions(+), 210 deletions(-) create mode 100644 wordfreq_builder/wordfreq_builder/cli/count_tokens.py delete mode 100644 wordfreq_builder/wordfreq_builder/cli/count_twitter.py delete mode 100644 wordfreq_builder/wordfreq_builder/cli/count_wikipedia.py create mode 100644 wordfreq_builder/wordfreq_builder/cli/format_twitter.py diff --git a/wordfreq_builder/Makefile b/wordfreq_builder/Makefile index 0820c5c..206c432 100644 --- 
a/wordfreq_builder/Makefile +++ b/wordfreq_builder/Makefile @@ -8,5 +8,5 @@ wordfreq_builder.egg-info/PKG-INFO: setup.py # build the Ninja file that will take over the build process build.ninja: rules.ninja wordfreq_builder/ninja.py wordfreq_builder/config.py wordfreq_builder.egg-info/PKG-INFO - wordfreq-build-deps rules.ninja > build.ninja + $(PYTHON) -m wordfreq_builder.cli.build_deps rules.ninja > build.ninja diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja index a1dc1c7..6b6c018 100644 --- a/wordfreq_builder/rules.ninja +++ b/wordfreq_builder/rules.ninja @@ -29,25 +29,29 @@ rule wiki2text rule wiki2tokens command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text -t > $out -rule tokenize_twitter - command = mkdir -p $$(dirname $prefix) && wordfreq-tokenize-twitter $in $prefix +rule tokenize_japanese + command = mkdir -p $$(dirname $out) && mecab < $in | cut -f 1 | grep -v "EOS" + +rule tokenize_twitter + command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.pretokenize_twitter $in $prefix + +rule format_twitter + command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.format_twitter $in $out + +# To convert the Leeds corpus, look for space-separated lines that start with +# an integer and a decimal. The integer is the rank, which we discard. The +# decimal is the frequency, and the remaining text is the term. Use sed -n +# with /p to output only lines where the match was successful. +rule convert_leeds + command = mkdir -p $$(dirname $out) && sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in > $out + +# To convert the OpenSubtitles frequency data, simply replace spaces with +# commas. +rule convert_opensubtitles + command = mkdir -p $$(dirname $out) && tr ' ' ',' < $in > $out -# This rule uses command-line tools to take in a file with one token per line, -# and output a comma-separated file with the token counts: -# -# * 'LANG=C' disables fancy Unicode sorting and instead just sorts by byte -# order, which is fine because we only need this order so we can run -# 'uniq'. -# * 'sort $in -T $tmp | uniq -c' does the actual counting. The possibly -# large amount of temporary output goes in $tmp. -# * 'sort -nrk 1' sorts the result in reverse numeric order by the first field -# (the count). -# * The 'sed' command rearranges the lines to be comma-separated values with -# the count coming second, instead of the count being a right-justified -# number at the start of the line. 
-# rule count - command = mkdir -p $tmp && mkdir -p $$(dirname $out) && LANG=C sort $in -T $tmp | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)/\2,\1/' > $out + command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.count_tokens $in $out rule cat command = cat $in > $out diff --git a/wordfreq_builder/setup.py b/wordfreq_builder/setup.py index 3f3b902..88b6d49 100755 --- a/wordfreq_builder/setup.py +++ b/wordfreq_builder/setup.py @@ -12,6 +12,7 @@ setup( entry_points={ 'console_scripts': [ 'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main', + 'wordfreq-format-twitter = wordfreq_builder.cli.format_twitter:main', 'wordfreq-tokenize-wikipedia = wordfreq_builder.cli.tokenize_wikipedia:main', 'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main' ] diff --git a/wordfreq_builder/wordfreq_builder/cli/combine_lists.py b/wordfreq_builder/wordfreq_builder/cli/combine_lists.py index 7b67375..61d9674 100644 --- a/wordfreq_builder/wordfreq_builder/cli/combine_lists.py +++ b/wordfreq_builder/wordfreq_builder/cli/combine_lists.py @@ -1,21 +1,19 @@ -from wordfreq_builder.word_counts import read_counts, write_counts, merge_counts -from pathlib import Path +from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist import argparse -def merge_lists(input_names, output_name, balance=False): - count_dicts = [] +def merge_lists(input_names, output_name): + freq_dicts = [] for input_name in input_names: - count_dicts.append(read_counts(Path(input_name))) - merged = merge_counts(count_dicts, balance=balance) - write_counts(merged, Path(output_name)) + freq_dicts.append(read_freqs(input_name)) + merged = merge_freqs(freq_dicts) + write_wordlist(merged, output_name) if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv') - parser.add_argument('-b', '--balance', action='store_true', help='Automatically balance unequally-sampled word frequencies') parser.add_argument('inputs', help='names of input files to merge', nargs='+') args = parser.parse_args() - merge_lists(args.inputs, args.output, balance=args.balance) + merge_lists(args.inputs, args.output) diff --git a/wordfreq_builder/wordfreq_builder/cli/count_tokens.py b/wordfreq_builder/wordfreq_builder/cli/count_tokens.py new file mode 100644 index 0000000..4aeba5b --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/count_tokens.py @@ -0,0 +1,16 @@ +from wordfreq_builder.word_counts import count_tokens, write_wordlist +import argparse + + +def handle_counts(filename_in, filename_out): + counts = count_tokens(filename_in) + write_wordlist(counts, filename_out) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('filename_in', help='name of input file containing tokens') + parser.add_argument('filename_out', help='name of output file') + args = parser.parse_args() + handle_counts(args.filename_in, args.filename_out) + diff --git a/wordfreq_builder/wordfreq_builder/cli/count_twitter.py b/wordfreq_builder/wordfreq_builder/cli/count_twitter.py deleted file mode 100644 index 7613d2d..0000000 --- a/wordfreq_builder/wordfreq_builder/cli/count_twitter.py +++ /dev/null @@ -1,28 +0,0 @@ -from wordfreq_builder.word_counts import WordCountBuilder -from wordfreq_builder.tokenizers import rosette_tokenizer, rosette_surface_tokenizer -from pathlib import Path -import argparse - - -def count_twitter(pathname, offset=0, nsplit=1, surface=False): - path 
= Path(pathname) - if surface == True: - tokenizer = rosette_surface_tokenizer - else: - tokenizer = rosette_tokenizer - builder = WordCountBuilder(tokenizer=tokenizer) - save_filename = 'twitter-counts-%d.csv' % offset - save_pathname = path.parent / save_filename - builder.count_twitter(path, offset, nsplit) - builder.save_wordlist(save_pathname) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('filename', help='filename of input file containing one tweet per line') - parser.add_argument('offset', type=int) - parser.add_argument('nsplit', type=int) - parser.add_argument('-s', '--surface', action='store_true', help='Use surface text instead of stems') - args = parser.parse_args() - count_twitter(args.filename, args.offset, args.nsplit, surface=args.surface) - diff --git a/wordfreq_builder/wordfreq_builder/cli/count_wikipedia.py b/wordfreq_builder/wordfreq_builder/cli/count_wikipedia.py deleted file mode 100644 index da51519..0000000 --- a/wordfreq_builder/wordfreq_builder/cli/count_wikipedia.py +++ /dev/null @@ -1,24 +0,0 @@ -from wordfreq_builder.word_counts import WordCountBuilder -from wordfreq_builder.tokenizers import rosette_tokenizer, rosette_surface_tokenizer -from pathlib import Path -import argparse - - -def count_wikipedia(filename, surface=False): - path = Path(filename) - if surface == True: - tokenizer = rosette_surface_tokenizer - else: - tokenizer = rosette_tokenizer - builder = WordCountBuilder(tokenizer=tokenizer, unique_docs=False) - builder.count_wikipedia(path) - builder.save_wordlist(path.parent / 'counts.csv') - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('filename', help='flat text file containing extracted Wikipedia text') - parser.add_argument('-s', '--surface', action='store_true', help='Use surface text instead of stems') - args = parser.parse_args() - count_wikipedia(args.filename, surface=args.surface) - diff --git a/wordfreq_builder/wordfreq_builder/cli/format_twitter.py b/wordfreq_builder/wordfreq_builder/cli/format_twitter.py new file mode 100644 index 0000000..224c5a1 --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/format_twitter.py @@ -0,0 +1,14 @@ +from wordfreq_builder.tokenizers import retokenize_file +import argparse + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('in_filename', help='filename of input file containing one tweet per line') + parser.add_argument('out_filename', help='filename of output file') + args = parser.parse_args() + retokenize_file(args.in_filename, args.out_filename) + + +if __name__ == '__main__': + main() diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py index bb19bdf..ec62634 100644 --- a/wordfreq_builder/wordfreq_builder/config.py +++ b/wordfreq_builder/wordfreq_builder/config.py @@ -5,17 +5,52 @@ CONFIG = { # data_dir is a relative or absolute path to where the wordlist data # is stored 'data_dir': 'data', - 'languages': [ - 'en', 'es', 'fr', 'de', 'pt', 'nl', 'ru', 'it', 'ar', 'ms', 'id', - 'ja', 'ko', 'zh-Hans', 'zh-Hant', - ], - # Skip the Chinese Wikipedia until we know what to do with it - 'wp_languages': [ - 'en', 'es', 'fr', 'de', 'pt', 'nl', 'ru', 'it', 'ar', 'ms', 'id', - 'ja', 'ko' - ] + 'sources': { + # A list of language codes (possibly un-standardized) that we'll + # look up in filenames for these various data sources. 
+ 'twitter': [ + 'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl', + 'pt', 'ru' + ], + 'wikipedia': [ + 'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl', + 'pt', 'ru' + ], + 'opensubtitles': [ + # All languages where the most common word in OpenSubtitles + # appears at least 5000 times + 'ar', 'bg', 'bs', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', + 'fa', 'fi', 'fr', 'he', 'hr', 'hu', 'id', 'is', 'it', 'lt', 'lv', + 'mk', 'ms', 'nb', 'nl', 'pl', 'pt', 'ro', 'sk', 'sl', 'sq', 'sr', + 'sv', 'tr', 'uk', 'zh' + ], + 'leeds': [ + 'ar', 'de', 'el', 'en', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh' + ] + }, + 'wordlist_paths': { + 'twitter': 'generated/twitter/tweets-2014.{lang}{ext}.txt', + 'wikipedia': 'generated/wikipedia/wikipedia_{lang}{ext}.txt', + 'opensubtitles': 'generated/opensubtitles/opensubtitles_{lang}{ext}.txt', + 'leeds': 'generated/leeds/leeds_internet_{lang}{ext}.txt' + } } def data_filename(filename): return os.path.join(CONFIG['data_dir'], filename) + + +def wordlist_filename(source, language, extension=''): + path = CONFIG['wordlist_paths'][source].format( + lang=language, ext=extension + ) + return data_filename(path) + + +def source_names(language): + """ + Get the names of data sources that supply data for the given language. + """ + return sorted([key for key in CONFIG['sources'] + if language in CONFIG['sources'][key]]) diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index 1c58154..3770c41 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -1,4 +1,4 @@ -from wordfreq_builder.config import CONFIG, data_filename +from wordfreq_builder.config import CONFIG, data_filename, wordlist_filename import sys import pathlib @@ -8,6 +8,10 @@ HEADER = """# This file is automatically generated. Do not edit it. TMPDIR = data_filename('tmp') +# Set this to True to rebuild the Twitter tokenization (which takes days) +PRETOKENIZE_TWITTER = False + + def make_ninja_deps(rules_filename, out=sys.stdout): """ Output a complete Ninja file describing how to build the wordfreq data. 
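# (Illustration of the wordlist_filename() helper added to config.py in this
# patch: wordlist_filename('wikipedia', 'en', '.counts') fills in the template
# 'generated/wikipedia/wikipedia_{lang}{ext}.txt' and joins it with data_dir,
# giving 'data/generated/wikipedia/wikipedia_en.counts.txt'.)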
@@ -17,44 +21,53 @@ def make_ninja_deps(rules_filename, out=sys.stdout): with open(rules_filename, encoding='utf-8') as rulesfile: print(rulesfile.read(), file=out) - lines = ( + lines = [] + if PRETOKENIZE_TWITTER: + lines.extend( + twitter_preprocess_deps( + data_filename('raw-input/twitter/all-2014.txt'), + slice_prefix=data_filename('slices/twitter/tweets-2014'), + combined_prefix=data_filename('intermediate/twitter/tweets-2014'), + slices=40, + languages=CONFIG['sources']['twitter'] + ) + ) + lines.extend( twitter_deps( - data_filename('raw-input/twitter/all-2014.txt'), - slice_prefix=data_filename('slices/twitter/tweets-2014'), - combined_prefix=data_filename('intermediate/twitter/tweets-2014'), - slices=40 - ) + + data_filename('intermediate/twitter/tweets-2014'), + languages=CONFIG['sources']['twitter'] + ) + ) + lines.extend( wikipedia_deps( data_filename('raw-input/wikipedia'), - data_filename('generated/wikipedia'), - CONFIG['wp_languages'] + CONFIG['sources']['wikipedia'] ) ) print('\n'.join(lines), file=out) -def wikipedia_deps(dirname_in, dirname_out, languages): +def wikipedia_deps(dirname_in, languages): lines = [] path_in = pathlib.Path(dirname_in) - path_out = pathlib.Path(dirname_out) for language in languages: # Find the most recent file for this language input_file = max(path_in.glob( '{}wiki*.bz2'.format(language) )) - output_file = path_out / 'wikipedia_{}.txt'.format(language) + output_file = wordlist_filename('wikipedia', language, '') build_rule = "build {outs}: wiki2text {ins}".format( outs=output_file, ins=input_file ) lines.append(build_rule) - output_file = path_out / 'wikipedia_{}.tokens.txt'.format(language) + output_file = wordlist_filename('wikipedia', language, '.tokens') build_rule = "build {outs}: wiki2tokens {ins}".format( outs=output_file, ins=input_file ) lines.append(build_rule) token_file = output_file - output_file = path_out / 'wikipedia_{}.counts.txt'.format(language) + output_file = wordlist_filename('wikipedia', language, '.counts') build_rule = "build {outs}: count {ins}".format( outs=output_file, ins=token_file ) @@ -63,12 +76,13 @@ def wikipedia_deps(dirname_in, dirname_out, languages): return lines -def twitter_deps(input_filename, slice_prefix, - combined_prefix, slices): +def twitter_preprocess_deps(input_filename, slice_prefix, + combined_prefix, slices, languages): lines = [] - # split the input into slices + slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num) for num in range(slices)] + # split the input into slices build_rule = "build {outs}: split {ins}".format( outs=' '.join(slice_files), ins=input_filename ) @@ -81,7 +95,7 @@ def twitter_deps(input_filename, slice_prefix, slice_file = slice_files[slicenum] language_outputs = [ '{prefix}.{lang}.txt'.format(prefix=slice_file, lang=language) - for language in CONFIG['languages'] + for language in languages ] build_rule = "build {outs}: tokenize_twitter {ins}".format( outs=' '.join(language_outputs), ins=slice_file @@ -90,8 +104,9 @@ def twitter_deps(input_filename, slice_prefix, lines.append(" prefix = {}".format(slice_file)) lines.append("") - for language in CONFIG['languages']: + for language in languages: combined_output = '{prefix}.{lang}.txt'.format(prefix=combined_prefix, lang=language) + language_inputs = [ '{prefix}.{lang}.txt'.format(prefix=slice_files[slicenum], lang=language) for slicenum in range(slices) @@ -102,6 +117,27 @@ def twitter_deps(input_filename, slice_prefix, ) lines.append(build_rule) + +def twitter_deps(prefix_in, languages): + 
lines = [] + for language in languages: + input_file = '{prefix}.{lang}.txt'.format(prefix=prefix_in, lang=language) + output_file = wordlist_filename('twitter', language, '.tokens') + build_rule = "build {outs}: format_twitter {ins} | {deps}".format( + outs=output_file, + ins=input_file, + deps='wordfreq_builder/tokenizers.py' + ) + lines.append(build_rule) + + token_file = output_file + output_file = wordlist_filename('twitter', language, '.counts') + build_rule = "build {outs}: count {ins}".format( + outs=output_file, ins=token_file + ) + lines.append(build_rule) + lines.append(" tmp = {}".format(TMPDIR)) + return lines diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py index 29be251..e4ea914 100644 --- a/wordfreq_builder/wordfreq_builder/tokenizers.py +++ b/wordfreq_builder/wordfreq_builder/tokenizers.py @@ -1,6 +1,6 @@ from lumi_science.text_readers.rosette_readers import RosetteReader +from html.entities import name2codepoint import re -import unicodedata ROSETTE = RosetteReader() @@ -18,6 +18,9 @@ ROSETTE_LANG_MAP = { NON_PUNCT_RE = re.compile('[0-9A-Za-z\xc0-\u1fff\u2070-\u2fff\u301f-\ufeff0-9A-Za-z\uff66-\U0002ffff]') +EMOTICON_RANGE = '\u2600-\u26ff\U0001F000-\U0001F7FF' +RETOKENIZE_RE = re.compile('[{0}#@/]|[^{0}#@/ ]+'.format(EMOTICON_RANGE)) + def last_tab(line): """ @@ -26,13 +29,17 @@ def last_tab(line): return line.split('\t')[-1].strip() -def non_punct_filter(token): +def lowercase_text_filter(token): if NON_PUNCT_RE.search(token): return token.lower() else: return None +def is_url(token): + return token.startswith('http:') or token.startswith('https:') + + def pretokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab): """ Process a file by running it through the given tokenizer, sorting the @@ -58,9 +65,60 @@ def pretokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab): out_file.close() +ENTITY_RE = re.compile(r'& ?(amp|quot|lt|gt) ?;') + + +def fix_entities(text): + """ + Fix the few HTML entities that Twitter uses -- even if they've + already been tokenized. + """ + def replace_entity(match): + return chr(name2codepoint[match.group(1)]) + return ENTITY_RE.sub(replace_entity, text) + + +def retokenize(text): + text = fix_entities(text) + tokens = RETOKENIZE_RE.findall(text) + skip_next = False + for token in tokens: + if token == '/' or token == '@': + # Avoid idiosyncratic tokens such as URLs and + # usernames + skip_next = True + elif skip_next: + skip_next = False + else: + if not is_url(token): + filtered = lowercase_text_filter(token) + if filtered: + yield filtered + + +def retokenize_file(in_filename, out_filename): + """ + Process a file that has been tokenized (by inserting spaces) in a + language-specific way by Rosette. 
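+
+ The output has one token per line: tokens are lowercased, URLs are dropped,
+ and tokens that follow '/' or '@' (pieces of URLs, and usernames) are
+ skipped, as implemented in retokenize() above.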
+ """ + with open(in_filename, encoding='utf-8') as in_file: + with open(out_filename, 'w', encoding='utf-8') as out_file: + for line in in_file: + skip_next = False + for token in retokenize(line.strip()): + if skip_next: + skip_next = False + elif token == '/' or token == '@': + # Avoid idiosyncratic tokens such as URLs and + # usernames + skip_next = True + elif lowercase_text_filter(token): + print(token, file=out_file) + + def monolingual_tokenize_file(in_filename, out_filename, language, tokenizer, line_reader=last_tab, - token_filter=non_punct_filter, + token_filter=lowercase_text_filter, sample_proportion=100): with open(in_filename, encoding='utf-8', errors='replace') as in_file: with open(out_filename, 'w', encoding='utf-8') as out_file: @@ -78,7 +136,7 @@ def monolingual_tokenize_file(in_filename, out_filename, language, def rosette_surface_tokenizer(text): try: analysis, lang = ROSETTE.rosette.analyze(text) - except (RuntimeError, UnicodeError) as e: + except (RuntimeError, UnicodeError): # Our Rosette interface throws errors given arbitrary data. :( return text, None language = ROSETTE_LANG_MAP.get(lang, lang) diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py index a379e8e..b150ddd 100644 --- a/wordfreq_builder/wordfreq_builder/word_counts.py +++ b/wordfreq_builder/wordfreq_builder/word_counts.py @@ -1,116 +1,64 @@ -from wordfreq_builder.tokenizers import treebank_surface_tokenizer +from wordfreq_builder.tokenizers import retokenize from collections import defaultdict from operator import itemgetter -from pathlib import Path -from unicodedata import normalize +from ftfy import fix_text import csv -import sys -def read_counts(path): +def count_tokens(filename): counts = defaultdict(int) - with path.open(encoding='utf-8', newline='') as infile: - reader = csv.reader(infile) - for key, strval in reader: - val = float(strval) - # Use += so that, if we give the reader concatenated files with - # duplicates, it does the right thing - counts[key] += val + with open(filename, encoding='utf-8') as infile: + for line in infile: + for token in retokenize(line.strip()): + counts[token] += 1 return counts -def count_languages(counts): - langcounts = defaultdict(int) - for key, strval in counts.items(): - val = int(strval) - text, lang = key.rsplit('|', 1) - langcounts[lang] += val - return langcounts +def read_freqs(filename, cutoff=2): + raw_counts = defaultdict(float) + total = 0. + with open(filename, encoding='utf-8', newline='') as infile: + reader = csv.reader(infile) + for key, strval in reader: + val = float(strval) + if val < cutoff: + break + for token in retokenize(key): + token = fix_text(token) + total += val + # Use += so that, if we give the reader concatenated files with + # duplicates, it does the right thing + raw_counts[token] += val + + freqs = {key: raw_count / total + for (key, raw_count) in raw_counts.items()} + return freqs -def merge_counts(count_dicts, balance=False): +def merge_freqs(freq_dicts): + vocab = set() + for freq_dict in freq_dicts: + vocab |= set(freq_dict) + merged = defaultdict(float) - maxweight = None - for counts in count_dicts: - if balance: - if maxweight is None: - maxweight = max(counts.values()) - weight = maxweight / max(counts.values()) / len(count_dicts) - else: - weight = 1. - for key, val in counts.items(): - merged[key] += val * weight + N = len(freq_dicts) + for term in vocab: + term_total = 0. + for freq_dict in freq_dicts: + term_total += freq_dict.get(term, 0.) 
+ merged[term] = term_total / N + return merged -def write_counts(counts, path, cutoff=2): - print("Writing to %s" % path) - with path.open('w', encoding='utf-8', newline='') as outfile: +def write_wordlist(freqs, filename): + """ + Write a dictionary of either raw counts or frequencies to a file of + comma-separated values. + """ + with open(filename, 'w', encoding='utf-8', newline='\n') as outfile: writer = csv.writer(outfile) - items = sorted(counts.items(), key=itemgetter(1), reverse=True) - for word, count in items: - if count < cutoff: - # Don't write all the terms that appeared too infrequently - break + items = sorted(freqs.items(), key=itemgetter(1), reverse=True) + for word, freq in items: if not ('"' in word or ',' in word): - writer.writerow([word, str(int(count))]) - - -class WordCountBuilder: - def __init__(self, unique_docs=True, tokenizer=None): - self.counts = defaultdict(int) - self.unique_docs = unique_docs - if tokenizer is None: - self.tokenizer = treebank_surface_tokenizer - else: - self.tokenizer = tokenizer - - def add_text(self, text): - text = normalize('NFKC', text).lower() - try: - tokens = self.tokenizer(text) - # print(' '.join(tokens)) - except Exception as e: - print("Couldn't tokenize due to %r: %s" % (e, text), file=sys.stderr) - return - if self.unique_docs: - tokens = set(tokens) - for tok in tokens: - self.counts[tok] += 1 - - def count_wikipedia(self, path): - """ - Read a directory of extracted Wikipedia articles. The articles can be - grouped together into files, in which case they should be separated by - lines beginning with ##. - """ - with path.open(encoding='utf-8') as file: - article_lines = [] - for line in file: - line = line.strip() - if line.startswith('= ') and line.endswith(' ='): - # Fake level-1 headings indicate boundaries between articles - print(line) - self.try_wiki_article(' '.join(article_lines)) - article_lines.clear() - else: - # Skip other headings, so that "external" doesn't look - # ridiculously common, for example - if not (line.startswith('==') and line.endswith('==')): - article_lines.append(line) - self.try_wiki_article(' '.join(article_lines)) - - def try_wiki_article(self, text): - if len(text) > 1000: - self.add_text(text) - - def count_twitter(self, path, offset, nsplit): - with path.open(encoding='utf-8') as file: - for i, line in enumerate(file): - if i % nsplit == offset: - line = line.strip() - text = line.split('\t')[-1] - self.add_text(text) - - def save_wordlist(self, path): - write_counts(self.counts, path) + writer.writerow([word, str(freq)]) From 7e238cf547391fed9d532cb79ca70041681bac17 Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Thu, 7 May 2015 16:59:28 -0400 Subject: [PATCH 12/16] abstract how we define build rules a bit --- wordfreq_builder/wordfreq_builder/ninja.py | 89 ++++++++++------------ 1 file changed, 40 insertions(+), 49 deletions(-) diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index 3770c41..4099049 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -12,6 +12,27 @@ TMPDIR = data_filename('tmp') PRETOKENIZE_TWITTER = False +def add_dep(lines, rule, input, output, extra=None, params=None): + if isinstance(output, list): + output = ' '.join(output) + if isinstance(input, list): + input = ' '.join(input) + if extra: + if isinstance(extra, list): + extra = ' '.join(extra) + extrastr = ' | ' + extra + else: + extrastr = '' + build_rule = "build {output}: {rule} {input}{extra}".format( + 
output=output, rule=rule, input=input, extra=extrastr + ) + lines.append(build_rule) + if params: + for key, val in params.items(): + lines.append(" {key} = {val}".format(locals())) + lines.append("") + + def make_ninja_deps(rules_filename, out=sys.stdout): """ Output a complete Ninja file describing how to build the wordfreq data. @@ -55,24 +76,13 @@ def wikipedia_deps(dirname_in, languages): input_file = max(path_in.glob( '{}wiki*.bz2'.format(language) )) - output_file = wordlist_filename('wikipedia', language, '') - build_rule = "build {outs}: wiki2text {ins}".format( - outs=output_file, ins=input_file - ) - lines.append(build_rule) - output_file = wordlist_filename('wikipedia', language, '.tokens') - build_rule = "build {outs}: wiki2tokens {ins}".format( - outs=output_file, ins=input_file - ) - lines.append(build_rule) + raw_file = wordlist_filename('wikipedia', language, '') + token_file = wordlist_filename('wikipedia', language, '.tokens') + count_file = wordlist_filename('wikipedia', language, '.counts') - token_file = output_file - output_file = wordlist_filename('wikipedia', language, '.counts') - build_rule = "build {outs}: count {ins}".format( - outs=output_file, ins=token_file - ) - lines.append(build_rule) - lines.append(" tmp = {}".format(TMPDIR)) + add_dep(lines, 'wiki2text', input_file, raw_file) + add_dep(lines, 'wiki2tokens', input_file, token_file) + add_dep(lines, 'count', token_file, count_file) return lines @@ -83,13 +93,10 @@ def twitter_preprocess_deps(input_filename, slice_prefix, slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num) for num in range(slices)] # split the input into slices - build_rule = "build {outs}: split {ins}".format( - outs=' '.join(slice_files), ins=input_filename - ) - lines.append(build_rule) - lines.append(" prefix = {}.part".format(slice_prefix)) - lines.append(" slices = {}".format(slices)) - lines.append("") + add_dep(lines, + 'split', input_filename, slice_files, + {'prefix': '{}.part'.format(slice_prefix), + 'slices': slices}) for slicenum in range(slices): slice_file = slice_files[slicenum] @@ -97,12 +104,8 @@ def twitter_preprocess_deps(input_filename, slice_prefix, '{prefix}.{lang}.txt'.format(prefix=slice_file, lang=language) for language in languages ] - build_rule = "build {outs}: tokenize_twitter {ins}".format( - outs=' '.join(language_outputs), ins=slice_file - ) - lines.append(build_rule) - lines.append(" prefix = {}".format(slice_file)) - lines.append("") + add_dep(lines, 'tokenize_twitter', slice_file, language_outputs, + {'prefix': slice_file}) for language in languages: combined_output = '{prefix}.{lang}.txt'.format(prefix=combined_prefix, lang=language) @@ -111,32 +114,20 @@ def twitter_preprocess_deps(input_filename, slice_prefix, '{prefix}.{lang}.txt'.format(prefix=slice_files[slicenum], lang=language) for slicenum in range(slices) ] - build_rule = "build {outs}: cat {ins}".format( - outs=combined_output, - ins=' '.join(language_inputs) - ) - lines.append(build_rule) + add_dep(lines, 'cat', language_inputs, combined_output) def twitter_deps(prefix_in, languages): lines = [] for language in languages: input_file = '{prefix}.{lang}.txt'.format(prefix=prefix_in, lang=language) - output_file = wordlist_filename('twitter', language, '.tokens') - build_rule = "build {outs}: format_twitter {ins} | {deps}".format( - outs=output_file, - ins=input_file, - deps='wordfreq_builder/tokenizers.py' - ) - lines.append(build_rule) + token_file = wordlist_filename('twitter', language, '.tokens') + add_dep(lines, + 
'format_twitter', input_file, token_file, + extra='wordfreq_builder/tokenizers.py') - token_file = output_file - output_file = wordlist_filename('twitter', language, '.counts') - build_rule = "build {outs}: count {ins}".format( - outs=output_file, ins=token_file - ) - lines.append(build_rule) - lines.append(" tmp = {}".format(TMPDIR)) + count_file = wordlist_filename('twitter', language, '.counts') + add_dep(lines, 'count', token_file, count_file) return lines From 02d8b321195e0cbf00b5da00cdacb35a5cf043d3 Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Thu, 7 May 2015 17:07:33 -0400 Subject: [PATCH 13/16] process leeds and opensubtitles --- wordfreq_builder/wordfreq_builder/ninja.py | 38 ++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index 4099049..c1e0560 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -65,6 +65,19 @@ def make_ninja_deps(rules_filename, out=sys.stdout): CONFIG['sources']['wikipedia'] ) ) + lines.extend( + leeds_deps( + data_filename('source-lists/leeds'), + CONFIG['sources']['leeds'] + ) + ) + lines.extend( + opensubtitles_deps( + data_filename('source-lists/opensubtitles'), + CONFIG['sources']['opensubtitles'] + ) + ) + print('\n'.join(lines), file=out) @@ -115,6 +128,7 @@ def twitter_preprocess_deps(input_filename, slice_prefix, for slicenum in range(slices) ] add_dep(lines, 'cat', language_inputs, combined_output) + return lines def twitter_deps(prefix_in, languages): @@ -132,6 +146,30 @@ def twitter_deps(prefix_in, languages): return lines +def leeds_deps(dirname_in, languages): + lines = [] + for language in languages: + input_file = '{prefix}/internet-{lang}-forms.num'.format( + prefix=dirname_in, lang=language + ) + reformatted_file = wordlist_filename('leeds', language, '.counts') + add_dep(lines, 'convert_leeds', input_file, reformatted_file) + + return lines + + +def opensubtitles_deps(dirname_in, languages): + lines = [] + for language in languages: + input_file = '{prefix}/{lang}.txt'.format( + prefix=dirname_in, lang=language + ) + reformatted_file = wordlist_filename('opensubtitles', language, '.counts') + add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file) + + return lines + + def main(): make_ninja_deps('rules.ninja') From abb0e059c83f0842a0549314560ce5b7f2f8986d Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Thu, 7 May 2015 19:38:33 -0400 Subject: [PATCH 14/16] a reasonably complete build process --- wordfreq_builder/rules.ninja | 6 +++ wordfreq_builder/setup.py | 1 + .../wordfreq_builder/cli/combine_lists.py | 2 +- .../wordfreq_builder/cli/freqs_to_dB.py | 11 ++++++ wordfreq_builder/wordfreq_builder/config.py | 19 +++++++--- wordfreq_builder/wordfreq_builder/ninja.py | 37 +++++++++++++++---- .../wordfreq_builder/word_counts.py | 25 ++++++++++++- 7 files changed, 85 insertions(+), 16 deletions(-) create mode 100644 wordfreq_builder/wordfreq_builder/cli/freqs_to_dB.py diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja index 6b6c018..12c0360 100644 --- a/wordfreq_builder/rules.ninja +++ b/wordfreq_builder/rules.ninja @@ -53,5 +53,11 @@ rule convert_opensubtitles rule count command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.count_tokens $in $out +rule merge + command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.combine_lists -o $out $in + +rule freqs2dB + command = mkdir -p $$(dirname $out) && python -m 
wordfreq_builder.cli.freqs_to_dB $in $out + rule cat command = cat $in > $out diff --git a/wordfreq_builder/setup.py b/wordfreq_builder/setup.py index 88b6d49..1998708 100755 --- a/wordfreq_builder/setup.py +++ b/wordfreq_builder/setup.py @@ -9,6 +9,7 @@ setup( platforms=["any"], description="Turns raw data into word frequency lists", packages=['wordfreq_builder'], + install_requires=['msgpack'], entry_points={ 'console_scripts': [ 'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main', diff --git a/wordfreq_builder/wordfreq_builder/cli/combine_lists.py b/wordfreq_builder/wordfreq_builder/cli/combine_lists.py index 61d9674..61d4b1d 100644 --- a/wordfreq_builder/wordfreq_builder/cli/combine_lists.py +++ b/wordfreq_builder/wordfreq_builder/cli/combine_lists.py @@ -5,7 +5,7 @@ import argparse def merge_lists(input_names, output_name): freq_dicts = [] for input_name in input_names: - freq_dicts.append(read_freqs(input_name)) + freq_dicts.append(read_freqs(input_name, cutoff=2)) merged = merge_freqs(freq_dicts) write_wordlist(merged, output_name) diff --git a/wordfreq_builder/wordfreq_builder/cli/freqs_to_dB.py b/wordfreq_builder/wordfreq_builder/cli/freqs_to_dB.py new file mode 100644 index 0000000..81a4dde --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/freqs_to_dB.py @@ -0,0 +1,11 @@ +from wordfreq_builder.word_counts import freqs_to_dBpack +import argparse + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('filename_in', help='name of input file containing tokens') + parser.add_argument('filename_out', help='name of output file') + args = parser.parse_args() + freqs_to_dBpack(args.filename_in, args.filename_out) + diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py index ec62634..dafd1c0 100644 --- a/wordfreq_builder/wordfreq_builder/config.py +++ b/wordfreq_builder/wordfreq_builder/config.py @@ -29,10 +29,11 @@ CONFIG = { ] }, 'wordlist_paths': { - 'twitter': 'generated/twitter/tweets-2014.{lang}{ext}.txt', - 'wikipedia': 'generated/wikipedia/wikipedia_{lang}{ext}.txt', - 'opensubtitles': 'generated/opensubtitles/opensubtitles_{lang}{ext}.txt', - 'leeds': 'generated/leeds/leeds_internet_{lang}{ext}.txt' + 'twitter': 'generated/twitter/tweets-2014.{lang}.{ext}', + 'wikipedia': 'generated/wikipedia/wikipedia_{lang}.{ext}', + 'opensubtitles': 'generated/opensubtitles/opensubtitles_{lang}.{ext}', + 'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}', + 'combined': 'generated/combined/combined_{lang}.{ext}' } } @@ -41,7 +42,7 @@ def data_filename(filename): return os.path.join(CONFIG['data_dir'], filename) -def wordlist_filename(source, language, extension=''): +def wordlist_filename(source, language, extension='txt'): path = CONFIG['wordlist_paths'][source].format( lang=language, ext=extension ) @@ -54,3 +55,11 @@ def source_names(language): """ return sorted([key for key in CONFIG['sources'] if language in CONFIG['sources'][key]]) + + +def all_languages(): + languages = set() + for langlist in CONFIG['sources'].values(): + languages |= set(langlist) + return sorted(languages) + diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index c1e0560..1059ba3 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -1,4 +1,6 @@ -from wordfreq_builder.config import CONFIG, data_filename, wordlist_filename +from wordfreq_builder.config import ( + CONFIG, data_filename, 
wordlist_filename, all_languages, source_names +) import sys import pathlib @@ -77,6 +79,7 @@ def make_ninja_deps(rules_filename, out=sys.stdout): CONFIG['sources']['opensubtitles'] ) ) + lines.extend(combine_lists(all_languages())) print('\n'.join(lines), file=out) @@ -89,9 +92,9 @@ def wikipedia_deps(dirname_in, languages): input_file = max(path_in.glob( '{}wiki*.bz2'.format(language) )) - raw_file = wordlist_filename('wikipedia', language, '') - token_file = wordlist_filename('wikipedia', language, '.tokens') - count_file = wordlist_filename('wikipedia', language, '.counts') + raw_file = wordlist_filename('wikipedia', language, 'txt') + token_file = wordlist_filename('wikipedia', language, 'tokens.txt') + count_file = wordlist_filename('wikipedia', language, 'counts.txt') add_dep(lines, 'wiki2text', input_file, raw_file) add_dep(lines, 'wiki2tokens', input_file, token_file) @@ -135,12 +138,12 @@ def twitter_deps(prefix_in, languages): lines = [] for language in languages: input_file = '{prefix}.{lang}.txt'.format(prefix=prefix_in, lang=language) - token_file = wordlist_filename('twitter', language, '.tokens') + token_file = wordlist_filename('twitter', language, 'tokens.txt') add_dep(lines, 'format_twitter', input_file, token_file, extra='wordfreq_builder/tokenizers.py') - count_file = wordlist_filename('twitter', language, '.counts') + count_file = wordlist_filename('twitter', language, 'counts.txt') add_dep(lines, 'count', token_file, count_file) return lines @@ -152,7 +155,7 @@ def leeds_deps(dirname_in, languages): input_file = '{prefix}/internet-{lang}-forms.num'.format( prefix=dirname_in, lang=language ) - reformatted_file = wordlist_filename('leeds', language, '.counts') + reformatted_file = wordlist_filename('leeds', language, 'counts.txt') add_dep(lines, 'convert_leeds', input_file, reformatted_file) return lines @@ -164,12 +167,30 @@ def opensubtitles_deps(dirname_in, languages): input_file = '{prefix}/{lang}.txt'.format( prefix=dirname_in, lang=language ) - reformatted_file = wordlist_filename('opensubtitles', language, '.counts') + reformatted_file = wordlist_filename('opensubtitles', language, 'counts.txt') add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file) return lines +def combine_lists(languages): + lines = [] + for language in languages: + sources = source_names(language) + input_files = [ + wordlist_filename(source, language, 'counts.txt') + for source in sources + ] + output_file = wordlist_filename('combined', language) + add_dep(lines, 'merge', input_files, output_file, + extra='wordfreq_builder/word_counts.py') + + output_dBpack = wordlist_filename('combined', language, 'msgpack.gz') + add_dep(lines, 'freqs2dB', output_file, output_dBpack, + extra='wordfreq_builder/word_counts.py') + return lines + + def main(): make_ninja_deps('rules.ninja') diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py index b150ddd..be49288 100644 --- a/wordfreq_builder/wordfreq_builder/word_counts.py +++ b/wordfreq_builder/wordfreq_builder/word_counts.py @@ -2,7 +2,10 @@ from wordfreq_builder.tokenizers import retokenize from collections import defaultdict from operator import itemgetter from ftfy import fix_text +import math import csv +import msgpack +import gzip def count_tokens(filename): @@ -14,7 +17,7 @@ def count_tokens(filename): return counts -def read_freqs(filename, cutoff=2): +def read_freqs(filename, cutoff=0): raw_counts = defaultdict(float) total = 0. 
with open(filename, encoding='utf-8', newline='') as infile: @@ -35,6 +38,22 @@ def read_freqs(filename, cutoff=2): return freqs +def freqs_to_dBpack(in_filename, out_filename, cutoff=-60): + freq_cutoff = 10 ** (cutoff / 10.) + freqs = read_freqs(in_filename, freq_cutoff) + dBpack = [] + for token, freq in freqs.items(): + dB = round(math.log10(freq) * 10) + if dB >= cutoff: + neg_dB = -dB + while neg_dB >= len(dBpack): + dBpack.append([]) + dBpack[neg_dB].append(token) + + with gzip.open(out_filename, 'wb') as outfile: + msgpack.dump(dBpack, outfile) + + def merge_freqs(freq_dicts): vocab = set() for freq_dict in freq_dicts: @@ -51,7 +70,7 @@ def merge_freqs(freq_dicts): return merged -def write_wordlist(freqs, filename): +def write_wordlist(freqs, filename, cutoff=1e-8): """ Write a dictionary of either raw counts or frequencies to a file of comma-separated values. @@ -60,5 +79,7 @@ def write_wordlist(freqs, filename): writer = csv.writer(outfile) items = sorted(freqs.items(), key=itemgetter(1), reverse=True) for word, freq in items: + if freq < cutoff: + break if not ('"' in word or ',' in word): writer.writerow([word, str(freq)]) From 1b7a2b9d0bcff1c79bd1f8a534528aaf3407fdaa Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Thu, 7 May 2015 23:55:57 -0400 Subject: [PATCH 15/16] fix dependency --- wordfreq_builder/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wordfreq_builder/setup.py b/wordfreq_builder/setup.py index 1998708..1466d35 100755 --- a/wordfreq_builder/setup.py +++ b/wordfreq_builder/setup.py @@ -9,7 +9,7 @@ setup( platforms=["any"], description="Turns raw data into word frequency lists", packages=['wordfreq_builder'], - install_requires=['msgpack'], + install_requires=['msgpack-python'], entry_points={ 'console_scripts': [ 'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main', From 2f14417bcf21e74dfacf0cf637ff699b09fe83f5 Mon Sep 17 00:00:00 2001 From: Rob Speer Date: Thu, 7 May 2015 23:59:04 -0400 Subject: [PATCH 16/16] limit final builds to languages with >= 2 sources --- wordfreq_builder/wordfreq_builder/config.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py index dafd1c0..b6af74d 100644 --- a/wordfreq_builder/wordfreq_builder/config.py +++ b/wordfreq_builder/wordfreq_builder/config.py @@ -10,7 +10,8 @@ CONFIG = { # look up in filenames for these various data sources. 'twitter': [ 'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl', - 'pt', 'ru' + 'pt', 'ru', + # can be added later: 'th', 'tr' ], 'wikipedia': [ 'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl', @@ -34,7 +35,8 @@ CONFIG = { 'opensubtitles': 'generated/opensubtitles/opensubtitles_{lang}.{ext}', 'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}', 'combined': 'generated/combined/combined_{lang}.{ext}' - } + }, + 'min_sources': 2 } @@ -61,5 +63,7 @@ def all_languages(): languages = set() for langlist in CONFIG['sources'].values(): languages |= set(langlist) - return sorted(languages) + return [lang for lang in sorted(languages) + if len(source_names(lang)) + >= CONFIG['min_sources']]
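
Note on the output format written by these patches: freqs_to_dBpack (PATCH 14)
stores each combined wordlist as a gzipped msgpack list indexed by -dB, so the
tokens at index i all have a frequency of roughly 10 ** (-i / 10). The sketch
below shows how such a file could be read back; it is illustrative only and is
not part of the patches above. The load_dBpack/word_frequency names and the
combined_en path are assumptions, and the keyword for decoding strings depends
on the msgpack release (raw=False on newer versions, encoding='utf-8' on the
older msgpack-python pinned in setup.py).

    import gzip
    import msgpack


    def load_dBpack(filename):
        # Illustrative reader for the gzipped msgpack structure produced by
        # freqs_to_dBpack(): a list whose index is the negated dB value.
        with gzip.open(filename, 'rb') as infile:
            # raw=False decodes tokens as UTF-8 strings (msgpack >= 0.5);
            # older releases take encoding='utf-8' instead.
            return msgpack.unpackb(infile.read(), raw=False)


    def word_frequency(dBpack, word):
        # A token stored at index neg_dB was rounded to -neg_dB dB, i.e. a
        # frequency of about 10 ** (-neg_dB / 10).
        for neg_dB, tokens in enumerate(dBpack):
            if word in tokens:
                return 10 ** (-neg_dB / 10)
        return 0.


    if __name__ == '__main__':
        # Path follows the 'combined' entry in CONFIG['wordlist_paths'].
        dBpack = load_dBpack('./data/generated/combined/combined_en.msgpack.gz')
        print(word_frequency(dBpack, 'the'))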