diff --git a/wordfreq_builder/.gitignore b/wordfreq_builder/.gitignore new file mode 100644 index 0000000..a1da2e9 --- /dev/null +++ b/wordfreq_builder/.gitignore @@ -0,0 +1,12 @@ +*.pyc +__pycache__ +.coverage +.idea +dist +*.egg-info +build +_build +build.ninja +data +.ninja_deps +.ninja_log diff --git a/wordfreq_builder/Makefile b/wordfreq_builder/Makefile new file mode 100644 index 0000000..626cf46 --- /dev/null +++ b/wordfreq_builder/Makefile @@ -0,0 +1,8 @@ +PYTHON = python + +all: build.ninja + +# build the Ninja file that will take over the build process +build.ninja: rules.ninja wordfreq_builder/ninja.py wordfreq_builder/config.py wordfreq_builder.egg-info/PKG-INFO + $(PYTHON) -m wordfreq_builder.cli.build_deps rules.ninja > build.ninja + diff --git a/wordfreq_builder/README.md b/wordfreq_builder/README.md new file mode 100644 index 0000000..f1bde22 --- /dev/null +++ b/wordfreq_builder/README.md @@ -0,0 +1,160 @@ +# wordfreq\_builder + +This package builds the data files for [wordfreq](https://github.com/LuminosoInsight/wordfreq). + +It requires a fair amount of external input data (42 GB of it, as of this +writing), which is unfortunately not version-controlled. We'd like to remedy +this situation using some sort of framework, but this requires sorting things +out with Tools. + +## How to build it + +Set up your external hard disk, your networked file system, or whatever thing +you have that's got a couple hundred GB of space free. Let's suppose the +directory of it that you want to use is called `/ext/data`. + +Copy the input data: + + cp -rv /nfs/broadway/data/wordfreq_builder /ext/data/ + +Make a symbolic link so that `data/` in this directory points to +your copy of the input data: + + ln -s /ext/data/wordfreq_builder data + +Install the Ninja build system: + + sudo apt-get install ninja-build + +We need to build a Ninja build file using the Python code in +`wordfreq_builder/ninja.py`. We could do this with Ninja, but... 
you see the +chicken-and-egg problem, don't you. So this is the one thing the Makefile +knows how to do. + + make + +Start the build, and find something else to do for a few hours: + + ninja -v + +You can copy the results into wordfreq with this command (supposing that +$WORDFREQ points to your wordfreq repo): + + cp data/generated/combined/*.msgpack.gz $WORDFREQ/wordfreq/data/ + + +## The dBpack data format + +We pack the wordlists into a small amount of space using a format that I +call "dBpack". This is the data that's found in the .msgpack.gz files that +are output at the end. The format is as follows: + +- The file on disk is a gzipped file in msgpack format, which decodes to a + list of lists of words. + +- Each inner list of words corresponds to a particular word frequency, + rounded to the nearest decibel. 0 dB represents a word that occurs with + probability 1, so it is the only word in the data (this of course doesn't + happen). -20 dB represents a word that occurs once per 100 tokens, -30 dB + represents a word that occurs once per 1000 tokens, and so on. + +- The index of each list within the overall list is the negative of its + frequency in decibels. + +- Each inner list is sorted in alphabetical order. + +As an example, consider a corpus consisting only of the words "red fish +blue fish". The word "fish" occurs as 50% of tokens (-3 dB), while "red" +and "blue" occur as 25% of tokens (-6 dB). The dBpack file of their word +frequencies would decode to this list: + + [[], [], [], ['fish'], [], [], ['blue', 'red']] + + +## The Ninja build process + +Ninja is a lot like Make, except with one big {drawback|advantage}: instead of +writing bizarre expressions in an idiosyncratic language to let Make calculate +which files depend on which other files... + +...you just tell Ninja which files depend on which other files. 
+ +The Ninja documentation suggests using your favorite scripting language to +create the dependency list, so that's what we've done in `ninja.py`. + +Dependencies in Ninja refer to build rules. These do need to be written by hand +in Ninja's own format, but the task is simpler. In this project, the build +rules are defined in `rules.ninja`. They'll be concatenated with the +Python-generated dependency definitions to form the complete build file, +`build.ninja`, which is the default file that Ninja looks at when you run +`ninja`. + +So a lot of the interesting work in this package is done in `rules.ninja`. +This file defines shorthand names for long commands. As a simple example, +the rule named `format_twitter` applies the command + + python -m wordfreq_builder.cli.format_twitter $in $out + +to the dependency file `$in` and the output file `$out`. + +The specific rules are described by the comments in `rules.ninja`. + +## Data sources + +### Leeds Internet Corpus + +Also known as the "Web as Corpus" project, this is a University of Leeds +project that collected wordlists in assorted languages by crawling the Web. +The results are messy, but they're something. We've been using them for quite +a while. + +The original files are in `data/source-lists/leeds`, and they're processed +by the `convert_leeds` rule in `rules.ninja`. + +### Twitter + +The file `data/raw-input/twitter/all-2014.txt` contains about 72 million tweets +collected by the `ftfy.streamtester` package in 2014. + +It takes a lot of work to convert these tweets into data that's usable for +wordfreq. They have to be language-detected and then tokenized. So the result +of language-detection and tokenization is stored in `data/intermediate/twitter`. + +### Google Books + +We use English word frequencies from [Google Books Syntactic Ngrams][gbsn]. +We pretty much ignore the syntactic information, and only use this version +because it's cleaner. 
The data comes in the form of 99 gzipped text files in +`data/raw-input/google-books`. + +[gbsn]: http://commondatastorage.googleapis.com/books/syntactic-ngrams/index.html + +### OpenSubtitles + +[Some guy](https://invokeit.wordpress.com/frequency-word-lists/) made word +frequency lists out of the subtitle text on OpenSubtitles. This data was +used to make Wiktionary word frequency lists at one point, but it's been +updated significantly since the version Wiktionary got. + +The wordlists are in `data/source-lists/opensubtitles`. + +In order to fit into the wordfreq pipeline, we renamed lists with different variants +of the same language code, to distinguish them fully according to BCP 47. Then we +concatenated the different variants into a single list, as follows: + +* `zh_tw.txt` was renamed to `zh-Hant.txt` +* `zh_cn.txt` was renamed to `zh-Hans.txt` +* `zh.txt` was renamed to `zh-Hani.txt` +* `zh-Hant.txt`, `zh-Hans.txt`, and `zh-Hani.txt` were concatenated into `zh.txt` +* `pt.txt` was renamed to `pt-PT.txt` +* `pt_br.txt` was renamed to `pt-BR.txt` +* `pt-BR.txt` and `pt-PT.txt` were concatenated into `pt.txt` + +We also edited the English data to re-add "'t" to words that had obviously lost +it, such as "didn" in the place of "didn't". We applied this to words that +became much less common words in the process, which means this wordlist no +longer represents the words 'don' and 'won', as we assume most of their +frequency comes from "don't" and "won't". Words that turned into similarly +common words, however, were left alone: this list doesn't represent "can't" +because the word was left as "can". 
+ diff --git a/wordfreq_builder/build.png b/wordfreq_builder/build.png new file mode 100644 index 0000000..47fdebd Binary files /dev/null and b/wordfreq_builder/build.png differ diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja new file mode 100644 index 0000000..d693f52 --- /dev/null +++ b/wordfreq_builder/rules.ninja @@ -0,0 +1,80 @@ +# This defines the rules on how to build parts of the wordfreq lists, using the +# Ninja build system: +# +# http://martine.github.io/ninja/manual.html +# +# Ninja is available in the 'ninja-build' Ubuntu package. It's like make with +# better parallelism and the ability for build steps to produce multiple +# outputs. The tradeoff is that its rule syntax isn't full of magic for +# expanding wildcards and finding dependencies, so in general you have to +# write the dependencies using a script. +# +# This file will become the header of the larger build.ninja file, which also +# contains the programmatically-defined dependency graph. + +# Variables +DATA = ./data + +# How to build the build.ninja file itself. (Use the Makefile to get it the +# first time.) +rule build_deps + command = python -m wordfreq_builder.cli.build_deps $in > $out + +# Splits the single file $in into $slices parts, whose names will be +# $prefix plus a two-digit numeric suffix. +rule split + command = mkdir -p $$(dirname $prefix) && split -d -n r/$slices $in $prefix + +# wiki2text is a tool I wrote using Nim 0.11, which extracts plain text from +# Wikipedia dumps obtained from dumps.wikimedia.org. The code is at +# https://github.com/rspeer/wiki2text. +rule wiki2text + command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out + +# To tokenize Japanese, we run it through Mecab and take the first column. +# We don't have a plan for tokenizing Chinese yet. 
+rule tokenize_japanese + command = mkdir -p $$(dirname $out) && mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out + +# Tokenizing text from Twitter requires us to language-detect and tokenize +# in the same step. +rule tokenize_twitter + command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.tokenize_twitter $in $prefix + +# To convert the Leeds corpus, look for space-separated lines that start with +# an integer and a decimal. The integer is the rank, which we discard. The +# decimal is the frequency, and the remaining text is the term. Use sed -n +# with /p to output only lines where the match was successful. +# +# Grep out the term "EOS", an indication that Leeds used MeCab and didn't +# strip out the EOS lines. +rule convert_leeds + command = mkdir -p $$(dirname $out) && sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in | grep -v 'EOS,' > $out + +# To convert the OpenSubtitles frequency data, simply replace spaces with +# commas. +rule convert_opensubtitles + command = mkdir -p $$(dirname $out) && tr ' ' ',' < $in > $out + +# Convert and clean up the Google Books Syntactic N-grams data. Concatenate all +# the input files, keep only the single words and their counts, and only keep +# lines with counts of 100 or more. +# +# (These will still be repeated as the word appears in different grammatical +# roles, information that the source data provides that we're discarding. The +# source data was already filtered to only show words in roles with at least +# two-digit counts of occurrences.) 
+rule convert_google_syntactic_ngrams + command = mkdir -p $$(dirname $out) && zcat $in | cut -f 1,3 | grep -v '[,"]' | sed -rn 's/(.*)\s(...+)/\1,\2/p' > $out + +rule count + command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.count_tokens $in $out + +rule merge + command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.combine_lists -o $out $in + +rule freqs2cB + command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.freqs_to_cB $in $out + +rule cat + command = cat $in > $out diff --git a/wordfreq_builder/setup.py b/wordfreq_builder/setup.py new file mode 100755 index 0000000..c7232cc --- /dev/null +++ b/wordfreq_builder/setup.py @@ -0,0 +1,20 @@ +from setuptools import setup + +setup( + name="wordfreq_builder", + version='0.1', + maintainer='Luminoso Technologies, Inc.', + maintainer_email='info@luminoso.com', + url='http://github.com/LuminosoInsight/wordfreq_builder', + platforms=["any"], + description="Turns raw data into word frequency lists", + packages=['wordfreq_builder'], + install_requires=['msgpack-python', 'pycld2'], + entry_points={ + 'console_scripts': [ + 'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main', + 'wordfreq-format-twitter = wordfreq_builder.cli.format_twitter:main', + 'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main' + ] + } +) diff --git a/wordfreq_builder/wordfreq_builder/__init__.py b/wordfreq_builder/wordfreq_builder/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/wordfreq_builder/wordfreq_builder/cli/__init__.py b/wordfreq_builder/wordfreq_builder/cli/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/wordfreq_builder/wordfreq_builder/cli/build_deps.py b/wordfreq_builder/wordfreq_builder/cli/build_deps.py new file mode 100644 index 0000000..3fd74ad --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/build_deps.py @@ -0,0 +1,15 @@ +from wordfreq_builder.ninja import make_ninja_deps +import argparse + + +def 
main(): + parser = argparse.ArgumentParser() + parser.add_argument('in_filename', help='filename of rules file') + args = parser.parse_args() + + # Make the complete ninja file and write it to standard out + make_ninja_deps(args.in_filename) + + +if __name__ == '__main__': + main() diff --git a/wordfreq_builder/wordfreq_builder/cli/combine_lists.py b/wordfreq_builder/wordfreq_builder/cli/combine_lists.py new file mode 100644 index 0000000..61d4b1d --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/combine_lists.py @@ -0,0 +1,19 @@ +from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist +import argparse + + +def merge_lists(input_names, output_name): + freq_dicts = [] + for input_name in input_names: + freq_dicts.append(read_freqs(input_name, cutoff=2)) + merged = merge_freqs(freq_dicts) + write_wordlist(merged, output_name) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv') + parser.add_argument('inputs', help='names of input files to merge', nargs='+') + args = parser.parse_args() + merge_lists(args.inputs, args.output) + diff --git a/wordfreq_builder/wordfreq_builder/cli/count_tokens.py b/wordfreq_builder/wordfreq_builder/cli/count_tokens.py new file mode 100644 index 0000000..4aeba5b --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/count_tokens.py @@ -0,0 +1,16 @@ +from wordfreq_builder.word_counts import count_tokens, write_wordlist +import argparse + + +def handle_counts(filename_in, filename_out): + counts = count_tokens(filename_in) + write_wordlist(counts, filename_out) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('filename_in', help='name of input file containing tokens') + parser.add_argument('filename_out', help='name of output file') + args = parser.parse_args() + handle_counts(args.filename_in, args.filename_out) + diff --git 
a/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py b/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py new file mode 100644 index 0000000..6bf3957 --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py @@ -0,0 +1,11 @@ +from wordfreq_builder.word_counts import freqs_to_cBpack +import argparse + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('filename_in', help='name of input file containing tokens') + parser.add_argument('filename_out', help='name of output file') + args = parser.parse_args() + freqs_to_cBpack(args.filename_in, args.filename_out) + diff --git a/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py b/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py new file mode 100644 index 0000000..df2cb6b --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py @@ -0,0 +1,19 @@ +from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_file +import argparse + + +def tokenize_twitter(in_filename, out_prefix): + tokenize_file(in_filename, out_prefix, + tokenizer=cld2_surface_tokenizer) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('filename', help='filename of input file containing one tweet per line') + parser.add_argument('outprefix', help='prefix of output filenames') + args = parser.parse_args() + tokenize_twitter(args.filename, args.outprefix) + + +if __name__ == '__main__': + main() diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py new file mode 100644 index 0000000..a80c327 --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/config.py @@ -0,0 +1,87 @@ +import os + +CONFIG = { + 'version': '1.0b', + # data_dir is a relative or absolute path to where the wordlist data + # is stored + 'data_dir': 'data', + 'sources': { + # A list of language codes (possibly un-standardized) that we'll + # look up in filenames for these various data sources. 
+ 'twitter': [ + 'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl', + 'pt', 'ru', + # can be added later: 'th', 'tr' + ], + 'wikipedia': [ + 'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl', + 'pt', 'ru' + # many more can be added + ], + 'opensubtitles': [ + # All languages where the most common word in OpenSubtitles + # appears at least 5000 times + 'ar', 'bg', 'bs', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', + 'fa', 'fi', 'fr', 'he', 'hr', 'hu', 'id', 'is', 'it', 'lt', 'lv', + 'mk', 'ms', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sq', + 'sr', 'sv', 'tr', 'uk', 'zh' + ], + 'leeds': [ + 'ar', 'de', 'el', 'en', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh' + ], + 'google-books': [ + 'en', + # Using the 2012 data, we could get French, German, Italian, + # Russian, Spanish, and (Simplified) Chinese. + ] + }, + 'wordlist_paths': { + 'twitter': 'generated/twitter/tweets-2014.{lang}.{ext}', + 'wikipedia': 'generated/wikipedia/wikipedia_{lang}.{ext}', + 'opensubtitles': 'generated/opensubtitles/opensubtitles_{lang}.{ext}', + 'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}', + 'google-books': 'generated/google-books/google_books_{lang}.{ext}', + 'combined': 'generated/combined/combined_{lang}.{ext}', + 'combined-dist': 'dist/combined_{lang}.{ext}', + 'twitter-dist': 'dist/twitter_{lang}.{ext}' + }, + 'min_sources': 2 +} + + +def data_filename(filename): + """ + Convert a relative filename to a path inside the configured data_dir. + """ + return os.path.join(CONFIG['data_dir'], filename) + + +def wordlist_filename(source, language, extension='txt'): + """ + Get the path where a particular built wordlist should go, parameterized by + its language and its file extension. + """ + path = CONFIG['wordlist_paths'][source].format( + lang=language, ext=extension + ) + return data_filename(path) + + +def source_names(language): + """ + Get the names of data sources that supply data for the given language. 
+ """ + return sorted(key for key in CONFIG['sources'] + if language in CONFIG['sources'][key]) + + +def all_languages(): + """ + Get all languages that should have their data built, which is those that + are supported by at least `min_sources` sources. + """ + languages = set() + for langlist in CONFIG['sources'].values(): + languages |= set(langlist) + return [lang for lang in sorted(languages) + if len(source_names(lang)) >= CONFIG['min_sources']] diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py new file mode 100644 index 0000000..ec59716 --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -0,0 +1,231 @@ +from wordfreq_builder.config import ( + CONFIG, data_filename, wordlist_filename, all_languages, source_names +) +import sys +import pathlib + +HEADER = """# This file is automatically generated. Do not edit it. +# You can regenerate it using the 'wordfreq-build-deps' command. +""" +TMPDIR = data_filename('tmp') + + +# Set this to True to rebuild the Twitter tokenization (which takes days) +TOKENIZE_TWITTER = True + + +def add_dep(lines, rule, input, output, extra=None, params=None): + if isinstance(output, list): + output = ' '.join(output) + if isinstance(input, list): + input = ' '.join(input) + if extra: + if isinstance(extra, list): + extra = ' '.join(extra) + extrastr = ' | ' + extra + else: + extrastr = '' + build_rule = "build {output}: {rule} {input}{extra}".format( + output=output, rule=rule, input=input, extra=extrastr + ) + lines.append(build_rule) + if params: + for key, val in params.items(): + lines.append(" {key} = {val}".format(key=key, val=val)) + lines.append("") + + +def make_ninja_deps(rules_filename, out=sys.stdout): + """ + Output a complete Ninja file describing how to build the wordfreq data. 
+ """ + print(HEADER, file=out) + # Copy in the rules section + with open(rules_filename, encoding='utf-8') as rulesfile: + print(rulesfile.read(), file=out) + + lines = [] + # The first dependency is to make sure the build file is up to date. + add_dep(lines, 'build_deps', 'rules.ninja', 'build.ninja', + extra='wordfreq_builder/ninja.py') + + if TOKENIZE_TWITTER: + lines.extend( + twitter_deps( + data_filename('raw-input/twitter/all-2014.txt'), + slice_prefix=data_filename('slices/twitter/tweets-2014'), + combined_prefix=data_filename('generated/twitter/tweets-2014'), + slices=40, + languages=CONFIG['sources']['twitter'] + ) + ) + lines.extend( + wikipedia_deps( + data_filename('raw-input/wikipedia'), + CONFIG['sources']['wikipedia'] + ) + ) + lines.extend( + google_books_deps( + data_filename('raw-input/google-books') + ) + ) + lines.extend( + leeds_deps( + data_filename('source-lists/leeds'), + CONFIG['sources']['leeds'] + ) + ) + lines.extend( + opensubtitles_deps( + data_filename('source-lists/opensubtitles'), + CONFIG['sources']['opensubtitles'] + ) + ) + lines.extend(combine_lists(all_languages())) + + print('\n'.join(lines), file=out) + + +def wikipedia_deps(dirname_in, languages): + lines = [] + path_in = pathlib.Path(dirname_in) + for language in languages: + # Find the most recent file for this language + # Skip over files that do not exist + input_file = max(path_in.glob( + '{}wiki*.bz2'.format(language) + )) + plain_text_file = wordlist_filename('wikipedia', language, 'txt') + count_file = wordlist_filename('wikipedia', language, 'counts.txt') + + add_dep(lines, 'wiki2text', input_file, plain_text_file) + if language == 'ja': + mecab_token_file = wordlist_filename('wikipedia', language, 'mecab-tokens.txt') + add_dep(lines, 'tokenize_japanese', plain_text_file, mecab_token_file) + add_dep(lines, 'count', mecab_token_file, count_file) + else: + add_dep(lines, 'count', plain_text_file, count_file) + + return lines + + +def google_books_deps(dirname_in): + 
# Get English data from the split-up files of the Google Syntactic N-grams + # 2013 corpus. + lines = [] + + # Yes, the files are numbered 00 through 98 of 99. This is not an + # off-by-one error. Not on my part, anyway. + input_files = [ + '{}/nodes.{:>02d}-of-99.gz'.format(dirname_in, i) + for i in range(99) + ] + output_file = wordlist_filename('google-books', 'en', 'counts.txt') + add_dep(lines, 'convert_google_syntactic_ngrams', input_files, output_file) + return lines + + +def twitter_deps(input_filename, slice_prefix, + combined_prefix, slices, languages): + lines = [] + + slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num) + for num in range(slices)] + # split the input into slices + add_dep(lines, + 'split', input_filename, slice_files, + params={'prefix': '{}.part'.format(slice_prefix), + 'slices': slices}) + + for slicenum in range(slices): + slice_file = slice_files[slicenum] + language_outputs = [ + '{prefix}.{lang}.txt'.format(prefix=slice_file, lang=language) + for language in languages + ] + add_dep(lines, 'tokenize_twitter', slice_file, language_outputs, + params={'prefix': slice_file}) + + for language in languages: + combined_output = wordlist_filename('twitter', language, 'tokens.txt') + + language_inputs = [ + '{prefix}.{lang}.txt'.format(prefix=slice_files[slicenum], lang=language) + for slicenum in range(slices) + ] + + add_dep(lines, 'cat', language_inputs, combined_output) + + count_file = wordlist_filename('twitter', language, 'counts.txt') + + if language == 'ja': + mecab_token_file = wordlist_filename('twitter', language, 'mecab-tokens.txt') + add_dep(lines, 'tokenize_japanese', combined_output, mecab_token_file) + add_dep(lines, 'count', mecab_token_file, count_file, extra='wordfreq_builder/tokenizers.py') + else: + add_dep(lines, 'count', combined_output, count_file, extra='wordfreq_builder/tokenizers.py') + + return lines + + +def leeds_deps(dirname_in, languages): + lines = [] + for language in languages: + 
input_file = '{prefix}/internet-{lang}-forms.num'.format( + prefix=dirname_in, lang=language + ) + reformatted_file = wordlist_filename('leeds', language, 'counts.txt') + add_dep(lines, 'convert_leeds', input_file, reformatted_file) + + return lines + + +def opensubtitles_deps(dirname_in, languages): + lines = [] + for language in languages: + input_file = '{prefix}/{lang}.txt'.format( + prefix=dirname_in, lang=language + ) + reformatted_file = wordlist_filename('opensubtitles', language, 'counts.txt') + add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file) + + return lines + + +def combine_lists(languages): + lines = [] + for language in languages: + sources = source_names(language) + input_files = [ + wordlist_filename(source, language, 'counts.txt') + for source in sources + ] + output_file = wordlist_filename('combined', language) + add_dep(lines, 'merge', input_files, output_file, + extra='wordfreq_builder/word_counts.py') + + output_cBpack = wordlist_filename('combined-dist', language, 'msgpack.gz') + add_dep(lines, 'freqs2cB', output_file, output_cBpack, + extra='wordfreq_builder/word_counts.py') + + lines.append('default {}'.format(output_cBpack)) + + # Write standalone lists for Twitter frequency + if language in CONFIG['sources']['twitter']: + input_file = wordlist_filename('twitter', language, 'counts.txt') + output_cBpack = wordlist_filename('twitter-dist', language, 'msgpack.gz') + add_dep(lines, 'freqs2cB', input_file, output_cBpack, + extra='wordfreq_builder/word_counts.py') + + lines.append('default {}'.format(output_cBpack)) + + return lines + + +def main(): + make_ninja_deps('rules.ninja') + + +if __name__ == '__main__': + main() diff --git a/wordfreq_builder/wordfreq_builder/ninja2dot.py b/wordfreq_builder/wordfreq_builder/ninja2dot.py new file mode 100644 index 0000000..431ac09 --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/ninja2dot.py @@ -0,0 +1,29 @@ +import sys + + +def ninja_to_dot(): + def last_component(path): + 
return path.split('/')[-1] + + print("digraph G {") + print('rankdir="LR";') + for line in sys.stdin: + line = line.rstrip() + parts = line.split(' ') + if parts[0] == 'build': + # the output file is the first argument; strip off the colon that + # comes from ninja syntax + outfile = last_component(parts[1][:-1]) + operation = parts[2] + infiles = [last_component(part) for part in parts[3:]] + for infile in infiles: + if infile == '|': + # external dependencies start here; let's not graph those + break + print('"%s" -> "%s" [label="%s"]' % (infile, outfile, operation)) + print("}") + + +if __name__ == '__main__': + ninja_to_dot() + diff --git a/wordfreq_builder/wordfreq_builder/tests/test_tokenizer.py b/wordfreq_builder/wordfreq_builder/tests/test_tokenizer.py new file mode 100644 index 0000000..a26feab --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/tests/test_tokenizer.py @@ -0,0 +1,51 @@ +from wordfreq_builder.tokenizers import cld2_surface_tokenizer, cld2_detect_language +from nose.tools import eq_ + + +def test_tokenizer_1(): + text = '"This is a test," she said, "and I\'ll bet y\'all $3.50 that it won\'t fail."' + tokens = [ + 'this', 'is', 'a', 'test', 'she', 'said', + 'and', "i'll", 'bet', "y'all", '3', '50', 'that', + 'it', "won't", 'fail', + ] + result = cld2_surface_tokenizer(text) + eq_(result[1], tokens) + eq_(result[0], 'en') + +def test_tokenizer_2(): + text = "i use punctuation informally...see?like this." + tokens = [ + 'i', 'use', 'punctuation', 'informally', 'see', + 'like', 'this' + ] + result = cld2_surface_tokenizer(text) + eq_(result[1], tokens) + eq_(result[0], 'en') + +def test_tokenizer_3(): + text = "@ExampleHandle This parser removes twitter handles!" 
+ tokens = ['this', 'parser', 'removes', 'twitter', 'handles'] + result = cld2_surface_tokenizer(text) + eq_(result[1], tokens) + eq_(result[0], 'en') + +def test_tokenizer_4(): + text = "This is a really boring example tco http://t.co/n15ASlkase" + tokens = ['this', 'is', 'a', 'really', 'boring', 'example', 'tco'] + result = cld2_surface_tokenizer(text) + eq_(result[1], tokens) + eq_(result[0], 'en') + + +def test_language_recognizer_1(): + text = "Il est le meilleur livre que je ai jamais lu" + result = cld2_detect_language(text) + eq_(result, 'fr') + +def test_language_recognizer_2(): + text = """A nuvem de Oort, também chamada de nuvem de Öpik-Oort, + é uma nuvem esférica de planetesimais voláteis que se acredita + localizar-se a cerca de 50 000 UA, ou quase um ano-luz, do Sol.""" + result = cld2_detect_language(text) + eq_(result, 'pt') diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py new file mode 100644 index 0000000..733191d --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/tokenizers.py @@ -0,0 +1,115 @@ +from html.entities import name2codepoint +from wordfreq import tokenize, TOKEN_RE, NON_PUNCT_RANGE +import re +import pycld2 + +CLD2_BAD_CHAR_RANGE = "".join([ + '[', + '\x00-\x08', + '\x0b', + '\x0e-\x1f', + '\x7f-\x9f', + '\ud800-\udfff', + '\ufdd0-\ufdef'] + + [chr(65534+65536*x+y) for x in range(17) for y in range(2)] + + [']']) +CLD2_BAD_CHARS_RE = re.compile(CLD2_BAD_CHAR_RANGE) + +TWITTER_HANDLE_RE = re.compile('@{0}+'.format(NON_PUNCT_RANGE)) +TCO_RE = re.compile('http(?:s)?://t.co/[a-zA-Z0-9]+'.format(NON_PUNCT_RANGE)) + + +def cld2_surface_tokenizer(text): + """ + Uses CLD2 to detect the language and wordfreq tokenizer to create tokens + """ + text = remove_handles_and_urls(text) + lang = cld2_detect_language(text) + tokens = tokenize(text, lang) + return lang, tokens + +def cld2_detect_language(text): + """ + Uses CLD2 to detect the language + """ + text = 
CLD2_BAD_CHARS_RE.sub('', text) + return pycld2.detect(text)[2][0][1] + +def remove_handles_and_urls(text): + text = fix_entities(text) + text = TWITTER_HANDLE_RE.sub('', text) + text = TCO_RE.sub('', text) + return text + +def last_tab(line): + """ + Read lines by keeping only the last tab-separated value. + """ + return line.split('\t')[-1].strip() + +def lowercase_text_filter(token): + """ + If this looks like a token that we want to count, return it, lowercased. + If not, filter it out by returning None. + """ + if TOKEN_RE.search(token): + return token.lower() + else: + return None + +def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab): + """ + Process a file by running it through the given tokenizer, sorting the + results by the language of each line, and inserting newlines + to mark the token boundaries. + """ + out_files = {} + with open(in_filename, encoding='utf-8') as in_file: + for line in in_file: + text = line_reader(line) + language, tokens = tokenizer(text) + if language != 'un': + tokenized = '\n'.join(tokens) + out_filename = '%s.%s.txt' % (out_prefix, language) + if out_filename in out_files: + out_file = out_files[out_filename] + else: + out_file = open(out_filename, 'w', encoding='utf-8') + out_files[out_filename] = out_file + print(tokenized, file=out_file) + for out_file in out_files.values(): + out_file.close() + +ENTITY_RE = re.compile(r'& ?(amp|quot|lt|gt) ?;') + +def fix_entities(text): + """ + Fix the few HTML entities that Twitter uses -- even if they've + already been tokenized. + """ + def replace_entity(match): + return chr(name2codepoint[match.group(1)]) + return ENTITY_RE.sub(replace_entity, text) + +def monolingual_tokenize_file(in_filename, out_filename, language, + tokenizer, line_reader=last_tab, + sample_proportion=1): + """ + Process a file by running it through the given tokenizer, only keeping + lines of the language we're asking for, and inserting newlines + to mark the token boundaries. 
+ + `line_reader` is applied to each line before it given to the tokenizer + + Only the first line out of every `sample_proportion` lines are run through + then tokenizer. + """ + with open(in_filename, encoding='utf-8', errors='replace') as in_file: + with open(out_filename, 'w', encoding='utf-8') as out_file: + for i, line in enumerate(in_file): + if i % sample_proportion == 0: + text = line_reader(line) + tokens, line_language = tokenizer(text) + if line_language == language: + for token in tokens: + print(token, file=out_file) diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py new file mode 100644 index 0000000..8f4099c --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/word_counts.py @@ -0,0 +1,120 @@ +from wordfreq import simple_tokenize +from collections import defaultdict +from operator import itemgetter +from ftfy import fix_text +import math +import csv +import msgpack +import gzip + + +def count_tokens(filename): + """ + Count tokens that appear in a file, running each line through our + simple tokenizer. + + Unicode errors in the input data will become token boundaries. + """ + counts = defaultdict(int) + with open(filename, encoding='utf-8', errors='replace') as infile: + for line in infile: + for token in simple_tokenize(line.strip()): + counts[token] += 1 + return counts + + +def read_freqs(filename, cutoff=0): + """ + Read words and their frequencies from a CSV file. + + Only words with a frequency greater than `cutoff` are returned. + + If `cutoff` is greater than 0, the csv file must be sorted by frequency + in descending order. + """ + raw_counts = defaultdict(float) + total = 0. 
+ with open(filename, encoding='utf-8', newline='') as infile: + reader = csv.reader(infile) + for key, strval in reader: + val = float(strval) + if val < cutoff: + break + for token in simple_tokenize(key): + token = fix_text(token) + total += val + # Use += so that, if we give the reader concatenated files with + # duplicates, it does the right thing + raw_counts[token] += val + + freqs = {key: raw_count / total + for (key, raw_count) in raw_counts.items()} + return freqs + + +def freqs_to_cBpack(in_filename, out_filename, cutoff=-600): + """ + Convert a csv file of words and their frequencies to a file in the + idiosyncratic 'cBpack' format. + + Only words with a frequency greater than `cutoff` centibels will be + written to the new file. + """ + freq_cutoff = 10 ** (cutoff / 100.) + freqs = read_freqs(in_filename, freq_cutoff) + cBpack = [] + for token, freq in freqs.items(): + cB = round(math.log10(freq) * 100) + if cB >= cutoff: + neg_cB = -cB + while neg_cB >= len(cBpack): + cBpack.append([]) + cBpack[neg_cB].append(token) + + for sublist in cBpack: + sublist.sort() + + # Write a "header" consisting of a dictionary at the start of the file + cBpack_data = [{'format': 'cB', 'version': 1}] + cBpack + + with gzip.open(out_filename, 'wb') as outfile: + msgpack.dump(cBpack_data, outfile) + + +def merge_freqs(freq_dicts): + """ + Merge multiple dictionaries of frequencies, representing each word with + the word's average frequency over all sources. + """ + vocab = set() + for freq_dict in freq_dicts: + vocab |= set(freq_dict) + + merged = defaultdict(float) + N = len(freq_dicts) + for term in vocab: + term_total = 0. + for freq_dict in freq_dicts: + term_total += freq_dict.get(term, 0.) + merged[term] = term_total / N + + return merged + + +def write_wordlist(freqs, filename, cutoff=1e-8): + """ + Write a dictionary of either raw counts or frequencies to a file of + comma-separated values. 
+ + Keep the CSV format simple by explicitly skipping words containing + commas or quotation marks. We don't believe we want those in our tokens + anyway. + """ + with open(filename, 'w', encoding='utf-8', newline='\n') as outfile: + writer = csv.writer(outfile) + items = sorted(freqs.items(), key=itemgetter(1), reverse=True) + for word, freq in items: + if freq < cutoff: + break + if not ('"' in word or ',' in word): + writer.writerow([word, str(freq)])