diff --git a/wordfreq_builder/.gitignore b/wordfreq_builder/.gitignore
index 46c58ff..a1da2e9 100644
--- a/wordfreq_builder/.gitignore
+++ b/wordfreq_builder/.gitignore
@@ -6,3 +6,7 @@ dist
 *.egg-info
 build
 _build
+build.ninja
+data
+.ninja_deps
+.ninja_log
diff --git a/wordfreq_builder/Makefile b/wordfreq_builder/Makefile
new file mode 100644
index 0000000..206c432
--- /dev/null
+++ b/wordfreq_builder/Makefile
@@ -0,0 +1,12 @@
+PYTHON = python
+
+all: build.ninja
+
+# make sure this package is in 'develop' mode and up to date
+wordfreq_builder.egg-info/PKG-INFO: setup.py
+	$(PYTHON) setup.py develop
+
+# build the Ninja file that will take over the build process
+build.ninja: rules.ninja wordfreq_builder/ninja.py wordfreq_builder/config.py wordfreq_builder.egg-info/PKG-INFO
+	$(PYTHON) -m wordfreq_builder.cli.build_deps rules.ninja > build.ninja
+
diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja
new file mode 100644
index 0000000..12c0360
--- /dev/null
+++ b/wordfreq_builder/rules.ninja
@@ -0,0 +1,63 @@
+# This defines the rules on how to build parts of the wordfreq lists, using the
+# Ninja build system:
+#
+# http://martine.github.io/ninja/manual.html
+#
+# Ninja is available in the 'ninja-build' Ubuntu package. It's like make with
+# better parallelism and the ability for build steps to produce multiple
+# outputs. The tradeoff is that its rule syntax isn't full of magic for
+# expanding wildcards and finding dependencies, so in general you have to
+# write the dependencies using a script.
+#
+# This file will become the header of the larger build.ninja file, which also
+# contains the programmatically-defined dependency graph.
+
+# Variables
+DATA = ./data
+
+# Splits the single file $in into $slices parts, whose names will be
+# $prefix plus a two-digit numeric suffix.
+rule split
+  command = mkdir -p $$(dirname $prefix) && split -d -n r/$slices $in $prefix
+
+# wiki2text is a tool I wrote using Nim 0.11, which extracts plain text from
+# Wikipedia dumps obtained from dumps.wikimedia.org. The code is at
+# https://github.com/rspeer/wiki2text.
+rule wiki2text
+  command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out
+
+rule wiki2tokens
+  command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text -t > $out
+
+rule tokenize_japanese
+  command = mkdir -p $$(dirname $out) && mecab < $in | cut -f 1 | grep -v "EOS" > $out
+
+rule tokenize_twitter
+  command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.pretokenize_twitter $in $prefix
+
+rule format_twitter
+  command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.format_twitter $in $out
+
+# To convert the Leeds corpus, look for space-separated lines that start with
+# an integer and a decimal. The integer is the rank, which we discard. The
+# decimal is the frequency, and the remaining text is the term. Use sed -n
+# with /p to output only lines where the match was successful.
+rule convert_leeds
+  command = mkdir -p $$(dirname $out) && sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in > $out
+
+# To convert the OpenSubtitles frequency data, simply replace spaces with
+# commas.
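+#
+# As an illustration (with made-up values), these two conversions turn lines
+# such as
+#
+#   Leeds:         "323 42817.18 the"  ->  "the,42817.18"
+#   OpenSubtitles: "the 42817"         ->  "the,42817"
+#
+# into the comma-separated "word,value" form that the later merge step reads.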
+rule convert_opensubtitles + command = mkdir -p $$(dirname $out) && tr ' ' ',' < $in > $out + +rule count + command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.count_tokens $in $out + +rule merge + command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.combine_lists -o $out $in + +rule freqs2dB + command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.freqs_to_dB $in $out + +rule cat + command = cat $in > $out diff --git a/wordfreq_builder/setup.py b/wordfreq_builder/setup.py index e57c58e..1466d35 100755 --- a/wordfreq_builder/setup.py +++ b/wordfreq_builder/setup.py @@ -9,4 +9,13 @@ setup( platforms=["any"], description="Turns raw data into word frequency lists", packages=['wordfreq_builder'], + install_requires=['msgpack-python'], + entry_points={ + 'console_scripts': [ + 'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main', + 'wordfreq-format-twitter = wordfreq_builder.cli.format_twitter:main', + 'wordfreq-tokenize-wikipedia = wordfreq_builder.cli.tokenize_wikipedia:main', + 'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main' + ] + } ) diff --git a/wordfreq_builder/wordfreq_builder/cli/__init__.py b/wordfreq_builder/wordfreq_builder/cli/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/wordfreq_builder/wordfreq_builder/cli/build_deps.py b/wordfreq_builder/wordfreq_builder/cli/build_deps.py new file mode 100644 index 0000000..3fd74ad --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/build_deps.py @@ -0,0 +1,15 @@ +from wordfreq_builder.ninja import make_ninja_deps +import argparse + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('in_filename', help='filename of rules file') + args = parser.parse_args() + + # Make the complete ninja file and write it to standard out + make_ninja_deps(args.in_filename) + + +if __name__ == '__main__': + main() diff --git a/wordfreq_builder/wordfreq_builder/cli/combine_lists.py b/wordfreq_builder/wordfreq_builder/cli/combine_lists.py new file mode 100644 index 0000000..61d4b1d --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/combine_lists.py @@ -0,0 +1,19 @@ +from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist +import argparse + + +def merge_lists(input_names, output_name): + freq_dicts = [] + for input_name in input_names: + freq_dicts.append(read_freqs(input_name, cutoff=2)) + merged = merge_freqs(freq_dicts) + write_wordlist(merged, output_name) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv') + parser.add_argument('inputs', help='names of input files to merge', nargs='+') + args = parser.parse_args() + merge_lists(args.inputs, args.output) + diff --git a/wordfreq_builder/wordfreq_builder/cli/count_tokens.py b/wordfreq_builder/wordfreq_builder/cli/count_tokens.py new file mode 100644 index 0000000..4aeba5b --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/count_tokens.py @@ -0,0 +1,16 @@ +from wordfreq_builder.word_counts import count_tokens, write_wordlist +import argparse + + +def handle_counts(filename_in, filename_out): + counts = count_tokens(filename_in) + write_wordlist(counts, filename_out) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('filename_in', help='name of input file containing tokens') + parser.add_argument('filename_out', help='name of output file') + args = parser.parse_args() + 
handle_counts(args.filename_in, args.filename_out) + diff --git a/wordfreq_builder/wordfreq_builder/cli/format_twitter.py b/wordfreq_builder/wordfreq_builder/cli/format_twitter.py new file mode 100644 index 0000000..224c5a1 --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/format_twitter.py @@ -0,0 +1,14 @@ +from wordfreq_builder.tokenizers import retokenize_file +import argparse + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('in_filename', help='filename of input file containing one tweet per line') + parser.add_argument('out_filename', help='filename of output file') + args = parser.parse_args() + retokenize_file(args.in_filename, args.out_filename) + + +if __name__ == '__main__': + main() diff --git a/wordfreq_builder/wordfreq_builder/cli/freqs_to_dB.py b/wordfreq_builder/wordfreq_builder/cli/freqs_to_dB.py new file mode 100644 index 0000000..81a4dde --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/freqs_to_dB.py @@ -0,0 +1,11 @@ +from wordfreq_builder.word_counts import freqs_to_dBpack +import argparse + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('filename_in', help='name of input file containing tokens') + parser.add_argument('filename_out', help='name of output file') + args = parser.parse_args() + freqs_to_dBpack(args.filename_in, args.filename_out) + diff --git a/wordfreq_builder/wordfreq_builder/cli/pretokenize_twitter.py b/wordfreq_builder/wordfreq_builder/cli/pretokenize_twitter.py new file mode 100644 index 0000000..c179988 --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/pretokenize_twitter.py @@ -0,0 +1,19 @@ +from wordfreq_builder.tokenizers import rosette_surface_tokenizer, pretokenize_file +import argparse + + +def pretokenize_twitter(in_filename, out_prefix): + pretokenize_file(in_filename, out_prefix, + tokenizer=rosette_surface_tokenizer) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('filename', help='filename of input file containing one tweet per line') + parser.add_argument('outprefix', help='prefix of output filenames') + args = parser.parse_args() + pretokenize_twitter(args.filename, args.outprefix) + + +if __name__ == '__main__': + main() diff --git a/wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py b/wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py new file mode 100644 index 0000000..fa97543 --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py @@ -0,0 +1,30 @@ +from wordfreq_builder.tokenizers import rosette_surface_tokenizer, monolingual_tokenize_file +import argparse + + +def tokenize_wikipedia(in_filename, out_filename, language, proportion): + monolingual_tokenize_file( + in_filename, out_filename, + language=language, + tokenizer=rosette_surface_tokenizer, + line_reader=strip_headings, + sample_proportion=proportion + ) + + +def strip_headings(text): + return text.strip().strip('=') + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('in_filename', help='filename of input file') + parser.add_argument('out_filename', help='filename of output file') + parser.add_argument('language', help='the language code of the text') + parser.add_argument('-p', '--proportion', help='process 1/n of the lines (default 100)', type=int, default=100) + args = parser.parse_args() + tokenize_wikipedia(args.in_filename, args.out_filename, args.language, args.proportion) + + +if __name__ == '__main__': + main() diff --git a/wordfreq_builder/wordfreq_builder/cmd_combine_lists.py 
b/wordfreq_builder/wordfreq_builder/cmd_combine_lists.py deleted file mode 100644 index 7b67375..0000000 --- a/wordfreq_builder/wordfreq_builder/cmd_combine_lists.py +++ /dev/null @@ -1,21 +0,0 @@ -from wordfreq_builder.word_counts import read_counts, write_counts, merge_counts -from pathlib import Path -import argparse - - -def merge_lists(input_names, output_name, balance=False): - count_dicts = [] - for input_name in input_names: - count_dicts.append(read_counts(Path(input_name))) - merged = merge_counts(count_dicts, balance=balance) - write_counts(merged, Path(output_name)) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv') - parser.add_argument('-b', '--balance', action='store_true', help='Automatically balance unequally-sampled word frequencies') - parser.add_argument('inputs', help='names of input files to merge', nargs='+') - args = parser.parse_args() - merge_lists(args.inputs, args.output, balance=args.balance) - diff --git a/wordfreq_builder/wordfreq_builder/cmd_count_twitter.py b/wordfreq_builder/wordfreq_builder/cmd_count_twitter.py deleted file mode 100644 index 1086f1b..0000000 --- a/wordfreq_builder/wordfreq_builder/cmd_count_twitter.py +++ /dev/null @@ -1,27 +0,0 @@ -from wordfreq_builder.word_counts import WordCountBuilder -from wordfreq_builder.tokenizers import rosette_tokenizer, rosette_surface_tokenizer -from pathlib import Path -import argparse - - -def count_twitter(pathname, offset=0, nsplit=1, surface=True): - path = Path(pathname) - if surface == True: - tokenizer = rosette_surface_tokenizer - else: - tokenizer = rosette_tokenizer - builder = WordCountBuilder(tokenizer=tokenizer) - save_filename = 'twitter-counts-%d.csv' % offset - save_pathname = path.parent / save_filename - builder.count_twitter(path, offset, nsplit) - builder.save_wordlist(save_pathname) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('filename', help='filename of input file containing one tweet per line') - parser.add_argument('offset', type=int) - parser.add_argument('nsplit', type=int) - args = parser.parse_args() - count_twitter(args.filename, args.offset, args.nsplit, surface=True) - diff --git a/wordfreq_builder/wordfreq_builder/cmd_count_wikipedia.py b/wordfreq_builder/wordfreq_builder/cmd_count_wikipedia.py deleted file mode 100644 index c362f03..0000000 --- a/wordfreq_builder/wordfreq_builder/cmd_count_wikipedia.py +++ /dev/null @@ -1,23 +0,0 @@ -from wordfreq_builder.word_counts import WordCountBuilder -from wordfreq_builder.tokenizers import rosette_tokenizer, rosette_surface_tokenizer -from pathlib import Path -import argparse - - -def count_wikipedia(filename, surface=True): - path = Path(filename) - if surface == True: - tokenizer = rosette_surface_tokenizer - else: - tokenizer = rosette_tokenizer - builder = WordCountBuilder(tokenizer=tokenizer, unique_docs=False) - builder.count_wikipedia(path) - builder.save_wordlist(path.parent / 'counts.csv') - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('filename', help='flat text file containing extracted Wikipedia text') - args = parser.parse_args() - count_wikipedia(args.filename, surface=True) - diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py new file mode 100644 index 0000000..b6af74d --- /dev/null +++ b/wordfreq_builder/wordfreq_builder/config.py @@ -0,0 +1,69 @@ 
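+# Build configuration: which languages each data source covers, where the
+# generated wordlists are written, and how many sources a language needs
+# before a combined list is built for it.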
+import os
+
+CONFIG = {
+    'version': '0.9.0',
+    # data_dir is a relative or absolute path to where the wordlist data
+    # is stored
+    'data_dir': 'data',
+    'sources': {
+        # A list of language codes (possibly un-standardized) that we'll
+        # look up in filenames for these various data sources.
+        'twitter': [
+            'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
+            'pt', 'ru',
+            # can be added later: 'th', 'tr'
+        ],
+        'wikipedia': [
+            'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
+            'pt', 'ru'
+        ],
+        'opensubtitles': [
+            # All languages where the most common word in OpenSubtitles
+            # appears at least 5000 times
+            'ar', 'bg', 'bs', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et',
+            'fa', 'fi', 'fr', 'he', 'hr', 'hu', 'id', 'is', 'it', 'lt', 'lv',
+            'mk', 'ms', 'nb', 'nl', 'pl', 'pt', 'ro', 'sk', 'sl', 'sq', 'sr',
+            'sv', 'tr', 'uk', 'zh'
+        ],
+        'leeds': [
+            'ar', 'de', 'el', 'en', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh'
+        ]
+    },
+    'wordlist_paths': {
+        'twitter': 'generated/twitter/tweets-2014.{lang}.{ext}',
+        'wikipedia': 'generated/wikipedia/wikipedia_{lang}.{ext}',
+        'opensubtitles': 'generated/opensubtitles/opensubtitles_{lang}.{ext}',
+        'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}',
+        'combined': 'generated/combined/combined_{lang}.{ext}'
+    },
+    'min_sources': 2
+}
+
+
+def data_filename(filename):
+    return os.path.join(CONFIG['data_dir'], filename)
+
+
+def wordlist_filename(source, language, extension='txt'):
+    path = CONFIG['wordlist_paths'][source].format(
+        lang=language, ext=extension
+    )
+    return data_filename(path)
+
+
+def source_names(language):
+    """
+    Get the names of data sources that supply data for the given language.
+    """
+    return sorted([key for key in CONFIG['sources']
+                   if language in CONFIG['sources'][key]])
+
+
+def all_languages():
+    languages = set()
+    for langlist in CONFIG['sources'].values():
+        languages |= set(langlist)
+    return [lang for lang in sorted(languages)
+            if len(source_names(lang)) >= CONFIG['min_sources']]
+
diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py
new file mode 100644
index 0000000..1059ba3
--- /dev/null
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -0,0 +1,199 @@
+from wordfreq_builder.config import (
+    CONFIG, data_filename, wordlist_filename, all_languages, source_names
+)
+import sys
+import pathlib
+
+HEADER = """# This file is automatically generated. Do not edit it.
+# You can regenerate it using the 'wordfreq-build-deps' command.
+"""
+TMPDIR = data_filename('tmp')
+
+
+# Set this to True to rebuild the Twitter tokenization (which takes days)
+PRETOKENIZE_TWITTER = False
+
+
+def add_dep(lines, rule, input, output, extra=None, params=None):
+    if isinstance(output, list):
+        output = ' '.join(output)
+    if isinstance(input, list):
+        input = ' '.join(input)
+    if extra:
+        if isinstance(extra, list):
+            extra = ' '.join(extra)
+        extrastr = ' | ' + extra
+    else:
+        extrastr = ''
+    build_rule = "build {output}: {rule} {input}{extra}".format(
+        output=output, rule=rule, input=input, extra=extrastr
+    )
+    lines.append(build_rule)
+    if params:
+        for key, val in params.items():
+            lines.append("  {key} = {val}".format(key=key, val=val))
+    lines.append("")
+
+
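+# For example (with made-up file names), a call such as
+#
+#     add_dep(lines, 'count', 'data/foo.tokens.txt', 'data/foo.counts.txt',
+#             params={'lang': 'en'})
+#
+# appends this Ninja build statement to `lines`:
+#
+#     build data/foo.counts.txt: count data/foo.tokens.txt
+#       lang = en
+#
+# followed by a blank line; any `extra` files become implicit dependencies,
+# listed after a '|'.
+
+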
+ """ + print(HEADER, file=out) + # Copy in the rules section + with open(rules_filename, encoding='utf-8') as rulesfile: + print(rulesfile.read(), file=out) + + lines = [] + if PRETOKENIZE_TWITTER: + lines.extend( + twitter_preprocess_deps( + data_filename('raw-input/twitter/all-2014.txt'), + slice_prefix=data_filename('slices/twitter/tweets-2014'), + combined_prefix=data_filename('intermediate/twitter/tweets-2014'), + slices=40, + languages=CONFIG['sources']['twitter'] + ) + ) + lines.extend( + twitter_deps( + data_filename('intermediate/twitter/tweets-2014'), + languages=CONFIG['sources']['twitter'] + ) + ) + lines.extend( + wikipedia_deps( + data_filename('raw-input/wikipedia'), + CONFIG['sources']['wikipedia'] + ) + ) + lines.extend( + leeds_deps( + data_filename('source-lists/leeds'), + CONFIG['sources']['leeds'] + ) + ) + lines.extend( + opensubtitles_deps( + data_filename('source-lists/opensubtitles'), + CONFIG['sources']['opensubtitles'] + ) + ) + lines.extend(combine_lists(all_languages())) + + print('\n'.join(lines), file=out) + + +def wikipedia_deps(dirname_in, languages): + lines = [] + path_in = pathlib.Path(dirname_in) + for language in languages: + # Find the most recent file for this language + input_file = max(path_in.glob( + '{}wiki*.bz2'.format(language) + )) + raw_file = wordlist_filename('wikipedia', language, 'txt') + token_file = wordlist_filename('wikipedia', language, 'tokens.txt') + count_file = wordlist_filename('wikipedia', language, 'counts.txt') + + add_dep(lines, 'wiki2text', input_file, raw_file) + add_dep(lines, 'wiki2tokens', input_file, token_file) + add_dep(lines, 'count', token_file, count_file) + return lines + + +def twitter_preprocess_deps(input_filename, slice_prefix, + combined_prefix, slices, languages): + lines = [] + + slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num) + for num in range(slices)] + # split the input into slices + add_dep(lines, + 'split', input_filename, slice_files, + {'prefix': '{}.part'.format(slice_prefix), + 'slices': slices}) + + for slicenum in range(slices): + slice_file = slice_files[slicenum] + language_outputs = [ + '{prefix}.{lang}.txt'.format(prefix=slice_file, lang=language) + for language in languages + ] + add_dep(lines, 'tokenize_twitter', slice_file, language_outputs, + {'prefix': slice_file}) + + for language in languages: + combined_output = '{prefix}.{lang}.txt'.format(prefix=combined_prefix, lang=language) + + language_inputs = [ + '{prefix}.{lang}.txt'.format(prefix=slice_files[slicenum], lang=language) + for slicenum in range(slices) + ] + add_dep(lines, 'cat', language_inputs, combined_output) + return lines + + +def twitter_deps(prefix_in, languages): + lines = [] + for language in languages: + input_file = '{prefix}.{lang}.txt'.format(prefix=prefix_in, lang=language) + token_file = wordlist_filename('twitter', language, 'tokens.txt') + add_dep(lines, + 'format_twitter', input_file, token_file, + extra='wordfreq_builder/tokenizers.py') + + count_file = wordlist_filename('twitter', language, 'counts.txt') + add_dep(lines, 'count', token_file, count_file) + + return lines + + +def leeds_deps(dirname_in, languages): + lines = [] + for language in languages: + input_file = '{prefix}/internet-{lang}-forms.num'.format( + prefix=dirname_in, lang=language + ) + reformatted_file = wordlist_filename('leeds', language, 'counts.txt') + add_dep(lines, 'convert_leeds', input_file, reformatted_file) + + return lines + + +def opensubtitles_deps(dirname_in, languages): + lines = [] + for 
language in languages: + input_file = '{prefix}/{lang}.txt'.format( + prefix=dirname_in, lang=language + ) + reformatted_file = wordlist_filename('opensubtitles', language, 'counts.txt') + add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file) + + return lines + + +def combine_lists(languages): + lines = [] + for language in languages: + sources = source_names(language) + input_files = [ + wordlist_filename(source, language, 'counts.txt') + for source in sources + ] + output_file = wordlist_filename('combined', language) + add_dep(lines, 'merge', input_files, output_file, + extra='wordfreq_builder/word_counts.py') + + output_dBpack = wordlist_filename('combined', language, 'msgpack.gz') + add_dep(lines, 'freqs2dB', output_file, output_dBpack, + extra='wordfreq_builder/word_counts.py') + return lines + + +def main(): + make_ninja_deps('rules.ninja') + + +if __name__ == '__main__': + main() diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py index e20c96f..e4ea914 100644 --- a/wordfreq_builder/wordfreq_builder/tokenizers.py +++ b/wordfreq_builder/wordfreq_builder/tokenizers.py @@ -1,33 +1,153 @@ from lumi_science.text_readers.rosette_readers import RosetteReader +from html.entities import name2codepoint import re ROSETTE = RosetteReader() -def rosette_tokenizer(text): - analysis, lang = ROSETTE.rosette.analyze(text) - # I'm aware this doesn't do the right things with multi-word stems. - # Wordfreq doesn't either. And wordfreq isn't designed to look up - # multiple words anyway. - tokens = [] - for (stem, pos, span) in analysis: - for subtoken in stem.split(' '): - tokens.append(subtoken + '|' + lang) - return tokens +# Some of Rosette's language codes are incorrect. For example, 'zh_sc' should +# mean "Chinese as used in Seychelles", which is kind of nonsense. What Rosette +# really means is "Simplified Chinese", whose code is 'zh-Hans'. +ROSETTE_LANG_MAP = { + 'zh_sc': 'zh-Hans', + 'zh_tc': 'zh-Hant', + 'en_uc': 'en', +} + + +NON_PUNCT_RE = re.compile('[0-9A-Za-z\xc0-\u1fff\u2070-\u2fff\u301f-\ufeff0-9A-Za-z\uff66-\U0002ffff]') + +EMOTICON_RANGE = '\u2600-\u26ff\U0001F000-\U0001F7FF' +RETOKENIZE_RE = re.compile('[{0}#@/]|[^{0}#@/ ]+'.format(EMOTICON_RANGE)) + + +def last_tab(line): + """ + Read lines by keeping only the last tab-separated value. + """ + return line.split('\t')[-1].strip() + + +def lowercase_text_filter(token): + if NON_PUNCT_RE.search(token): + return token.lower() + else: + return None + + +def is_url(token): + return token.startswith('http:') or token.startswith('https:') + + +def pretokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab): + """ + Process a file by running it through the given tokenizer, sorting the + results by the language of each line, and inserting spaces into lines + to mark the token boundaries. This computes the 'hard part' of + tokenization and allows the results to be saved, so that we can change + the finer details of the output without re-running everything. 
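+
+    For example, with an out_prefix of 'tweets-2014.part00', the tokens from
+    English tweets would be written to 'tweets-2014.part00.en.txt', with one
+    such file per language that the tokenizer detects.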
+ """ + out_files = {} + for line in open(in_filename, encoding='utf-8'): + text = line_reader(line) + tokens, language = tokenizer(text) + tokenized = ' '.join(tokens) + if language is not None: + out_filename = '%s.%s.txt' % (out_prefix, language) + if out_filename in out_files: + out_file = out_files[out_filename] + else: + out_file = open(out_filename, 'w', encoding='utf-8') + out_files[out_filename] = out_file + print(tokenized, file=out_file) + for out_file in out_files.values(): + out_file.close() + + +ENTITY_RE = re.compile(r'& ?(amp|quot|lt|gt) ?;') + + +def fix_entities(text): + """ + Fix the few HTML entities that Twitter uses -- even if they've + already been tokenized. + """ + def replace_entity(match): + return chr(name2codepoint[match.group(1)]) + return ENTITY_RE.sub(replace_entity, text) + + +def retokenize(text): + text = fix_entities(text) + tokens = RETOKENIZE_RE.findall(text) + skip_next = False + for token in tokens: + if token == '/' or token == '@': + # Avoid idiosyncratic tokens such as URLs and + # usernames + skip_next = True + elif skip_next: + skip_next = False + else: + if not is_url(token): + filtered = lowercase_text_filter(token) + if filtered: + yield filtered + + +def retokenize_file(in_filename, out_filename): + """ + Process a file that has been tokenized (by inserting spaces) in a + language-specific way by Rosette. + """ + with open(in_filename, encoding='utf-8') as in_file: + with open(out_filename, 'w', encoding='utf-8') as out_file: + for line in in_file: + skip_next = False + for token in retokenize(line.strip()): + if skip_next: + skip_next = False + elif token == '/' or token == '@': + # Avoid idiosyncratic tokens such as URLs and + # usernames + skip_next = True + elif lowercase_text_filter(token): + print(token, file=out_file) + + +def monolingual_tokenize_file(in_filename, out_filename, language, + tokenizer, line_reader=last_tab, + token_filter=lowercase_text_filter, + sample_proportion=100): + with open(in_filename, encoding='utf-8', errors='replace') as in_file: + with open(out_filename, 'w', encoding='utf-8') as out_file: + for i, line in enumerate(in_file): + if i % sample_proportion == 0: + text = line_reader(line) + tokens, line_language = tokenizer(text) + if line_language == language: + filtered = [token_filter(t) for t in tokens] + filtered = [t for t in filtered if t is not None] + for token in filtered: + print(token, file=out_file) def rosette_surface_tokenizer(text): - analysis, lang = ROSETTE.rosette.analyze(text) + try: + analysis, lang = ROSETTE.rosette.analyze(text) + except (RuntimeError, UnicodeError): + # Our Rosette interface throws errors given arbitrary data. :( + return text, None + language = ROSETTE_LANG_MAP.get(lang, lang) tokens = [] for (stem, pos, span) in analysis: surface_text = text[span[0]:span[1]] - for subtoken in surface_text.split(' '): - tokens.append(subtoken + '|' + lang) - return tokens + tokens.append(surface_text) + return tokens, language -def treebank_surface_tokenizer(text): +def treebank_surface_tokenizer(text, language='en'): """ This is a simplified version of the Treebank tokenizer in NLTK. @@ -45,6 +165,10 @@ def treebank_surface_tokenizer(text): as a result -- for example, it splits "wanna" into "wan" and "na", which are supposed to be considered unusual surface forms of "want" and "to". We just leave it as the word "wanna". + + The language will just be returned, as this function isn't doing any + language detection. 
It defaults to 'en', as English is the language that + Treebank tokenization is designed for. """ #starting quotes text = re.sub(r'^\"', r'``', text) @@ -80,4 +204,4 @@ def treebank_surface_tokenizer(text): text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ", text) - return text.split() + return text.split(), language diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py index a379e8e..be49288 100644 --- a/wordfreq_builder/wordfreq_builder/word_counts.py +++ b/wordfreq_builder/wordfreq_builder/word_counts.py @@ -1,116 +1,85 @@ -from wordfreq_builder.tokenizers import treebank_surface_tokenizer +from wordfreq_builder.tokenizers import retokenize from collections import defaultdict from operator import itemgetter -from pathlib import Path -from unicodedata import normalize +from ftfy import fix_text +import math import csv -import sys +import msgpack +import gzip -def read_counts(path): +def count_tokens(filename): counts = defaultdict(int) - with path.open(encoding='utf-8', newline='') as infile: - reader = csv.reader(infile) - for key, strval in reader: - val = float(strval) - # Use += so that, if we give the reader concatenated files with - # duplicates, it does the right thing - counts[key] += val + with open(filename, encoding='utf-8') as infile: + for line in infile: + for token in retokenize(line.strip()): + counts[token] += 1 return counts -def count_languages(counts): - langcounts = defaultdict(int) - for key, strval in counts.items(): - val = int(strval) - text, lang = key.rsplit('|', 1) - langcounts[lang] += val - return langcounts +def read_freqs(filename, cutoff=0): + raw_counts = defaultdict(float) + total = 0. + with open(filename, encoding='utf-8', newline='') as infile: + reader = csv.reader(infile) + for key, strval in reader: + val = float(strval) + if val < cutoff: + break + for token in retokenize(key): + token = fix_text(token) + total += val + # Use += so that, if we give the reader concatenated files with + # duplicates, it does the right thing + raw_counts[token] += val + + freqs = {key: raw_count / total + for (key, raw_count) in raw_counts.items()} + return freqs -def merge_counts(count_dicts, balance=False): +def freqs_to_dBpack(in_filename, out_filename, cutoff=-60): + freq_cutoff = 10 ** (cutoff / 10.) + freqs = read_freqs(in_filename, freq_cutoff) + dBpack = [] + for token, freq in freqs.items(): + dB = round(math.log10(freq) * 10) + if dB >= cutoff: + neg_dB = -dB + while neg_dB >= len(dBpack): + dBpack.append([]) + dBpack[neg_dB].append(token) + + with gzip.open(out_filename, 'wb') as outfile: + msgpack.dump(dBpack, outfile) + + +def merge_freqs(freq_dicts): + vocab = set() + for freq_dict in freq_dicts: + vocab |= set(freq_dict) + merged = defaultdict(float) - maxweight = None - for counts in count_dicts: - if balance: - if maxweight is None: - maxweight = max(counts.values()) - weight = maxweight / max(counts.values()) / len(count_dicts) - else: - weight = 1. - for key, val in counts.items(): - merged[key] += val * weight + N = len(freq_dicts) + for term in vocab: + term_total = 0. + for freq_dict in freq_dicts: + term_total += freq_dict.get(term, 0.) 
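+        # Divide by the number of source dictionaries, not the number that
+        # contain the term, so a word found in only some sources is
+        # down-weighted accordingly.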
+ merged[term] = term_total / N + return merged -def write_counts(counts, path, cutoff=2): - print("Writing to %s" % path) - with path.open('w', encoding='utf-8', newline='') as outfile: +def write_wordlist(freqs, filename, cutoff=1e-8): + """ + Write a dictionary of either raw counts or frequencies to a file of + comma-separated values. + """ + with open(filename, 'w', encoding='utf-8', newline='\n') as outfile: writer = csv.writer(outfile) - items = sorted(counts.items(), key=itemgetter(1), reverse=True) - for word, count in items: - if count < cutoff: - # Don't write all the terms that appeared too infrequently + items = sorted(freqs.items(), key=itemgetter(1), reverse=True) + for word, freq in items: + if freq < cutoff: break if not ('"' in word or ',' in word): - writer.writerow([word, str(int(count))]) - - -class WordCountBuilder: - def __init__(self, unique_docs=True, tokenizer=None): - self.counts = defaultdict(int) - self.unique_docs = unique_docs - if tokenizer is None: - self.tokenizer = treebank_surface_tokenizer - else: - self.tokenizer = tokenizer - - def add_text(self, text): - text = normalize('NFKC', text).lower() - try: - tokens = self.tokenizer(text) - # print(' '.join(tokens)) - except Exception as e: - print("Couldn't tokenize due to %r: %s" % (e, text), file=sys.stderr) - return - if self.unique_docs: - tokens = set(tokens) - for tok in tokens: - self.counts[tok] += 1 - - def count_wikipedia(self, path): - """ - Read a directory of extracted Wikipedia articles. The articles can be - grouped together into files, in which case they should be separated by - lines beginning with ##. - """ - with path.open(encoding='utf-8') as file: - article_lines = [] - for line in file: - line = line.strip() - if line.startswith('= ') and line.endswith(' ='): - # Fake level-1 headings indicate boundaries between articles - print(line) - self.try_wiki_article(' '.join(article_lines)) - article_lines.clear() - else: - # Skip other headings, so that "external" doesn't look - # ridiculously common, for example - if not (line.startswith('==') and line.endswith('==')): - article_lines.append(line) - self.try_wiki_article(' '.join(article_lines)) - - def try_wiki_article(self, text): - if len(text) > 1000: - self.add_text(text) - - def count_twitter(self, path, offset, nsplit): - with path.open(encoding='utf-8') as file: - for i, line in enumerate(file): - if i % nsplit == offset: - line = line.strip() - text = line.split('\t')[-1] - self.add_text(text) - - def save_wordlist(self, path): - write_counts(self.counts, path) + writer.writerow([word, str(freq)])
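
For reference, the "dBpack" file written by freqs_to_dBpack is a gzipped msgpack list of lists, where the list at index i holds the words whose frequency rounds to -i dB. Below is a minimal decoding sketch, assuming msgpack-python and gzip as used above; the read_dBpack name is hypothetical and not part of this change.

```python
import gzip
import msgpack


def read_dBpack(filename):
    """
    Sketch: recover approximate frequencies from a gzipped msgpack 'dBpack'
    file, whose list at index i holds the words stored at -i dB.
    """
    with gzip.open(filename, 'rb') as infile:
        dBpack = msgpack.load(infile, encoding='utf-8')
    freqs = {}
    for neg_dB, words in enumerate(dBpack):
        for word in words:
            # index i corresponds to a frequency of roughly 10 ** (-i / 10)
            freqs[word] = 10 ** (-neg_dB / 10)
    return freqs
```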