diff --git a/README.md b/README.md
index c16b7d0..28055ca 100644
--- a/README.md
+++ b/README.md
@@ -23,8 +23,8 @@ install them on Ubuntu:
 
 ## Unicode data
 
-The tokenizers used to split non-Japanese phrases use regexes built using the
-`unicodedata` module from Python 3.4, which uses Unicode version 6.3.0. To
+The tokenizers that split non-Japanese phrases use regexes built using the
+`unicodedata` module from Python 3.4, which supports Unicode version 6.3.0. To
 update these regexes, run `scripts/gen_regex.py`.
 
 ## License
@@ -58,4 +58,3 @@ Some additional data was collected by a custom application that watches the
 streaming Twitter API, in accordance with Twitter's Developer Agreement &
 Policy. This software only gives statistics about words that are very commonly
 used on Twitter; it does not display or republish any Twitter content.
-
diff --git a/wordfreq_builder/wordfreq_builder/ninja2dot.py b/scripts/ninja2dot.py
similarity index 91%
rename from wordfreq_builder/wordfreq_builder/ninja2dot.py
rename to scripts/ninja2dot.py
index 431ac09..7a2f403 100644
--- a/wordfreq_builder/wordfreq_builder/ninja2dot.py
+++ b/scripts/ninja2dot.py
@@ -1,3 +1,5 @@
+"""This file generates a graph of the dependencies for the ninja build."""
+
 import sys
 
 
@@ -26,4 +28,3 @@ def ninja_to_dot():
 
 if __name__ == '__main__':
     ninja_to_dot()
-
diff --git a/tests/test.py b/tests/test.py
index 59d40f8..d38fd14 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -94,7 +94,7 @@ def test_failed_cB_conversion():
 
 def test_tokenization():
     # We preserve apostrophes within words, so "can't" is a single word in the
-    # data, while the fake word "plan't" can't be found.
+    # data.
     eq_(tokenize("can't", 'en'), ["can't"])
 
     eq_(tokenize('😂test', 'en'), ['😂', 'test'])
@@ -135,12 +135,20 @@ def test_not_enough_ascii():
         random_ascii_words(lang='zh')
 
 
 def test_ar():
+
+    # Remove tatweels
     eq_(
         tokenize('متــــــــعب', 'ar'),
         ['متعب']
     )
+
+    # Remove combining marks
     eq_(
         tokenize('حَرَكَات', 'ar'),
         ['حركات']
     )
+
+    eq_(
+        tokenize('إﻻ', 'ar'),
+        ['إلا']
+    )
diff --git a/wordfreq/__init__.py b/wordfreq/__init__.py
index a5ac0ec..cb085f7 100644
--- a/wordfreq/__init__.py
+++ b/wordfreq/__init__.py
@@ -8,6 +8,8 @@ import itertools
 import pathlib
 import random
 import logging
+import unicodedata
+
 
 logger = logging.getLogger(__name__)
 
@@ -66,11 +68,21 @@ def tokenize(text, lang):
         return mecab_tokenize(text)
 
     if lang == 'ar':
-        text = COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
+        text = standardize_arabic(text)
 
     return simple_tokenize(text)
 
 
+def standardize_arabic(text):
+    """
+    Standardizes Arabic text by removing combining marks and tatweels.
+    """
+    return unicodedata.normalize(
+        'NFKC',
+        COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
+    )
+
+
 def read_cBpack(filename):
     """
     Read a file from an idiosyncratic format that we use for storing
@@ -257,6 +269,9 @@ def word_frequency(word, lang, wordlist='combined', minimum=0.):
     If a word decomposes into multiple tokens, we'll return a smoothed estimate
     of the word frequency that is no greater than the frequency of any of its
     individual tokens.
+
+    Note that the current tokenizer does not support multi-word Chinese
+    phrases.
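Aside on the Arabic changes above: the sketch below re-creates `standardize_arabic` outside the package, to show why the new `test_ar` cases pass. It is illustrative only; the character class used here is a simplified stand-in for wordfreq's `COMBINING_MARK_RE`, which covers a broader range of marks.

```python
# Hypothetical, self-contained re-creation of standardize_arabic() for
# illustration; the regex below is a simplified stand-in for wordfreq's
# COMBINING_MARK_RE.
import re
import unicodedata

COMBINING_MARK_RE = re.compile('[\u064b-\u065f\u0670]')  # harakat and dagger alif
TATWEEL = '\u0640'  # the Arabic elongation character 'ـ'

def standardize_arabic(text):
    # Strip tatweels, drop combining marks, then apply NFKC so presentation
    # forms such as 'ﻻ' decompose into the two letters 'لا'.
    return unicodedata.normalize(
        'NFKC',
        COMBINING_MARK_RE.sub('', text.replace(TATWEEL, ''))
    )

print(standardize_arabic('متــــــــعب'))  # -> 'متعب'   (tatweels removed)
print(standardize_arabic('حَرَكَات'))        # -> 'حركات'  (combining marks removed)
print(standardize_arabic('إﻻ'))            # -> 'إلا'    (ligature normalized)
```

The NFKC step is what the third new test case exercises: the lam-alef ligature is folded into its two component letters.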
""" args = (word, lang, wordlist, minimum) try: diff --git a/wordfreq/data/combined_ar.msgpack.gz b/wordfreq/data/combined_ar.msgpack.gz index db40525..1b059e2 100644 Binary files a/wordfreq/data/combined_ar.msgpack.gz and b/wordfreq/data/combined_ar.msgpack.gz differ diff --git a/wordfreq/data/combined_de.msgpack.gz b/wordfreq/data/combined_de.msgpack.gz index 5e5d05a..0e89465 100644 Binary files a/wordfreq/data/combined_de.msgpack.gz and b/wordfreq/data/combined_de.msgpack.gz differ diff --git a/wordfreq/data/combined_el.msgpack.gz b/wordfreq/data/combined_el.msgpack.gz index 4f1987b..167acb0 100644 Binary files a/wordfreq/data/combined_el.msgpack.gz and b/wordfreq/data/combined_el.msgpack.gz differ diff --git a/wordfreq/data/combined_en.msgpack.gz b/wordfreq/data/combined_en.msgpack.gz index 38c13a4..fa68552 100644 Binary files a/wordfreq/data/combined_en.msgpack.gz and b/wordfreq/data/combined_en.msgpack.gz differ diff --git a/wordfreq/data/combined_es.msgpack.gz b/wordfreq/data/combined_es.msgpack.gz index 60993aa..dfc7f80 100644 Binary files a/wordfreq/data/combined_es.msgpack.gz and b/wordfreq/data/combined_es.msgpack.gz differ diff --git a/wordfreq/data/combined_fr.msgpack.gz b/wordfreq/data/combined_fr.msgpack.gz index 370c499..fc63301 100644 Binary files a/wordfreq/data/combined_fr.msgpack.gz and b/wordfreq/data/combined_fr.msgpack.gz differ diff --git a/wordfreq/data/combined_id.msgpack.gz b/wordfreq/data/combined_id.msgpack.gz index 610c9b6..3989727 100644 Binary files a/wordfreq/data/combined_id.msgpack.gz and b/wordfreq/data/combined_id.msgpack.gz differ diff --git a/wordfreq/data/combined_it.msgpack.gz b/wordfreq/data/combined_it.msgpack.gz index c3c2c21..5830417 100644 Binary files a/wordfreq/data/combined_it.msgpack.gz and b/wordfreq/data/combined_it.msgpack.gz differ diff --git a/wordfreq/data/combined_ja.msgpack.gz b/wordfreq/data/combined_ja.msgpack.gz index 3d5797a..fbaa41f 100644 Binary files a/wordfreq/data/combined_ja.msgpack.gz and b/wordfreq/data/combined_ja.msgpack.gz differ diff --git a/wordfreq/data/combined_ko.msgpack.gz b/wordfreq/data/combined_ko.msgpack.gz index 7d44281..2c2db53 100644 Binary files a/wordfreq/data/combined_ko.msgpack.gz and b/wordfreq/data/combined_ko.msgpack.gz differ diff --git a/wordfreq/data/combined_ms.msgpack.gz b/wordfreq/data/combined_ms.msgpack.gz index e7d628a..93e251c 100644 Binary files a/wordfreq/data/combined_ms.msgpack.gz and b/wordfreq/data/combined_ms.msgpack.gz differ diff --git a/wordfreq/data/combined_nl.msgpack.gz b/wordfreq/data/combined_nl.msgpack.gz index 12ef7a2..db62dde 100644 Binary files a/wordfreq/data/combined_nl.msgpack.gz and b/wordfreq/data/combined_nl.msgpack.gz differ diff --git a/wordfreq/data/combined_pt.msgpack.gz b/wordfreq/data/combined_pt.msgpack.gz index 3c1db27..a198a81 100644 Binary files a/wordfreq/data/combined_pt.msgpack.gz and b/wordfreq/data/combined_pt.msgpack.gz differ diff --git a/wordfreq/data/combined_ru.msgpack.gz b/wordfreq/data/combined_ru.msgpack.gz index ff925e2..3a0f9c4 100644 Binary files a/wordfreq/data/combined_ru.msgpack.gz and b/wordfreq/data/combined_ru.msgpack.gz differ diff --git a/wordfreq/data/combined_zh.msgpack.gz b/wordfreq/data/combined_zh.msgpack.gz index 6d99f38..0f89563 100644 Binary files a/wordfreq/data/combined_zh.msgpack.gz and b/wordfreq/data/combined_zh.msgpack.gz differ diff --git a/wordfreq/data/twitter_ar.msgpack.gz b/wordfreq/data/twitter_ar.msgpack.gz index 20939f9..eb9291a 100644 Binary files a/wordfreq/data/twitter_ar.msgpack.gz and 
b/wordfreq/data/twitter_ar.msgpack.gz differ
diff --git a/wordfreq/data/twitter_de.msgpack.gz b/wordfreq/data/twitter_de.msgpack.gz
index 1329391..b943dee 100644
Binary files a/wordfreq/data/twitter_de.msgpack.gz and b/wordfreq/data/twitter_de.msgpack.gz differ
diff --git a/wordfreq/data/twitter_en.msgpack.gz b/wordfreq/data/twitter_en.msgpack.gz
index 8017c56..956487e 100644
Binary files a/wordfreq/data/twitter_en.msgpack.gz and b/wordfreq/data/twitter_en.msgpack.gz differ
diff --git a/wordfreq/data/twitter_es.msgpack.gz b/wordfreq/data/twitter_es.msgpack.gz
index 936ec75..56b253d 100644
Binary files a/wordfreq/data/twitter_es.msgpack.gz and b/wordfreq/data/twitter_es.msgpack.gz differ
diff --git a/wordfreq/data/twitter_fr.msgpack.gz b/wordfreq/data/twitter_fr.msgpack.gz
index e41589a..49f8ef8 100644
Binary files a/wordfreq/data/twitter_fr.msgpack.gz and b/wordfreq/data/twitter_fr.msgpack.gz differ
diff --git a/wordfreq/data/twitter_id.msgpack.gz b/wordfreq/data/twitter_id.msgpack.gz
index b2bc598..5ad7439 100644
Binary files a/wordfreq/data/twitter_id.msgpack.gz and b/wordfreq/data/twitter_id.msgpack.gz differ
diff --git a/wordfreq/data/twitter_it.msgpack.gz b/wordfreq/data/twitter_it.msgpack.gz
index 5301ed7..d5c1175 100644
Binary files a/wordfreq/data/twitter_it.msgpack.gz and b/wordfreq/data/twitter_it.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ja.msgpack.gz b/wordfreq/data/twitter_ja.msgpack.gz
index 74f33d5..3136f18 100644
Binary files a/wordfreq/data/twitter_ja.msgpack.gz and b/wordfreq/data/twitter_ja.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ko.msgpack.gz b/wordfreq/data/twitter_ko.msgpack.gz
index 63735be..e88a6f8 100644
Binary files a/wordfreq/data/twitter_ko.msgpack.gz and b/wordfreq/data/twitter_ko.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ms.msgpack.gz b/wordfreq/data/twitter_ms.msgpack.gz
index 83d2b57..0497311 100644
Binary files a/wordfreq/data/twitter_ms.msgpack.gz and b/wordfreq/data/twitter_ms.msgpack.gz differ
diff --git a/wordfreq/data/twitter_nl.msgpack.gz b/wordfreq/data/twitter_nl.msgpack.gz
index b8d2281..0542cf2 100644
Binary files a/wordfreq/data/twitter_nl.msgpack.gz and b/wordfreq/data/twitter_nl.msgpack.gz differ
diff --git a/wordfreq/data/twitter_pt.msgpack.gz b/wordfreq/data/twitter_pt.msgpack.gz
index 348d5a1..5b68d15 100644
Binary files a/wordfreq/data/twitter_pt.msgpack.gz and b/wordfreq/data/twitter_pt.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ru.msgpack.gz b/wordfreq/data/twitter_ru.msgpack.gz
index 9082723..deec3aa 100644
Binary files a/wordfreq/data/twitter_ru.msgpack.gz and b/wordfreq/data/twitter_ru.msgpack.gz differ
diff --git a/wordfreq_builder/README.md b/wordfreq_builder/README.md
index 18241a1..a17c504 100644
--- a/wordfreq_builder/README.md
+++ b/wordfreq_builder/README.md
@@ -47,8 +47,7 @@ Start the build, and find something else to do for a few hours:
 
     ninja -v
 
-You can copy the results into wordfreq with this command (supposing that
-$WORDFREQ points to your wordfreq repo):
+You can copy the results into wordfreq with this command:
 
     cp data/dist/*.msgpack.gz ../wordfreq/data/
 
@@ -83,6 +82,19 @@ The specific rules are described by the comments in `rules.ninja`.
 
 ## Data sources
 
+### Wikipedia
+
+Wikipedia is a "free-access, free-content Internet encyclopedia".
+
+The dump files can be downloaded from the [Wikimedia dumps site][wikipedia].
+
+The original files are in `data/raw-input/wikipedia`, and they're processed
+by the `wiki2text` rule in `rules.ninja`. Parsing Wikipedia requires the
+[wiki2text][] package.
+ +[wikipedia]: https://dumps.wikimedia.org/backup-index.html +[wiki2text]: https://github.com/rspeer/wiki2text + ### Leeds Internet Corpus Also known as the "Web as Corpus" project, this is a University of Leeds @@ -102,7 +114,7 @@ by the `convert_leeds` rule in `rules.ninja`. The file `data/raw-input/twitter/all-2014.txt` contains about 72 million tweets collected by the `ftfy.streamtester` package in 2014. -It's not possible to distribute the text of tweets. However, this process could +We are not allowed to distribute the text of tweets. However, this process could be reproduced by running `ftfy.streamtester`, part of the [ftfy][] package, for a couple of weeks. @@ -162,4 +174,3 @@ longer represents the words 'don' and 'won', as we assume most of their frequency comes from "don't" and "won't". Words that turned into similarly common words, however, were left alone: this list doesn't represent "can't" because the word was left as "can". - diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja index d693f52..b708533 100644 --- a/wordfreq_builder/rules.ninja +++ b/wordfreq_builder/rules.ninja @@ -29,12 +29,12 @@ rule split # Wikipedia dumps obtained from dumps.wikimedia.org. The code is at # https://github.com/rspeer/wiki2text. rule wiki2text - command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out + command = bunzip2 -c $in | wiki2text > $out # To tokenize Japanese, we run it through Mecab and take the first column. # We don't have a plan for tokenizing Chinese yet. rule tokenize_japanese - command = mkdir -p $$(dirname $out) && mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out + command = mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out # Tokenizing text from Twitter requires us to language-detect and tokenize # in the same step. @@ -49,12 +49,12 @@ rule tokenize_twitter # Grep out the term "EOS", an indication that Leeds used MeCab and didn't # strip out the EOS lines. rule convert_leeds - command = mkdir -p $$(dirname $out) && sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in | grep -v 'EOS,' > $out + command = sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in | grep -v 'EOS,' > $out # To convert the OpenSubtitles frequency data, simply replace spaces with # commas. rule convert_opensubtitles - command = mkdir -p $$(dirname $out) && tr ' ' ',' < $in > $out + command = tr ' ' ',' < $in > $out # Convert and clean up the Google Books Syntactic N-grams data. Concatenate all # the input files, keep only the single words and their counts, and only keep @@ -65,16 +65,16 @@ rule convert_opensubtitles # source data was already filtered to only show words in roles with at least # two-digit counts of occurences.) 
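Side note on the `convert_leeds` rule above: the sed/grep pipeline is easier to read as Python. This is an illustrative re-implementation, not part of the build, and it assumes each Leeds input line has the form `<number> <frequency> <word>` (which is what the sed capture groups imply):

```python
# Illustrative Python equivalent of the convert_leeds rule, assuming each
# input line looks like "<number> <frequency> <word>".
import re
import sys

LEEDS_LINE_RE = re.compile(r'([0-9]+) ([0-9.]+) (.*)')

def convert_leeds(in_file, out_file):
    for line in in_file:
        match = LEEDS_LINE_RE.match(line.strip())
        if not match:
            continue            # like `sed -n ... p`, keep only matching lines
        _num, freq, word = match.groups()
        if word == 'EOS':       # like `grep -v 'EOS,'`, drop MeCab's EOS markers
            continue
        print('{},{}'.format(word, freq), file=out_file)

if __name__ == '__main__':
    convert_leeds(sys.stdin, sys.stdout)
```

The output is the same `word,frequency` CSV shape that the other `convert_*` rules produce, which is what the `count` and `merge` steps expect downstream.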
rule convert_google_syntactic_ngrams - command = mkdir -p $$(dirname $out) && zcat $in | cut -f 1,3 | grep -v '[,"]' | sed -rn 's/(.*)\s(...+)/\1,\2/p' > $out + command = zcat $in | cut -f 1,3 | grep -v '[,"]' | sed -rn 's/(.*)\s(...+)/\1,\2/p' > $out rule count - command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.count_tokens $in $out + command = python -m wordfreq_builder.cli.count_tokens $in $out rule merge - command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.combine_lists -o $out $in + command = python -m wordfreq_builder.cli.combine_lists -o $out $in rule freqs2cB - command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.freqs_to_cB $in $out + command = python -m wordfreq_builder.cli.freqs_to_cB $lang $in $out rule cat command = cat $in > $out diff --git a/wordfreq_builder/setup.py b/wordfreq_builder/setup.py index c7232cc..6f0a438 100755 --- a/wordfreq_builder/setup.py +++ b/wordfreq_builder/setup.py @@ -9,12 +9,5 @@ setup( platforms=["any"], description="Turns raw data into word frequency lists", packages=['wordfreq_builder'], - install_requires=['msgpack-python', 'pycld2'], - entry_points={ - 'console_scripts': [ - 'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main', - 'wordfreq-format-twitter = wordfreq_builder.cli.format_twitter:main', - 'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main' - ] - } + install_requires=['msgpack-python', 'pycld2'] ) diff --git a/wordfreq_builder/wordfreq_builder/tests/test_tokenizer.py b/wordfreq_builder/tests/test_tokenizer.py similarity index 100% rename from wordfreq_builder/wordfreq_builder/tests/test_tokenizer.py rename to wordfreq_builder/tests/test_tokenizer.py diff --git a/wordfreq_builder/wordfreq_builder/cli/count_tokens.py b/wordfreq_builder/wordfreq_builder/cli/count_tokens.py index 4aeba5b..56b93cb 100644 --- a/wordfreq_builder/wordfreq_builder/cli/count_tokens.py +++ b/wordfreq_builder/wordfreq_builder/cli/count_tokens.py @@ -13,4 +13,3 @@ if __name__ == '__main__': parser.add_argument('filename_out', help='name of output file') args = parser.parse_args() handle_counts(args.filename_in, args.filename_out) - diff --git a/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py b/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py index 6bf3957..9d0b1dc 100644 --- a/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py +++ b/wordfreq_builder/wordfreq_builder/cli/freqs_to_cB.py @@ -4,8 +4,8 @@ import argparse if __name__ == '__main__': parser = argparse.ArgumentParser() + parser.add_argument('language', help='language of the input file') parser.add_argument('filename_in', help='name of input file containing tokens') parser.add_argument('filename_out', help='name of output file') args = parser.parse_args() - freqs_to_cBpack(args.filename_in, args.filename_out) - + freqs_to_cBpack(args.filename_in, args.filename_out, lang=args.language) diff --git a/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py b/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py index df2cb6b..879caa4 100644 --- a/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py +++ b/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py @@ -1,18 +1,13 @@ -from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_file +from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_twitter import argparse -def tokenize_twitter(in_filename, out_prefix): - tokenize_file(in_filename, out_prefix, - tokenizer=cld2_surface_tokenizer) - - def main(): parser = 
argparse.ArgumentParser() parser.add_argument('filename', help='filename of input file containing one tweet per line') parser.add_argument('outprefix', help='prefix of output filenames') args = parser.parse_args() - tokenize_twitter(args.filename, args.outprefix) + tokenize_twitter(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer) if __name__ == '__main__': diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py index ec59716..fa937cd 100644 --- a/wordfreq_builder/wordfreq_builder/ninja.py +++ b/wordfreq_builder/wordfreq_builder/ninja.py @@ -10,10 +10,6 @@ HEADER = """# This file is automatically generated. Do not edit it. TMPDIR = data_filename('tmp') -# Set this to True to rebuild the Twitter tokenization (which takes days) -TOKENIZE_TWITTER = True - - def add_dep(lines, rule, input, output, extra=None, params=None): if isinstance(output, list): output = ' '.join(output) @@ -48,17 +44,15 @@ def make_ninja_deps(rules_filename, out=sys.stdout): # The first dependency is to make sure the build file is up to date. add_dep(lines, 'build_deps', 'rules.ninja', 'build.ninja', extra='wordfreq_builder/ninja.py') - - if TOKENIZE_TWITTER: - lines.extend( - twitter_deps( - data_filename('raw-input/twitter/all-2014.txt'), - slice_prefix=data_filename('slices/twitter/tweets-2014'), - combined_prefix=data_filename('generated/twitter/tweets-2014'), - slices=40, - languages=CONFIG['sources']['twitter'] - ) + lines.extend( + twitter_deps( + data_filename('raw-input/twitter/all-2014.txt'), + slice_prefix=data_filename('slices/twitter/tweets-2014'), + combined_prefix=data_filename('generated/twitter/tweets-2014'), + slices=40, + languages=CONFIG['sources']['twitter'] ) + ) lines.extend( wikipedia_deps( data_filename('raw-input/wikipedia'), @@ -92,17 +86,18 @@ def wikipedia_deps(dirname_in, languages): path_in = pathlib.Path(dirname_in) for language in languages: # Find the most recent file for this language - # Skip over files that do not exist - input_file = max(path_in.glob( - '{}wiki*.bz2'.format(language) - )) + input_file = max(path_in.glob('{}wiki*.bz2'.format(language))) plain_text_file = wordlist_filename('wikipedia', language, 'txt') count_file = wordlist_filename('wikipedia', language, 'counts.txt') add_dep(lines, 'wiki2text', input_file, plain_text_file) if language == 'ja': - mecab_token_file = wordlist_filename('wikipedia', language, 'mecab-tokens.txt') - add_dep(lines, 'tokenize_japanese', plain_text_file, mecab_token_file) + mecab_token_file = wordlist_filename( + 'wikipedia', language, 'mecab-tokens.txt' + ) + add_dep( + lines, 'tokenize_japanese', plain_text_file, mecab_token_file + ) add_dep(lines, 'count', mecab_token_file, count_file) else: add_dep(lines, 'count', plain_text_file, count_file) @@ -126,17 +121,18 @@ def google_books_deps(dirname_in): return lines -def twitter_deps(input_filename, slice_prefix, - combined_prefix, slices, languages): +def twitter_deps(input_filename, slice_prefix, combined_prefix, slices, + languages): + lines = [] - slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num) + slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, + num=num) for num in range(slices)] # split the input into slices - add_dep(lines, - 'split', input_filename, slice_files, + add_dep(lines, 'split', input_filename, slice_files, params={'prefix': '{}.part'.format(slice_prefix), - 'slices': slices}) + 'slices': slices}) for slicenum in range(slices): slice_file = slice_files[slicenum] @@ 
-151,7 +147,9 @@ def twitter_deps(input_filename, slice_prefix, combined_output = wordlist_filename('twitter', language, 'tokens.txt') language_inputs = [ - '{prefix}.{lang}.txt'.format(prefix=slice_files[slicenum], lang=language) + '{prefix}.{lang}.txt'.format( + prefix=slice_files[slicenum], lang=language + ) for slicenum in range(slices) ] @@ -160,11 +158,14 @@ def twitter_deps(input_filename, slice_prefix, count_file = wordlist_filename('twitter', language, 'counts.txt') if language == 'ja': - mecab_token_file = wordlist_filename('twitter', language, 'mecab-tokens.txt') - add_dep(lines, 'tokenize_japanese', combined_output, mecab_token_file) - add_dep(lines, 'count', mecab_token_file, count_file, extra='wordfreq_builder/tokenizers.py') - else: - add_dep(lines, 'count', combined_output, count_file, extra='wordfreq_builder/tokenizers.py') + mecab_token_file = wordlist_filename( + 'twitter', language, 'mecab-tokens.txt') + add_dep( + lines, 'tokenize_japanese', combined_output, mecab_token_file) + combined_output = mecab_token_file + + add_dep(lines, 'count', combined_output, count_file, + extra='wordfreq_builder/tokenizers.py') return lines @@ -187,7 +188,8 @@ def opensubtitles_deps(dirname_in, languages): input_file = '{prefix}/{lang}.txt'.format( prefix=dirname_in, lang=language ) - reformatted_file = wordlist_filename('opensubtitles', language, 'counts.txt') + reformatted_file = wordlist_filename( + 'opensubtitles', language, 'counts.txt') add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file) return lines @@ -205,18 +207,22 @@ def combine_lists(languages): add_dep(lines, 'merge', input_files, output_file, extra='wordfreq_builder/word_counts.py') - output_cBpack = wordlist_filename('combined-dist', language, 'msgpack.gz') + output_cBpack = wordlist_filename( + 'combined-dist', language, 'msgpack.gz') add_dep(lines, 'freqs2cB', output_file, output_cBpack, - extra='wordfreq_builder/word_counts.py') + extra='wordfreq_builder/word_counts.py', + params={'lang': language}) lines.append('default {}'.format(output_cBpack)) # Write standalone lists for Twitter frequency if language in CONFIG['sources']['twitter']: input_file = wordlist_filename('twitter', language, 'counts.txt') - output_cBpack = wordlist_filename('twitter-dist', language, 'msgpack.gz') + output_cBpack = wordlist_filename( + 'twitter-dist', language, 'msgpack.gz') add_dep(lines, 'freqs2cB', input_file, output_cBpack, - extra='wordfreq_builder/word_counts.py') + extra='wordfreq_builder/word_counts.py', + params={'lang': language}) lines.append('default {}'.format(output_cBpack)) diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py index 733191d..5815292 100644 --- a/wordfreq_builder/wordfreq_builder/tokenizers.py +++ b/wordfreq_builder/wordfreq_builder/tokenizers.py @@ -1,63 +1,56 @@ from html.entities import name2codepoint from wordfreq import tokenize, TOKEN_RE, NON_PUNCT_RANGE +from ftfy.fixes import unescape_html import re import pycld2 -CLD2_BAD_CHAR_RANGE = "".join([ - '[', - '\x00-\x08', - '\x0b', - '\x0e-\x1f', - '\x7f-\x9f', - '\ud800-\udfff', - '\ufdd0-\ufdef'] + - [chr(65534+65536*x+y) for x in range(17) for y in range(2)] + - [']']) +CLD2_BAD_CHAR_RANGE = "[%s]" % "".join( + [ + '\x00-\x08', + '\x0b', + '\x0e-\x1f', + '\x7f-\x9f', + '\ud800-\udfff', + '\ufdd0-\ufdef' + ] + + [chr(65534+65536*x+y) for x in range(17) for y in range(2)] +) CLD2_BAD_CHARS_RE = re.compile(CLD2_BAD_CHAR_RANGE) TWITTER_HANDLE_RE = 
re.compile('@{0}+'.format(NON_PUNCT_RANGE)) -TCO_RE = re.compile('http(?:s)?://t.co/[a-zA-Z0-9]+'.format(NON_PUNCT_RANGE)) +TCO_RE = re.compile('http(?:s)?://t.co/[a-zA-Z0-9]+') def cld2_surface_tokenizer(text): """ Uses CLD2 to detect the language and wordfreq tokenizer to create tokens """ - text = remove_handles_and_urls(text) + text = unescape_html(text) + text = TWITTER_HANDLE_RE.sub('', text) + text = TCO_RE.sub('', text) lang = cld2_detect_language(text) tokens = tokenize(text, lang) return lang, tokens + def cld2_detect_language(text): """ Uses CLD2 to detect the language """ + # Format of pycld2.detect: + # (Confident in result: bool, + # Number of bytes of text: Int, + # Triples of detected languages in order of certainty: + # (Language name: str, + # Language code: str + # Percent of text in this language: float + # Confidence score: float)) + text = CLD2_BAD_CHARS_RE.sub('', text) return pycld2.detect(text)[2][0][1] -def remove_handles_and_urls(text): - text = fix_entities(text) - text = TWITTER_HANDLE_RE.sub('', text) - text = TCO_RE.sub('', text) - return text -def last_tab(line): - """ - Read lines by keeping only the last tab-separated value. - """ - return line.split('\t')[-1].strip() - -def lowercase_text_filter(token): - """ - If this looks like a token that we want to count, return it, lowercased. - If not, filter it out by returning None. - """ - if TOKEN_RE.search(token): - return token.lower() - else: - return None - -def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab): +def tokenize_twitter(in_filename, out_prefix, tokenizer): """ Process a file by running it through the given tokenizer, sorting the results by the language of each line, and inserting newlines @@ -66,7 +59,7 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab): out_files = {} with open(in_filename, encoding='utf-8') as in_file: for line in in_file: - text = line_reader(line) + text = line.split('\t')[-1].strip() language, tokens = tokenizer(text) if language != 'un': tokenized = '\n'.join(tokens) @@ -79,37 +72,3 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab): print(tokenized, file=out_file) for out_file in out_files.values(): out_file.close() - -ENTITY_RE = re.compile(r'& ?(amp|quot|lt|gt) ?;') - -def fix_entities(text): - """ - Fix the few HTML entities that Twitter uses -- even if they've - already been tokenized. - """ - def replace_entity(match): - return chr(name2codepoint[match.group(1)]) - return ENTITY_RE.sub(replace_entity, text) - -def monolingual_tokenize_file(in_filename, out_filename, language, - tokenizer, line_reader=last_tab, - sample_proportion=1): - """ - Process a file by running it through the given tokenizer, only keeping - lines of the language we're asking for, and inserting newlines - to mark the token boundaries. - - `line_reader` is applied to each line before it given to the tokenizer - - Only the first line out of every `sample_proportion` lines are run through - then tokenizer. 
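A quick usage note on the `pycld2.detect` call documented above. The tuple structure comes from the comment in the patch; the concrete values below are illustrative only:

```python
# Example of indexing pycld2.detect()'s return value the way
# cld2_detect_language does; the numbers shown are illustrative.
import pycld2

is_reliable, text_bytes, details = pycld2.detect("This is a short English sentence.")
# `details` holds (languageName, languageCode, percent, score) tuples in
# decreasing order of certainty, e.g.:
#   (('ENGLISH', 'en', 97, 1024.0), ('Unknown', 'un', 0, 0.0), ...)
print(details[0][1])   # 'en' -- the code 'un' means the language is unknown
```

This is why `tokenize_twitter` skips lines whose detected language is `'un'`: those are lines CLD2 could not classify.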
- """ - with open(in_filename, encoding='utf-8', errors='replace') as in_file: - with open(out_filename, 'w', encoding='utf-8') as out_file: - for i, line in enumerate(in_file): - if i % sample_proportion == 0: - text = line_reader(line) - tokens, line_language = tokenizer(text) - if line_language == language: - for token in tokens: - print(token, file=out_file) diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py index 8f4099c..d7ace2d 100644 --- a/wordfreq_builder/wordfreq_builder/word_counts.py +++ b/wordfreq_builder/wordfreq_builder/word_counts.py @@ -1,4 +1,4 @@ -from wordfreq import simple_tokenize +from wordfreq import simple_tokenize, tokenize from collections import defaultdict from operator import itemgetter from ftfy import fix_text @@ -18,41 +18,49 @@ def count_tokens(filename): counts = defaultdict(int) with open(filename, encoding='utf-8', errors='replace') as infile: for line in infile: - for token in simple_tokenize(line.strip()): + for token in simple_tokenize(line): counts[token] += 1 + return counts -def read_freqs(filename, cutoff=0): +def read_freqs(filename, cutoff=0, lang=None): """ Read words and their frequencies from a CSV file. - Only words with a frequency greater than `cutoff` are returned. + Only words with a frequency greater than or equal to `cutoff` are returned. If `cutoff` is greater than 0, the csv file must be sorted by frequency in descending order. + + If lang is given, read_freqs will apply language specific preprocessing + operations. """ raw_counts = defaultdict(float) total = 0. with open(filename, encoding='utf-8', newline='') as infile: reader = csv.reader(infile) for key, strval in reader: + val = float(strval) if val < cutoff: break - for token in simple_tokenize(key): + + tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key) + for token in tokens: token = fix_text(token) total += val # Use += so that, if we give the reader concatenated files with # duplicates, it does the right thing raw_counts[token] += val - freqs = {key: raw_count / total - for (key, raw_count) in raw_counts.items()} - return freqs + for word in raw_counts: + raw_counts[word] /= total + + return raw_counts -def freqs_to_cBpack(in_filename, out_filename, cutoff=-600): +def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None): """ Convert a csv file of words and their frequencies to a file in the idiosyncratic 'cBpack' format. @@ -61,15 +69,14 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600): written to the new file. """ freq_cutoff = 10 ** (cutoff / 100.) - freqs = read_freqs(in_filename, freq_cutoff) + freqs = read_freqs(in_filename, freq_cutoff, lang=lang) cBpack = [] for token, freq in freqs.items(): cB = round(math.log10(freq) * 100) - if cB >= cutoff: - neg_cB = -cB - while neg_cB >= len(cBpack): - cBpack.append([]) - cBpack[neg_cB].append(token) + neg_cB = -cB + while neg_cB >= len(cBpack): + cBpack.append([]) + cBpack[neg_cB].append(token) for sublist in cBpack: sublist.sort() @@ -88,7 +95,7 @@ def merge_freqs(freq_dicts): """ vocab = set() for freq_dict in freq_dicts: - vocab |= set(freq_dict) + vocab.update(freq_dict) merged = defaultdict(float) N = len(freq_dicts)
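A closing note on the cBpack change in `freqs_to_cBpack` above: word frequencies are stored on a log scale in centibels (`round(100 * log10(freq))`), and each word is filed into a list indexed by the negation of that value. Below is a minimal sketch of just the bucketing step, assuming words below the frequency cutoff were already dropped when the frequencies were read (as `read_freqs` now does); the header, msgpack serialization, and gzip container are left out:

```python
# Minimal sketch of the centibel bucketing done by freqs_to_cBpack above.
# Assumes the frequency cutoff was already applied while reading the list.
import math

def bucket_by_centibels(freqs):
    """
    Convert {word: frequency} into a list of lists, where list i holds the
    words whose frequency rounds to -i centibels (cB = 100 * log10(freq)).
    """
    cBpack = []
    for token, freq in freqs.items():
        neg_cB = -round(math.log10(freq) * 100)
        while neg_cB >= len(cBpack):
            cBpack.append([])   # pad with empty buckets up to this cB value
        cBpack[neg_cB].append(token)
    for sublist in cBpack:
        sublist.sort()
    return cBpack

# A frequency of 0.01 is -200 cB, so 'the' ends up in bucket 200:
buckets = bucket_by_centibels({'the': 0.01, 'word': 0.001})
print(len(buckets) - 1, buckets[200])   # 300 ['the']
```

Decoding reverses this: a word found in bucket `i` is assigned an approximate frequency of `10 ** (-i / 100)`.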