diff --git a/setup.py b/setup.py
index 2926f3f..87a0936 100755
--- a/setup.py
+++ b/setup.py
@@ -34,7 +34,7 @@ if sys.version_info < (3, 4):
 
 setup(
     name="wordfreq",
-    version='1.2',
+    version='1.3',
     maintainer='Luminoso Technologies, Inc.',
     maintainer_email='info@luminoso.com',
     url='http://github.com/LuminosoInsight/wordfreq/',
diff --git a/wordfreq/data/combined_ar.msgpack.gz b/wordfreq/data/combined_ar.msgpack.gz
index 15d464b..1096472 100644
Binary files a/wordfreq/data/combined_ar.msgpack.gz and b/wordfreq/data/combined_ar.msgpack.gz differ
diff --git a/wordfreq/data/combined_de.msgpack.gz b/wordfreq/data/combined_de.msgpack.gz
index 6b1f8e5..94af721 100644
Binary files a/wordfreq/data/combined_de.msgpack.gz and b/wordfreq/data/combined_de.msgpack.gz differ
diff --git a/wordfreq/data/combined_el.msgpack.gz b/wordfreq/data/combined_el.msgpack.gz
index 1da9edc..856abc1 100644
Binary files a/wordfreq/data/combined_el.msgpack.gz and b/wordfreq/data/combined_el.msgpack.gz differ
diff --git a/wordfreq/data/combined_en.msgpack.gz b/wordfreq/data/combined_en.msgpack.gz
index 642dc49..bbd0cc4 100644
Binary files a/wordfreq/data/combined_en.msgpack.gz and b/wordfreq/data/combined_en.msgpack.gz differ
diff --git a/wordfreq/data/combined_es.msgpack.gz b/wordfreq/data/combined_es.msgpack.gz
index 58269d1..39f0eea 100644
Binary files a/wordfreq/data/combined_es.msgpack.gz and b/wordfreq/data/combined_es.msgpack.gz differ
diff --git a/wordfreq/data/combined_fr.msgpack.gz b/wordfreq/data/combined_fr.msgpack.gz
index 7bd9905..6faea92 100644
Binary files a/wordfreq/data/combined_fr.msgpack.gz and b/wordfreq/data/combined_fr.msgpack.gz differ
diff --git a/wordfreq/data/combined_id.msgpack.gz b/wordfreq/data/combined_id.msgpack.gz
index 9844891..9b33049 100644
Binary files a/wordfreq/data/combined_id.msgpack.gz and b/wordfreq/data/combined_id.msgpack.gz differ
diff --git a/wordfreq/data/combined_it.msgpack.gz b/wordfreq/data/combined_it.msgpack.gz
index 3734388..741f518 100644
Binary files a/wordfreq/data/combined_it.msgpack.gz and b/wordfreq/data/combined_it.msgpack.gz differ
diff --git a/wordfreq/data/combined_ja.msgpack.gz b/wordfreq/data/combined_ja.msgpack.gz
index 5477083..f1c660d 100644
Binary files a/wordfreq/data/combined_ja.msgpack.gz and b/wordfreq/data/combined_ja.msgpack.gz differ
diff --git a/wordfreq/data/combined_ko.msgpack.gz b/wordfreq/data/combined_ko.msgpack.gz
index 05f30a3..5dda29a 100644
Binary files a/wordfreq/data/combined_ko.msgpack.gz and b/wordfreq/data/combined_ko.msgpack.gz differ
diff --git a/wordfreq/data/combined_ms.msgpack.gz b/wordfreq/data/combined_ms.msgpack.gz
index 1aede1d..d7f4ad7 100644
Binary files a/wordfreq/data/combined_ms.msgpack.gz and b/wordfreq/data/combined_ms.msgpack.gz differ
diff --git a/wordfreq/data/combined_nl.msgpack.gz b/wordfreq/data/combined_nl.msgpack.gz
index f6208a3..48d681a 100644
Binary files a/wordfreq/data/combined_nl.msgpack.gz and b/wordfreq/data/combined_nl.msgpack.gz differ
diff --git a/wordfreq/data/combined_pl.msgpack.gz b/wordfreq/data/combined_pl.msgpack.gz
index 6a70139..2d45b1a 100644
Binary files a/wordfreq/data/combined_pl.msgpack.gz and b/wordfreq/data/combined_pl.msgpack.gz differ
diff --git a/wordfreq/data/combined_pt.msgpack.gz b/wordfreq/data/combined_pt.msgpack.gz
index 4009376..7371866 100644
Binary files a/wordfreq/data/combined_pt.msgpack.gz and b/wordfreq/data/combined_pt.msgpack.gz differ
diff --git a/wordfreq/data/combined_ru.msgpack.gz b/wordfreq/data/combined_ru.msgpack.gz
index 4eeb762..123eb54 100644
Binary files a/wordfreq/data/combined_ru.msgpack.gz and b/wordfreq/data/combined_ru.msgpack.gz differ
diff --git a/wordfreq/data/combined_sv.msgpack.gz b/wordfreq/data/combined_sv.msgpack.gz
index 0c883ae..0cc1398 100644
Binary files a/wordfreq/data/combined_sv.msgpack.gz and b/wordfreq/data/combined_sv.msgpack.gz differ
diff --git a/wordfreq/data/combined_tr.msgpack.gz b/wordfreq/data/combined_tr.msgpack.gz
index 53ae060..3f6063c 100644
Binary files a/wordfreq/data/combined_tr.msgpack.gz and b/wordfreq/data/combined_tr.msgpack.gz differ
diff --git a/wordfreq/data/combined_zh.msgpack.gz b/wordfreq/data/combined_zh.msgpack.gz
index 02b11c1..1205f84 100644
Binary files a/wordfreq/data/combined_zh.msgpack.gz and b/wordfreq/data/combined_zh.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ar.msgpack.gz b/wordfreq/data/twitter_ar.msgpack.gz
index 77f00de..d87307f 100644
Binary files a/wordfreq/data/twitter_ar.msgpack.gz and b/wordfreq/data/twitter_ar.msgpack.gz differ
diff --git a/wordfreq/data/twitter_de.msgpack.gz b/wordfreq/data/twitter_de.msgpack.gz
index 2c4b131..9422fb5 100644
Binary files a/wordfreq/data/twitter_de.msgpack.gz and b/wordfreq/data/twitter_de.msgpack.gz differ
diff --git a/wordfreq/data/twitter_el.msgpack.gz b/wordfreq/data/twitter_el.msgpack.gz
index 4f063da..af1a0b1 100644
Binary files a/wordfreq/data/twitter_el.msgpack.gz and b/wordfreq/data/twitter_el.msgpack.gz differ
diff --git a/wordfreq/data/twitter_en.msgpack.gz b/wordfreq/data/twitter_en.msgpack.gz
index 126d991..cb81d3d 100644
Binary files a/wordfreq/data/twitter_en.msgpack.gz and b/wordfreq/data/twitter_en.msgpack.gz differ
diff --git a/wordfreq/data/twitter_es.msgpack.gz b/wordfreq/data/twitter_es.msgpack.gz
index e0a7518..9f80432 100644
Binary files a/wordfreq/data/twitter_es.msgpack.gz and b/wordfreq/data/twitter_es.msgpack.gz differ
diff --git a/wordfreq/data/twitter_fr.msgpack.gz b/wordfreq/data/twitter_fr.msgpack.gz
index 115fe97..05de393 100644
Binary files a/wordfreq/data/twitter_fr.msgpack.gz and b/wordfreq/data/twitter_fr.msgpack.gz differ
diff --git a/wordfreq/data/twitter_id.msgpack.gz b/wordfreq/data/twitter_id.msgpack.gz
index 081ddcb..579964a 100644
Binary files a/wordfreq/data/twitter_id.msgpack.gz and b/wordfreq/data/twitter_id.msgpack.gz differ
diff --git a/wordfreq/data/twitter_it.msgpack.gz b/wordfreq/data/twitter_it.msgpack.gz
index f97e2a2..174235b 100644
Binary files a/wordfreq/data/twitter_it.msgpack.gz and b/wordfreq/data/twitter_it.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ja.msgpack.gz b/wordfreq/data/twitter_ja.msgpack.gz
index d428de3..8f739f9 100644
Binary files a/wordfreq/data/twitter_ja.msgpack.gz and b/wordfreq/data/twitter_ja.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ko.msgpack.gz b/wordfreq/data/twitter_ko.msgpack.gz
index 2e6678e..334a127 100644
Binary files a/wordfreq/data/twitter_ko.msgpack.gz and b/wordfreq/data/twitter_ko.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ms.msgpack.gz b/wordfreq/data/twitter_ms.msgpack.gz
index 9907914..346bdaa 100644
Binary files a/wordfreq/data/twitter_ms.msgpack.gz and b/wordfreq/data/twitter_ms.msgpack.gz differ
diff --git a/wordfreq/data/twitter_nl.msgpack.gz b/wordfreq/data/twitter_nl.msgpack.gz
index 0a1183f..7681324 100644
Binary files a/wordfreq/data/twitter_nl.msgpack.gz and b/wordfreq/data/twitter_nl.msgpack.gz differ
diff --git a/wordfreq/data/twitter_pl.msgpack.gz b/wordfreq/data/twitter_pl.msgpack.gz
index 5a8d62a..11b61eb 100644
Binary files a/wordfreq/data/twitter_pl.msgpack.gz and b/wordfreq/data/twitter_pl.msgpack.gz differ
diff --git a/wordfreq/data/twitter_pt.msgpack.gz b/wordfreq/data/twitter_pt.msgpack.gz
index e243cc7..0e845ab 100644
Binary files a/wordfreq/data/twitter_pt.msgpack.gz and b/wordfreq/data/twitter_pt.msgpack.gz differ
diff --git a/wordfreq/data/twitter_ru.msgpack.gz b/wordfreq/data/twitter_ru.msgpack.gz
index 01cd2de..e426344 100644
Binary files a/wordfreq/data/twitter_ru.msgpack.gz and b/wordfreq/data/twitter_ru.msgpack.gz differ
diff --git a/wordfreq/data/twitter_sv.msgpack.gz b/wordfreq/data/twitter_sv.msgpack.gz
index 333f2a7..ab1e956 100644
Binary files a/wordfreq/data/twitter_sv.msgpack.gz and b/wordfreq/data/twitter_sv.msgpack.gz differ
diff --git a/wordfreq/data/twitter_tr.msgpack.gz b/wordfreq/data/twitter_tr.msgpack.gz
index 9a955b3..28eefa6 100644
Binary files a/wordfreq/data/twitter_tr.msgpack.gz and b/wordfreq/data/twitter_tr.msgpack.gz differ
diff --git a/wordfreq_builder/lib/jq-linux64 b/wordfreq_builder/lib/jq-linux64
new file mode 100755
index 0000000..939227e
Binary files /dev/null and b/wordfreq_builder/lib/jq-linux64 differ
diff --git a/wordfreq_builder/rules.ninja b/wordfreq_builder/rules.ninja
index df00062..ac9d4a0 100644
--- a/wordfreq_builder/rules.ninja
+++ b/wordfreq_builder/rules.ninja
@@ -13,7 +13,7 @@
 # contains the programatically-defined dependency graph.
 
 # Variables
-DATA = ./data
+JQ = lib/jq-linux64
 
 # How to build the build.ninja file itself. (Use the Makefile to get it the
 # first time.)
@@ -92,10 +92,13 @@ rule merge
   command = python -m wordfreq_builder.cli.merge_freqs -o $out -c $cutoff -l $lang $in
 
 rule merge_counts
-  command = python -m wordfreq_builder.cli.merge_counts -o $out $in
+  command = python -m wordfreq_builder.cli.merge_counts -o $out -c $cutoff $in
 
 rule freqs2cB
   command = python -m wordfreq_builder.cli.freqs_to_cB $in $out
 
 rule cat
   command = cat $in > $out
+
+rule extract_reddit
+  command = bunzip2 -c $in | $JQ -r '.body' | fgrep -v '[deleted]' | sed 's/&gt;/>/g' | sed 's/&lt;/</g' | sed 's/&amp;/\&/g' | gzip -c > $out
diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
index 5e3de69..4efe1d9 100644
--- a/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_counts.py
@@ -2,10 +2,10 @@ from wordfreq_builder.word_counts import read_values, merge_counts, write_wordli
 import argparse
 
 
-def merge_lists(input_names, output_name):
+def merge_lists(input_names, output_name, cutoff=0):
     count_dicts = []
     for input_name in input_names:
-        values, total = read_values(input_name, cutoff=0)
+        values, total = read_values(input_name, cutoff=cutoff, max_size=1000000)
         count_dicts.append(values)
     merged = merge_counts(count_dicts)
     write_wordlist(merged, output_name)
@@ -13,8 +13,12 @@
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
-    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
+    parser.add_argument('-o', '--output', default='combined-counts.csv',
+                        help='filename to write the output to')
+    parser.add_argument('-c', '--cutoff', type=int, default=0,
+                        help='minimum count to read from an input file')
+    parser.add_argument('inputs', nargs='+',
+                        help='names of input files to merge')
     args = parser.parse_args()
-    merge_lists(args.inputs, args.output)
+    merge_lists(args.inputs, args.output, cutoff=args.cutoff)
 
diff --git a/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py b/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
index ddc308c..e16660b 100644
--- a/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
+++ b/wordfreq_builder/wordfreq_builder/cli/merge_freqs.py
@@ -18,10 +18,14 @@ def merge_lists(input_names, output_name, cutoff, lang):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
-    parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
-    parser.add_argument('-l', '--language', help='language code for which language the words are in', default=None)
-    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
+    parser.add_argument('-o', '--output', default='combined-freqs.csv',
+                        help='filename to write the output to')
+    parser.add_argument('-c', '--cutoff', type=int, default=2,
+                        help='stop after seeing a count below this')
+    parser.add_argument('-l', '--language', default=None,
+                        help='language code for which language the words are in')
+    parser.add_argument('inputs', nargs='+',
+                        help='names of input files to merge')
     args = parser.parse_args()
     merge_lists(args.inputs, args.output, args.cutoff, args.language)
 
diff --git a/wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py b/wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py
new file mode 100644
index 0000000..6a275b3
--- /dev/null
+++ b/wordfreq_builder/wordfreq_builder/cli/tokenize_reddit.py
@@ -0,0 +1,14 @@
+from wordfreq_builder.tokenizers import cld2_reddit_tokenizer, tokenize_by_language
+import argparse
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('filename', help='filename of input file containing one comment per line')
+    parser.add_argument('outprefix', help='prefix of output filenames')
+    args = parser.parse_args()
+    tokenize_by_language(args.filename, args.outprefix, tokenizer=cld2_reddit_tokenizer)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py b/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py
index 879caa4..d144866 100644
--- a/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py
+++ b/wordfreq_builder/wordfreq_builder/cli/tokenize_twitter.py
@@ -1,4 +1,4 @@
-from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_twitter
+from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_by_language
 import argparse
 
 
@@ -7,7 +7,7 @@ def main():
     parser.add_argument('filename', help='filename of input file containing one tweet per line')
     parser.add_argument('outprefix', help='prefix of output filenames')
     args = parser.parse_args()
-    tokenize_twitter(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
+    tokenize_by_language(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
 
 
 if __name__ == '__main__':
diff --git a/wordfreq_builder/wordfreq_builder/config.py b/wordfreq_builder/wordfreq_builder/config.py
index 7ae1798..e0006e1 100644
--- a/wordfreq_builder/wordfreq_builder/config.py
+++ b/wordfreq_builder/wordfreq_builder/config.py
@@ -40,7 +40,8 @@ CONFIG = {
         ],
         'subtlex-en': ['en'],
         'subtlex-other': ['de', 'nl', 'zh'],
-        'jieba': ['zh']
+        'jieba': ['zh'],
+        'reddit': ['en'],
     },
     # Subtlex languages that need to be pre-processed
     'wordlist_paths': {
@@ -52,6 +53,7 @@ CONFIG = {
         'subtlex-en': 'generated/subtlex/subtlex_{lang}.{ext}',
         'subtlex-other': 'generated/subtlex/subtlex_{lang}.{ext}',
         'jieba': 'generated/jieba/jieba_{lang}.{ext}',
+        'reddit': 'generated/reddit/reddit_{lang}.{ext}',
        'combined': 'generated/combined/combined_{lang}.{ext}',
         'combined-dist': 'dist/combined_{lang}.{ext}',
         'twitter-dist': 'dist/twitter_{lang}.{ext}',
diff --git a/wordfreq_builder/wordfreq_builder/ninja.py b/wordfreq_builder/wordfreq_builder/ninja.py
index 80437ff..dc2a058 100644
--- a/wordfreq_builder/wordfreq_builder/ninja.py
+++ b/wordfreq_builder/wordfreq_builder/ninja.py
@@ -77,6 +77,10 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
             data_filename('source-lists/subtlex'),
             CONFIG['sources']['subtlex-other']
         ),
+        reddit_deps(
+            data_filename('raw-input/reddit'),
+            CONFIG['sources']['reddit']
+        ),
         jieba_deps(
             data_filename('source-lists/jieba'),
             CONFIG['sources']['jieba']
@@ -232,6 +236,30 @@ def jieba_deps(dirname_in, languages):
     return lines
 
 
+def reddit_deps(dirname_in, languages):
+    lines = []
+    if not languages:
+        return lines
+    assert languages == ['en']
+
+    processed_files = []
+    path_in = pathlib.Path(dirname_in)
+    for filepath in path_in.glob('*/*.bz2'):
+        base = filepath.name[:-4]
+        transformed_file = wordlist_filename('reddit', 'en', base + '.txt.gz')
+        add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
+        count_file = wordlist_filename('reddit', 'en', base + '.counts.txt')
+        add_dep(lines, 'count', transformed_file, count_file)
+        processed_files.append(count_file)
+
+    output_file = wordlist_filename('reddit', 'en', 'counts.txt')
+    add_dep(
+        lines, 'merge_counts', processed_files, output_file,
+        params={'cutoff': 3}
+    )
+    return lines
+
+
 # Which columns of the SUBTLEX data files do the word and its frequency appear
 # in?
 SUBTLEX_COLUMN_MAP = {
@@ -264,7 +292,10 @@ def subtlex_en_deps(dirname_in, languages):
     )
 
     output_file = wordlist_filename('subtlex-en', 'en', 'counts.txt')
-    add_dep(lines, 'merge_counts', processed_files, output_file)
+    add_dep(
+        lines, 'merge_counts', processed_files, output_file,
+        params={'cutoff': 0}
+    )
     return lines
 
 
@@ -292,7 +323,8 @@ def subtlex_other_deps(dirname_in, languages):
         params={'textcol': textcol, 'freqcol': freqcol, 'startrow': 2}
     )
     add_dep(
-        lines, 'merge_counts', processed_file, output_file
+        lines, 'merge_counts', processed_file, output_file,
+        params={'cutoff': 0}
     )
     return lines
 
diff --git a/wordfreq_builder/wordfreq_builder/tokenizers.py b/wordfreq_builder/wordfreq_builder/tokenizers.py
index 7d18026..b47e94a 100644
--- a/wordfreq_builder/wordfreq_builder/tokenizers.py
+++ b/wordfreq_builder/wordfreq_builder/tokenizers.py
@@ -22,6 +22,8 @@ CLD2_BAD_CHARS_RE = regex.compile(CLD2_BAD_CHAR_RANGE)
 
 TWITTER_HANDLE_RE = regex.compile(r'@[\S--\p{punct}]+')
 TCO_RE = regex.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
+URL_RE = regex.compile(r'http(?:s)?://[^) ]*')
+MARKDOWN_URL_RESIDUE_RE = regex.compile(r'\]\(\)')
 
 
 def cld2_surface_tokenizer(text):
@@ -31,6 +33,7 @@ def cld2_surface_tokenizer(text):
     text = unescape_html(text)
     text = TWITTER_HANDLE_RE.sub('', text)
     text = TCO_RE.sub('', text)
+
     lang = cld2_detect_language(text)
 
     # Don't allow tokenization in Chinese when language-detecting, because
@@ -42,6 +45,32 @@ def cld2_surface_tokenizer(text):
     return lang, tokens
 
 
+# Low-frequency languages tend to be detected incorrectly. Keep a limited
+# list of languages we're allowed to use here.
+KEEP_THESE_LANGUAGES = {
+    'ar', 'de', 'el', 'en', 'es', 'fr', 'hr', 'id', 'it', 'ja', 'ko', 'ms',
+    'nl', 'pl', 'pt', 'ro', 'ru', 'sv'
+}
+
+
+def cld2_reddit_tokenizer(text):
+    """
+    A language-detecting tokenizer with special cases for handling text from
+    Reddit.
+    """
+    text = URL_RE.sub('', text)
+    text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
+
+    lang = cld2_detect_language(text)
+    if lang not in KEEP_THESE_LANGUAGES:
+        # Reddit is 99.9% English, so if we detected a rare language, it's
+        # much more likely that it's actually English.
+        lang = 'en'
+
+    tokens = tokenize(text, lang, include_punctuation=True)
+    return lang, tokens
+
+
 def cld2_detect_language(text):
     """
     Uses CLD2 to detect the language.
@@ -59,13 +88,11 @@ def cld2_detect_language(text):
     return pycld2.detect(text)[2][0][1]
 
 
-def tokenize_twitter(in_filename, out_prefix, tokenizer):
+def tokenize_by_language(in_filename, out_prefix, tokenizer):
     """
-    Process a file by running it through the Twitter-specific tokenizer,
-    which uses cld2 for language detection, and removes Twitter handles
-    and t.co URLs.
+    Process a file by running it through a given tokenizer.
 
-    Produces output files that are separated by language, with newlines
+    Produces output files that are separated by language, with spaces
     between the tokens.
     """
     out_files = {}
@@ -74,7 +101,7 @@
         text = line.split('\t')[-1].strip()
         language, tokens = tokenizer(text)
         if language != 'un':
-            tokenized = '\n'.join(tokens)
+            tokenized = ' '.join(tokens)
             out_filename = '%s.%s.txt' % (out_prefix, language)
             if out_filename in out_files:
                 out_file = out_files[out_filename]
diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index a3bf0ae..47de7e5 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -24,16 +24,19 @@ def count_tokens(filename):
     containing '�'.
     """
     counts = defaultdict(int)
-    with open(filename, encoding='utf-8', errors='replace') as infile:
-        for line in infile:
-            line = URL_RE.sub('', line.strip())
-            for token in simple_tokenize(line):
-                counts[token] += 1
-
+    if filename.endswith('gz'):
+        infile = gzip.open(filename, 'rt', encoding='utf-8', errors='replace')
+    else:
+        infile = open(filename, encoding='utf-8', errors='replace')
+    for line in infile:
+        line = URL_RE.sub('', line.strip())
+        for token in simple_tokenize(line):
+            counts[token] += 1
+    infile.close()
     return counts
 
 
-def read_values(filename, cutoff=0, lang=None):
+def read_values(filename, cutoff=0, max_size=1e8, lang=None):
     """
     Read words and their frequency or count values from a CSV file.
     Returns a dictionary of values and the total of all values.
@@ -52,7 +55,7 @@ def read_values(filename, cutoff=0, lang=None):
         for key, strval in csv.reader(infile):
             val = float(strval)
             key = fix_text(key)
-            if val < cutoff:
+            if val < cutoff or len(values) >= max_size:
                 break
             tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
             for token in tokens:
@@ -76,7 +79,7 @@ def read_freqs(filename, cutoff=0, lang=None):
     If lang is given, read_freqs will apply language specific
     preprocessing operations.
     """
-    values, total = read_values(filename, cutoff, lang)
+    values, total = read_values(filename, cutoff, lang=lang)
     for word in values:
         values[word] /= total
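
In ninja.py, each Reddit shard flows through extract_reddit, then the count rule, then merge_counts with a cutoff of 3; tokenize_reddit.py separately exposes the Reddit-specific tokenizer for splitting comments by language. A minimal sketch of how the Python functions added or renamed above compose, assuming only what this diff defines; the input and output filenames are hypothetical:

    # Hypothetical walk-through of the new Reddit path (not part of the patch).
    from wordfreq_builder.tokenizers import cld2_reddit_tokenizer, tokenize_by_language
    from wordfreq_builder.word_counts import (
        count_tokens, read_values, merge_counts, write_wordlist
    )

    # Split a file of extracted comments (one per line) into per-language
    # token files named comments.<lang>.txt; rarely detected languages fall
    # back to English inside cld2_reddit_tokenizer.
    tokenize_by_language('comments.txt', 'comments', tokenizer=cld2_reddit_tokenizer)

    # Count tokens in the English output; count_tokens now also accepts
    # gzipped input, matching the .txt.gz files the build produces.
    counts = count_tokens('comments.en.txt')
    write_wordlist(counts, 'comments.en.counts.txt')

    # Merge count files, dropping counts below a cutoff and capping how many
    # entries are read per file, as merge_counts -c 3 now does for Reddit.
    values, total = read_values('comments.en.counts.txt', cutoff=3, max_size=1000000)
    merged = merge_counts([values])
    write_wordlist(merged, 'reddit_en.counts.txt')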