Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-25 10:15:23 +00:00

Merge branch 'ninja-build'

Conflicts:
	wordfreq_builder/cmd_count_twitter.py
	wordfreq_builder/cmd_count_wikipedia.py

commit b541fe68e1
wordfreq_builder/.gitignore (vendored) | 4
@@ -6,3 +6,7 @@ dist
 *.egg-info
 build
 _build
+build.ninja
+data
+.ninja_deps
+.ninja_log

wordfreq_builder/Makefile (new file) | 12
@@ -0,0 +1,12 @@
PYTHON = python

all: build.ninja

# make sure this package is in 'develop' mode and up to date
wordfreq_builder.egg-info/PKG-INFO: setup.py
	$(PYTHON) setup.py develop

# build the Ninja file that will take over the build process
build.ninja: rules.ninja wordfreq_builder/ninja.py wordfreq_builder/config.py wordfreq_builder.egg-info/PKG-INFO
	$(PYTHON) -m wordfreq_builder.cli.build_deps rules.ninja > build.ninja

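The Makefile above only bootstraps the build: it regenerates build.ninja and then leaves the real work to Ninja. A minimal sketch of those same two steps as a Python helper, assuming ninja is installed and on the PATH (this helper is not part of the commit):

# Hypothetical helper mirroring the Makefile: regenerate build.ninja from
# rules.ninja, then hand the build over to Ninja.
import subprocess
import sys

def rebuild():
    # Equivalent of the 'build.ninja' target above
    with open('build.ninja', 'w') as out:
        subprocess.run(
            [sys.executable, '-m', 'wordfreq_builder.cli.build_deps', 'rules.ninja'],
            stdout=out, check=True
        )
    # Run the generated dependency graph
    subprocess.run(['ninja'], check=True)

if __name__ == '__main__':
    rebuild()
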
wordfreq_builder/rules.ninja (new file) | 63
@@ -0,0 +1,63 @@
# This defines the rules on how to build parts of the wordfreq lists, using the
# Ninja build system:
#
# http://martine.github.io/ninja/manual.html
#
# Ninja is available in the 'ninja-build' Ubuntu package. It's like make with
# better parallelism and the ability for build steps to produce multiple
# outputs. The tradeoff is that its rule syntax isn't full of magic for
# expanding wildcards and finding dependencies, so in general you have to
# write the dependencies using a script.
#
# This file will become the header of the larger build.ninja file, which also
# contains the programmatically-defined dependency graph.

# Variables
DATA = ./data

# Splits the single file $in into $slices parts, whose names will be
# $prefix plus a two-digit numeric suffix.
rule split
  command = mkdir -p $$(dirname $prefix) && split -d -n r/$slices $in $prefix

# wiki2text is a tool I wrote using Nim 0.11, which extracts plain text from
# Wikipedia dumps obtained from dumps.wikimedia.org. The code is at
# https://github.com/rspeer/wiki2text.
rule wiki2text
  command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out

rule wiki2tokens
  command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text -t > $out

rule tokenize_japanese
  command = mkdir -p $$(dirname $out) && mecab < $in | cut -f 1 | grep -v "EOS" > $out

rule tokenize_twitter
  command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.pretokenize_twitter $in $prefix

rule format_twitter
  command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.format_twitter $in $out

# To convert the Leeds corpus, look for space-separated lines that start with
# an integer and a decimal. The integer is the rank, which we discard. The
# decimal is the frequency, and the remaining text is the term. Use sed -n
# with /p to output only lines where the match was successful.
rule convert_leeds
  command = mkdir -p $$(dirname $out) && sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in > $out

# To convert the OpenSubtitles frequency data, simply replace spaces with
# commas.
rule convert_opensubtitles
  command = mkdir -p $$(dirname $out) && tr ' ' ',' < $in > $out

rule count
  command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.count_tokens $in $out

rule merge
  command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.combine_lists -o $out $in

rule freqs2dB
  command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.freqs_to_dB $in $out

rule cat
  command = cat $in > $out

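For illustration, a rough Python equivalent of the convert_leeds rule above, with a made-up input line; the rank is discarded and the term and frequency are swapped into CSV order:

# Rough Python equivalent of the convert_leeds sed rule, for illustration only.
# Input lines look like "<rank> <frequency> <term>"; output is "<term>,<frequency>".
import re

LEEDS_LINE_RE = re.compile(r'([0-9]+) ([0-9.]+) (.*)')

def convert_leeds_line(line):
    match = LEEDS_LINE_RE.search(line)
    if match is None:
        return None  # sed -n .../p would simply not print a non-matching line
    rank, freq, term = match.groups()
    return '{},{}'.format(term, freq)

assert convert_leeds_line('42 3581.5 the') == 'the,3581.5'
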
@@ -9,4 +9,13 @@ setup(
     platforms=["any"],
     description="Turns raw data into word frequency lists",
     packages=['wordfreq_builder'],
+    install_requires=['msgpack-python'],
+    entry_points={
+        'console_scripts': [
+            'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main',
+            'wordfreq-format-twitter = wordfreq_builder.cli.format_twitter:main',
+            'wordfreq-tokenize-wikipedia = wordfreq_builder.cli.tokenize_wikipedia:main',
+            'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main'
+        ]
+    }
 )

wordfreq_builder/wordfreq_builder/cli/__init__.py (new file) | 0

wordfreq_builder/wordfreq_builder/cli/build_deps.py (new file) | 15
@@ -0,0 +1,15 @@
from wordfreq_builder.ninja import make_ninja_deps
import argparse


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('in_filename', help='filename of rules file')
    args = parser.parse_args()

    # Make the complete ninja file and write it to standard out
    make_ninja_deps(args.in_filename)


if __name__ == '__main__':
    main()

wordfreq_builder/wordfreq_builder/cli/combine_lists.py (new file) | 19
@@ -0,0 +1,19 @@
from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist
import argparse


def merge_lists(input_names, output_name):
    freq_dicts = []
    for input_name in input_names:
        freq_dicts.append(read_freqs(input_name, cutoff=2))
    merged = merge_freqs(freq_dicts)
    write_wordlist(merged, output_name)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
    args = parser.parse_args()
    merge_lists(args.inputs, args.output)

wordfreq_builder/wordfreq_builder/cli/count_tokens.py (new file) | 16
@@ -0,0 +1,16 @@
from wordfreq_builder.word_counts import count_tokens, write_wordlist
import argparse


def handle_counts(filename_in, filename_out):
    counts = count_tokens(filename_in)
    write_wordlist(counts, filename_out)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('filename_in', help='name of input file containing tokens')
    parser.add_argument('filename_out', help='name of output file')
    args = parser.parse_args()
    handle_counts(args.filename_in, args.filename_out)

wordfreq_builder/wordfreq_builder/cli/format_twitter.py (new file) | 14
@@ -0,0 +1,14 @@
from wordfreq_builder.tokenizers import retokenize_file
import argparse


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('in_filename', help='filename of input file containing one tweet per line')
    parser.add_argument('out_filename', help='filename of output file')
    args = parser.parse_args()
    retokenize_file(args.in_filename, args.out_filename)


if __name__ == '__main__':
    main()

wordfreq_builder/wordfreq_builder/cli/freqs_to_dB.py (new file) | 11
@@ -0,0 +1,11 @@
from wordfreq_builder.word_counts import freqs_to_dBpack
import argparse


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('filename_in', help='name of input file containing tokens')
    parser.add_argument('filename_out', help='name of output file')
    args = parser.parse_args()
    freqs_to_dBpack(args.filename_in, args.filename_out)

wordfreq_builder/wordfreq_builder/cli/pretokenize_twitter.py (new file) | 19
@@ -0,0 +1,19 @@
from wordfreq_builder.tokenizers import rosette_surface_tokenizer, pretokenize_file
import argparse


def pretokenize_twitter(in_filename, out_prefix):
    pretokenize_file(in_filename, out_prefix,
                     tokenizer=rosette_surface_tokenizer)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help='filename of input file containing one tweet per line')
    parser.add_argument('outprefix', help='prefix of output filenames')
    args = parser.parse_args()
    pretokenize_twitter(args.filename, args.outprefix)


if __name__ == '__main__':
    main()

wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py (new file) | 30
@@ -0,0 +1,30 @@
from wordfreq_builder.tokenizers import rosette_surface_tokenizer, monolingual_tokenize_file
import argparse


def tokenize_wikipedia(in_filename, out_filename, language, proportion):
    monolingual_tokenize_file(
        in_filename, out_filename,
        language=language,
        tokenizer=rosette_surface_tokenizer,
        line_reader=strip_headings,
        sample_proportion=proportion
    )


def strip_headings(text):
    return text.strip().strip('=')


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('in_filename', help='filename of input file')
    parser.add_argument('out_filename', help='filename of output file')
    parser.add_argument('language', help='the language code of the text')
    parser.add_argument('-p', '--proportion', help='process 1/n of the lines (default 100)', type=int, default=100)
    args = parser.parse_args()
    tokenize_wikipedia(args.in_filename, args.out_filename, args.language, args.proportion)


if __name__ == '__main__':
    main()

(deleted file) | 21
@@ -1,21 +0,0 @@
-from wordfreq_builder.word_counts import read_counts, write_counts, merge_counts
-from pathlib import Path
-import argparse
-
-
-def merge_lists(input_names, output_name, balance=False):
-    count_dicts = []
-    for input_name in input_names:
-        count_dicts.append(read_counts(Path(input_name)))
-    merged = merge_counts(count_dicts, balance=balance)
-    write_counts(merged, Path(output_name))
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
-    parser.add_argument('-b', '--balance', action='store_true', help='Automatically balance unequally-sampled word frequencies')
-    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
-    args = parser.parse_args()
-    merge_lists(args.inputs, args.output, balance=args.balance)

(deleted file) | 27
@@ -1,27 +0,0 @@
-from wordfreq_builder.word_counts import WordCountBuilder
-from wordfreq_builder.tokenizers import rosette_tokenizer, rosette_surface_tokenizer
-from pathlib import Path
-import argparse
-
-
-def count_twitter(pathname, offset=0, nsplit=1, surface=True):
-    path = Path(pathname)
-    if surface == True:
-        tokenizer = rosette_surface_tokenizer
-    else:
-        tokenizer = rosette_tokenizer
-    builder = WordCountBuilder(tokenizer=tokenizer)
-    save_filename = 'twitter-counts-%d.csv' % offset
-    save_pathname = path.parent / save_filename
-    builder.count_twitter(path, offset, nsplit)
-    builder.save_wordlist(save_pathname)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('filename', help='filename of input file containing one tweet per line')
-    parser.add_argument('offset', type=int)
-    parser.add_argument('nsplit', type=int)
-    args = parser.parse_args()
-    count_twitter(args.filename, args.offset, args.nsplit, surface=True)

(deleted file) | 23
@@ -1,23 +0,0 @@
-from wordfreq_builder.word_counts import WordCountBuilder
-from wordfreq_builder.tokenizers import rosette_tokenizer, rosette_surface_tokenizer
-from pathlib import Path
-import argparse
-
-
-def count_wikipedia(filename, surface=True):
-    path = Path(filename)
-    if surface == True:
-        tokenizer = rosette_surface_tokenizer
-    else:
-        tokenizer = rosette_tokenizer
-    builder = WordCountBuilder(tokenizer=tokenizer, unique_docs=False)
-    builder.count_wikipedia(path)
-    builder.save_wordlist(path.parent / 'counts.csv')
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('filename', help='flat text file containing extracted Wikipedia text')
-    args = parser.parse_args()
-    count_wikipedia(args.filename, surface=True)

wordfreq_builder/wordfreq_builder/config.py (new file) | 69
@@ -0,0 +1,69 @@
import os

CONFIG = {
    'version': '0.9.0',
    # data_dir is a relative or absolute path to where the wordlist data
    # is stored
    'data_dir': 'data',
    'sources': {
        # A list of language codes (possibly un-standardized) that we'll
        # look up in filenames for these various data sources.
        'twitter': [
            'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
            'pt', 'ru',
            # can be added later: 'th', 'tr'
        ],
        'wikipedia': [
            'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
            'pt', 'ru'
        ],
        'opensubtitles': [
            # All languages where the most common word in OpenSubtitles
            # appears at least 5000 times
            'ar', 'bg', 'bs', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et',
            'fa', 'fi', 'fr', 'he', 'hr', 'hu', 'id', 'is', 'it', 'lt', 'lv',
            'mk', 'ms', 'nb', 'nl', 'pl', 'pt', 'ro', 'sk', 'sl', 'sq', 'sr',
            'sv', 'tr', 'uk', 'zh'
        ],
        'leeds': [
            'ar', 'de', 'el', 'en', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh'
        ]
    },
    'wordlist_paths': {
        'twitter': 'generated/twitter/tweets-2014.{lang}.{ext}',
        'wikipedia': 'generated/wikipedia/wikipedia_{lang}.{ext}',
        'opensubtitles': 'generated/opensubtitles/opensubtitles_{lang}.{ext}',
        'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}',
        'combined': 'generated/combined/combined_{lang}.{ext}'
    },
    'min_sources': 2
}


def data_filename(filename):
    return os.path.join(CONFIG['data_dir'], filename)


def wordlist_filename(source, language, extension='txt'):
    path = CONFIG['wordlist_paths'][source].format(
        lang=language, ext=extension
    )
    return data_filename(path)


def source_names(language):
    """
    Get the names of data sources that supply data for the given language.
    """
    return sorted([key for key in CONFIG['sources']
                   if language in CONFIG['sources'][key]])


def all_languages():
    languages = set()
    for langlist in CONFIG['sources'].values():
        languages |= set(langlist)
    return [lang for lang in sorted(languages)
            if len(source_names(lang)) >= CONFIG['min_sources']]

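As a quick illustration of how these helpers combine (the values follow directly from the CONFIG dictionary above):

# Illustrative use of the config helpers defined above.
from wordfreq_builder.config import wordlist_filename, source_names, all_languages

# Leeds wordlists live under data/generated/leeds/ with the language filled in:
print(wordlist_filename('leeds', 'en', 'counts.txt'))
# -> data/generated/leeds/leeds_internet_en.counts.txt

# Japanese appears in the leeds, twitter and wikipedia source lists above:
print(source_names('ja'))
# -> ['leeds', 'twitter', 'wikipedia']

# Only languages with at least CONFIG['min_sources'] (2) sources get built:
print(all_languages())
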
wordfreq_builder/wordfreq_builder/ninja.py (new file) | 199
@@ -0,0 +1,199 @@
from wordfreq_builder.config import (
    CONFIG, data_filename, wordlist_filename, all_languages, source_names
)
import sys
import pathlib

HEADER = """# This file is automatically generated. Do not edit it.
# You can regenerate it using the 'wordfreq-build-deps' command.
"""
TMPDIR = data_filename('tmp')


# Set this to True to rebuild the Twitter tokenization (which takes days)
PRETOKENIZE_TWITTER = False


def add_dep(lines, rule, input, output, extra=None, params=None):
    if isinstance(output, list):
        output = ' '.join(output)
    if isinstance(input, list):
        input = ' '.join(input)
    if extra:
        if isinstance(extra, list):
            extra = ' '.join(extra)
        extrastr = ' | ' + extra
    else:
        extrastr = ''
    build_rule = "build {output}: {rule} {input}{extra}".format(
        output=output, rule=rule, input=input, extra=extrastr
    )
    lines.append(build_rule)
    if params:
        for key, val in params.items():
            lines.append(" {key} = {val}".format(key=key, val=val))
    lines.append("")


def make_ninja_deps(rules_filename, out=sys.stdout):
    """
    Output a complete Ninja file describing how to build the wordfreq data.
    """
    print(HEADER, file=out)
    # Copy in the rules section
    with open(rules_filename, encoding='utf-8') as rulesfile:
        print(rulesfile.read(), file=out)

    lines = []
    if PRETOKENIZE_TWITTER:
        lines.extend(
            twitter_preprocess_deps(
                data_filename('raw-input/twitter/all-2014.txt'),
                slice_prefix=data_filename('slices/twitter/tweets-2014'),
                combined_prefix=data_filename('intermediate/twitter/tweets-2014'),
                slices=40,
                languages=CONFIG['sources']['twitter']
            )
        )
    lines.extend(
        twitter_deps(
            data_filename('intermediate/twitter/tweets-2014'),
            languages=CONFIG['sources']['twitter']
        )
    )
    lines.extend(
        wikipedia_deps(
            data_filename('raw-input/wikipedia'),
            CONFIG['sources']['wikipedia']
        )
    )
    lines.extend(
        leeds_deps(
            data_filename('source-lists/leeds'),
            CONFIG['sources']['leeds']
        )
    )
    lines.extend(
        opensubtitles_deps(
            data_filename('source-lists/opensubtitles'),
            CONFIG['sources']['opensubtitles']
        )
    )
    lines.extend(combine_lists(all_languages()))

    print('\n'.join(lines), file=out)


def wikipedia_deps(dirname_in, languages):
    lines = []
    path_in = pathlib.Path(dirname_in)
    for language in languages:
        # Find the most recent file for this language
        input_file = max(path_in.glob(
            '{}wiki*.bz2'.format(language)
        ))
        raw_file = wordlist_filename('wikipedia', language, 'txt')
        token_file = wordlist_filename('wikipedia', language, 'tokens.txt')
        count_file = wordlist_filename('wikipedia', language, 'counts.txt')

        add_dep(lines, 'wiki2text', input_file, raw_file)
        add_dep(lines, 'wiki2tokens', input_file, token_file)
        add_dep(lines, 'count', token_file, count_file)
    return lines


def twitter_preprocess_deps(input_filename, slice_prefix,
                            combined_prefix, slices, languages):
    lines = []

    slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num)
                   for num in range(slices)]
    # split the input into slices
    add_dep(lines,
            'split', input_filename, slice_files,
            params={'prefix': '{}.part'.format(slice_prefix),
                    'slices': slices})

    for slicenum in range(slices):
        slice_file = slice_files[slicenum]
        language_outputs = [
            '{prefix}.{lang}.txt'.format(prefix=slice_file, lang=language)
            for language in languages
        ]
        add_dep(lines, 'tokenize_twitter', slice_file, language_outputs,
                params={'prefix': slice_file})

    for language in languages:
        combined_output = '{prefix}.{lang}.txt'.format(prefix=combined_prefix, lang=language)

        language_inputs = [
            '{prefix}.{lang}.txt'.format(prefix=slice_files[slicenum], lang=language)
            for slicenum in range(slices)
        ]
        add_dep(lines, 'cat', language_inputs, combined_output)
    return lines


def twitter_deps(prefix_in, languages):
    lines = []
    for language in languages:
        input_file = '{prefix}.{lang}.txt'.format(prefix=prefix_in, lang=language)
        token_file = wordlist_filename('twitter', language, 'tokens.txt')
        add_dep(lines,
                'format_twitter', input_file, token_file,
                extra='wordfreq_builder/tokenizers.py')

        count_file = wordlist_filename('twitter', language, 'counts.txt')
        add_dep(lines, 'count', token_file, count_file)

    return lines


def leeds_deps(dirname_in, languages):
    lines = []
    for language in languages:
        input_file = '{prefix}/internet-{lang}-forms.num'.format(
            prefix=dirname_in, lang=language
        )
        reformatted_file = wordlist_filename('leeds', language, 'counts.txt')
        add_dep(lines, 'convert_leeds', input_file, reformatted_file)

    return lines


def opensubtitles_deps(dirname_in, languages):
    lines = []
    for language in languages:
        input_file = '{prefix}/{lang}.txt'.format(
            prefix=dirname_in, lang=language
        )
        reformatted_file = wordlist_filename('opensubtitles', language, 'counts.txt')
        add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file)

    return lines


def combine_lists(languages):
    lines = []
    for language in languages:
        sources = source_names(language)
        input_files = [
            wordlist_filename(source, language, 'counts.txt')
            for source in sources
        ]
        output_file = wordlist_filename('combined', language)
        add_dep(lines, 'merge', input_files, output_file,
                extra='wordfreq_builder/word_counts.py')

        output_dBpack = wordlist_filename('combined', language, 'msgpack.gz')
        add_dep(lines, 'freqs2dB', output_file, output_dBpack,
                extra='wordfreq_builder/word_counts.py')
    return lines


def main():
    make_ninja_deps('rules.ninja')


if __name__ == '__main__':
    main()

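To make the generated output concrete, here is roughly what add_dep contributes to build.ninja, using made-up filenames:

# Toy demonstration of add_dep with made-up filenames: it appends one Ninja
# 'build' statement (plus optional per-build variables) to a list of lines.
from wordfreq_builder.ninja import add_dep

lines = []
add_dep(lines, 'count', 'data/example.tokens.txt', 'data/example.counts.txt')
add_dep(lines, 'split', 'data/all-2014.txt',
        ['data/tweets.part00', 'data/tweets.part01'],
        params={'prefix': 'data/tweets.part', 'slices': 2})

print('\n'.join(lines))
# build data/example.counts.txt: count data/example.tokens.txt
#
# build data/tweets.part00 data/tweets.part01: split data/all-2014.txt
#  prefix = data/tweets.part
#  slices = 2
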
@@ -1,33 +1,153 @@
 from lumi_science.text_readers.rosette_readers import RosetteReader
+from html.entities import name2codepoint
 import re
 
 
 ROSETTE = RosetteReader()
 
 
-def rosette_tokenizer(text):
-    analysis, lang = ROSETTE.rosette.analyze(text)
-    # I'm aware this doesn't do the right things with multi-word stems.
-    # Wordfreq doesn't either. And wordfreq isn't designed to look up
-    # multiple words anyway.
-    tokens = []
-    for (stem, pos, span) in analysis:
-        for subtoken in stem.split(' '):
-            tokens.append(subtoken + '|' + lang)
-    return tokens
+# Some of Rosette's language codes are incorrect. For example, 'zh_sc' should
+# mean "Chinese as used in Seychelles", which is kind of nonsense. What Rosette
+# really means is "Simplified Chinese", whose code is 'zh-Hans'.
+ROSETTE_LANG_MAP = {
+    'zh_sc': 'zh-Hans',
+    'zh_tc': 'zh-Hant',
+    'en_uc': 'en',
+}
+
+
+NON_PUNCT_RE = re.compile('[0-9A-Za-z\xc0-\u1fff\u2070-\u2fff\u301f-\ufeff0-9A-Za-z\uff66-\U0002ffff]')
+
+EMOTICON_RANGE = '\u2600-\u26ff\U0001F000-\U0001F7FF'
+RETOKENIZE_RE = re.compile('[{0}#@/]|[^{0}#@/ ]+'.format(EMOTICON_RANGE))
+
+
+def last_tab(line):
+    """
+    Read lines by keeping only the last tab-separated value.
+    """
+    return line.split('\t')[-1].strip()
+
+
+def lowercase_text_filter(token):
+    if NON_PUNCT_RE.search(token):
+        return token.lower()
+    else:
+        return None
+
+
+def is_url(token):
+    return token.startswith('http:') or token.startswith('https:')
+
+
+def pretokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
+    """
+    Process a file by running it through the given tokenizer, sorting the
+    results by the language of each line, and inserting spaces into lines
+    to mark the token boundaries. This computes the 'hard part' of
+    tokenization and allows the results to be saved, so that we can change
+    the finer details of the output without re-running everything.
+    """
+    out_files = {}
+    for line in open(in_filename, encoding='utf-8'):
+        text = line_reader(line)
+        tokens, language = tokenizer(text)
+        tokenized = ' '.join(tokens)
+        if language is not None:
+            out_filename = '%s.%s.txt' % (out_prefix, language)
+            if out_filename in out_files:
+                out_file = out_files[out_filename]
+            else:
+                out_file = open(out_filename, 'w', encoding='utf-8')
+                out_files[out_filename] = out_file
+            print(tokenized, file=out_file)
+    for out_file in out_files.values():
+        out_file.close()
+
+
+ENTITY_RE = re.compile(r'& ?(amp|quot|lt|gt) ?;')
+
+
+def fix_entities(text):
+    """
+    Fix the few HTML entities that Twitter uses -- even if they've
+    already been tokenized.
+    """
+    def replace_entity(match):
+        return chr(name2codepoint[match.group(1)])
+    return ENTITY_RE.sub(replace_entity, text)
+
+
+def retokenize(text):
+    text = fix_entities(text)
+    tokens = RETOKENIZE_RE.findall(text)
+    skip_next = False
+    for token in tokens:
+        if token == '/' or token == '@':
+            # Avoid idiosyncratic tokens such as URLs and
+            # usernames
+            skip_next = True
+        elif skip_next:
+            skip_next = False
+        else:
+            if not is_url(token):
+                filtered = lowercase_text_filter(token)
+                if filtered:
+                    yield filtered
+
+
+def retokenize_file(in_filename, out_filename):
+    """
+    Process a file that has been tokenized (by inserting spaces) in a
+    language-specific way by Rosette.
+    """
+    with open(in_filename, encoding='utf-8') as in_file:
+        with open(out_filename, 'w', encoding='utf-8') as out_file:
+            for line in in_file:
+                skip_next = False
+                for token in retokenize(line.strip()):
+                    if skip_next:
+                        skip_next = False
+                    elif token == '/' or token == '@':
+                        # Avoid idiosyncratic tokens such as URLs and
+                        # usernames
+                        skip_next = True
+                    elif lowercase_text_filter(token):
+                        print(token, file=out_file)
+
+
+def monolingual_tokenize_file(in_filename, out_filename, language,
+                              tokenizer, line_reader=last_tab,
+                              token_filter=lowercase_text_filter,
+                              sample_proportion=100):
+    with open(in_filename, encoding='utf-8', errors='replace') as in_file:
+        with open(out_filename, 'w', encoding='utf-8') as out_file:
+            for i, line in enumerate(in_file):
+                if i % sample_proportion == 0:
+                    text = line_reader(line)
+                    tokens, line_language = tokenizer(text)
+                    if line_language == language:
+                        filtered = [token_filter(t) for t in tokens]
+                        filtered = [t for t in filtered if t is not None]
+                        for token in filtered:
+                            print(token, file=out_file)
 
 
 def rosette_surface_tokenizer(text):
-    analysis, lang = ROSETTE.rosette.analyze(text)
+    try:
+        analysis, lang = ROSETTE.rosette.analyze(text)
+    except (RuntimeError, UnicodeError):
+        # Our Rosette interface throws errors given arbitrary data. :(
+        return text, None
+    language = ROSETTE_LANG_MAP.get(lang, lang)
     tokens = []
     for (stem, pos, span) in analysis:
         surface_text = text[span[0]:span[1]]
-        for subtoken in surface_text.split(' '):
-            tokens.append(subtoken + '|' + lang)
-    return tokens
+        tokens.append(surface_text)
+    return tokens, language
 
 
-def treebank_surface_tokenizer(text):
+def treebank_surface_tokenizer(text, language='en'):
     """
     This is a simplified version of the Treebank tokenizer in NLTK.
 
@@ -45,6 +165,10 @@ def treebank_surface_tokenizer(text):
     as a result -- for example, it splits "wanna" into "wan" and "na", which
     are supposed to be considered unusual surface forms of "want" and "to".
     We just leave it as the word "wanna".
+
+    The language will just be returned, as this function isn't doing any
+    language detection. It defaults to 'en', as English is the language that
+    Treebank tokenization is designed for.
     """
     #starting quotes
     text = re.sub(r'^\"', r'``', text)
@@ -80,4 +204,4 @@ def treebank_surface_tokenizer(text):
     text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ",
                   text)
 
-    return text.split()
+    return text.split(), language

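A small example of what the new retokenize() yields for an invented tweet-like line: the HTML entity is folded back to '&', punctuation-only tokens (including a bare '#' or '@') are filtered out, and the token following '@' or '/' is skipped so usernames and URL parts stay out of the counts. Note that importing the module still needs the internal lumi_science Rosette reader to be installed, since ROSETTE is created at import time.

# Illustration only; the input string is made up.
from wordfreq_builder.tokenizers import retokenize

print(list(retokenize('Fish &amp; chips with @somebody #dinner')))
# -> ['fish', 'chips', 'with', 'dinner']
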
@@ -1,116 +1,85 @@
-from wordfreq_builder.tokenizers import treebank_surface_tokenizer
+from wordfreq_builder.tokenizers import retokenize
 from collections import defaultdict
 from operator import itemgetter
-from pathlib import Path
-from unicodedata import normalize
+from ftfy import fix_text
+import math
 import csv
-import sys
+import msgpack
+import gzip
 
 
-def read_counts(path):
+def count_tokens(filename):
     counts = defaultdict(int)
-    with path.open(encoding='utf-8', newline='') as infile:
-        reader = csv.reader(infile)
-        for key, strval in reader:
-            val = float(strval)
-            # Use += so that, if we give the reader concatenated files with
-            # duplicates, it does the right thing
-            counts[key] += val
+    with open(filename, encoding='utf-8') as infile:
+        for line in infile:
+            for token in retokenize(line.strip()):
+                counts[token] += 1
     return counts
 
 
-def count_languages(counts):
-    langcounts = defaultdict(int)
-    for key, strval in counts.items():
-        val = int(strval)
-        text, lang = key.rsplit('|', 1)
-        langcounts[lang] += val
-    return langcounts
+def read_freqs(filename, cutoff=0):
+    raw_counts = defaultdict(float)
+    total = 0.
+    with open(filename, encoding='utf-8', newline='') as infile:
+        reader = csv.reader(infile)
+        for key, strval in reader:
+            val = float(strval)
+            if val < cutoff:
+                break
+            for token in retokenize(key):
+                token = fix_text(token)
+                total += val
+                # Use += so that, if we give the reader concatenated files with
+                # duplicates, it does the right thing
+                raw_counts[token] += val
+
+    freqs = {key: raw_count / total
+             for (key, raw_count) in raw_counts.items()}
+    return freqs
+
+
+def freqs_to_dBpack(in_filename, out_filename, cutoff=-60):
+    freq_cutoff = 10 ** (cutoff / 10.)
+    freqs = read_freqs(in_filename, freq_cutoff)
+    dBpack = []
+    for token, freq in freqs.items():
+        dB = round(math.log10(freq) * 10)
+        if dB >= cutoff:
+            neg_dB = -dB
+            while neg_dB >= len(dBpack):
+                dBpack.append([])
+            dBpack[neg_dB].append(token)
+
+    with gzip.open(out_filename, 'wb') as outfile:
+        msgpack.dump(dBpack, outfile)
 
 
-def merge_counts(count_dicts, balance=False):
+def merge_freqs(freq_dicts):
+    vocab = set()
+    for freq_dict in freq_dicts:
+        vocab |= set(freq_dict)
+
     merged = defaultdict(float)
-    maxweight = None
-    for counts in count_dicts:
-        if balance:
-            if maxweight is None:
-                maxweight = max(counts.values())
-            weight = maxweight / max(counts.values()) / len(count_dicts)
-        else:
-            weight = 1.
-        for key, val in counts.items():
-            merged[key] += val * weight
+    N = len(freq_dicts)
+    for term in vocab:
+        term_total = 0.
+        for freq_dict in freq_dicts:
+            term_total += freq_dict.get(term, 0.)
+        merged[term] = term_total / N
+
     return merged
 
 
-def write_counts(counts, path, cutoff=2):
-    print("Writing to %s" % path)
-    with path.open('w', encoding='utf-8', newline='') as outfile:
+def write_wordlist(freqs, filename, cutoff=1e-8):
+    """
+    Write a dictionary of either raw counts or frequencies to a file of
+    comma-separated values.
+    """
+    with open(filename, 'w', encoding='utf-8', newline='\n') as outfile:
         writer = csv.writer(outfile)
-        items = sorted(counts.items(), key=itemgetter(1), reverse=True)
-        for word, count in items:
-            if count < cutoff:
-                # Don't write all the terms that appeared too infrequently
-                break
+        items = sorted(freqs.items(), key=itemgetter(1), reverse=True)
+        for word, freq in items:
+            if freq < cutoff:
+                break
             if not ('"' in word or ',' in word):
-                writer.writerow([word, str(int(count))])
+                writer.writerow([word, str(freq)])
-
-
-class WordCountBuilder:
-    def __init__(self, unique_docs=True, tokenizer=None):
-        self.counts = defaultdict(int)
-        self.unique_docs = unique_docs
-        if tokenizer is None:
-            self.tokenizer = treebank_surface_tokenizer
-        else:
-            self.tokenizer = tokenizer
-
-    def add_text(self, text):
-        text = normalize('NFKC', text).lower()
-        try:
-            tokens = self.tokenizer(text)
-            # print(' '.join(tokens))
-        except Exception as e:
-            print("Couldn't tokenize due to %r: %s" % (e, text), file=sys.stderr)
-            return
-        if self.unique_docs:
-            tokens = set(tokens)
-        for tok in tokens:
-            self.counts[tok] += 1
-
-    def count_wikipedia(self, path):
-        """
-        Read a directory of extracted Wikipedia articles. The articles can be
-        grouped together into files, in which case they should be separated by
-        lines beginning with ##.
-        """
-        with path.open(encoding='utf-8') as file:
-            article_lines = []
-            for line in file:
-                line = line.strip()
-                if line.startswith('= ') and line.endswith(' ='):
-                    # Fake level-1 headings indicate boundaries between articles
-                    print(line)
-                    self.try_wiki_article(' '.join(article_lines))
-                    article_lines.clear()
-                else:
-                    # Skip other headings, so that "external" doesn't look
-                    # ridiculously common, for example
-                    if not (line.startswith('==') and line.endswith('==')):
-                        article_lines.append(line)
-            self.try_wiki_article(' '.join(article_lines))
-
-    def try_wiki_article(self, text):
-        if len(text) > 1000:
-            self.add_text(text)
-
-    def count_twitter(self, path, offset, nsplit):
-        with path.open(encoding='utf-8') as file:
-            for i, line in enumerate(file):
-                if i % nsplit == offset:
-                    line = line.strip()
-                    text = line.split('\t')[-1]
-                    self.add_text(text)
-
-    def save_wordlist(self, path):
-        write_counts(self.counts, path)

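The 'dBpack' written by freqs_to_dBpack groups words by how many decibels their frequency lies below 1. A sketch of just that arithmetic, leaving out the msgpack/gzip serialization above:

# Sketch of the dB bucketing used by freqs_to_dBpack, without the file I/O.
# A frequency f goes into bucket -round(10 * log10(f)); reading it back only
# recovers the frequency to the nearest decibel.
import math

def dB_bucket(freq):
    return -round(math.log10(freq) * 10)

def approx_freq(bucket):
    return 10 ** (-bucket / 10)

assert dB_bucket(0.001) == 30      # 1e-3 is 30 dB below a frequency of 1
assert dB_bucket(1e-6) == 60       # the default cutoff of -60 dB
assert abs(approx_freq(30) - 0.001) < 1e-9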