Merge branch 'ninja-build'

Conflicts:
	wordfreq_builder/cmd_count_twitter.py
	wordfreq_builder/cmd_count_wikipedia.py
Robyn Speer 2015-05-08 00:01:01 -04:00
commit cddeae0acb
19 changed files with 687 additions and 185 deletions


@@ -6,3 +6,7 @@ dist
*.egg-info
build
_build
build.ninja
data
.ninja_deps
.ninja_log

wordfreq_builder/Makefile

@@ -0,0 +1,12 @@
PYTHON = python

all: build.ninja

# make sure this package is in 'develop' mode and up to date
wordfreq_builder.egg-info/PKG-INFO: setup.py
	$(PYTHON) setup.py develop

# build the Ninja file that will take over the build process
build.ninja: rules.ninja wordfreq_builder/ninja.py wordfreq_builder/config.py wordfreq_builder.egg-info/PKG-INFO
	$(PYTHON) -m wordfreq_builder.cli.build_deps rules.ninja > build.ninja


@@ -0,0 +1,63 @@
# This defines the rules for building parts of the wordfreq lists, using the
# Ninja build system:
#
#   http://martine.github.io/ninja/manual.html
#
# Ninja is available in the 'ninja-build' Ubuntu package. It's like make with
# better parallelism and the ability for build steps to produce multiple
# outputs. The tradeoff is that its rule syntax isn't full of magic for
# expanding wildcards and finding dependencies, so in general you have to
# write the dependencies using a script.
#
# This file will become the header of the larger build.ninja file, which also
# contains the programmatically-defined dependency graph.
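#
# A generated build statement refers back to a rule defined below; for
# English Wikipedia, for instance, the counts file would be produced by a
# line roughly like:
#
#   build data/generated/wikipedia/wikipedia_en.counts.txt: count data/generated/wikipedia/wikipedia_en.tokens.txt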
# Variables
DATA = ./data

# Splits the single file $in into $slices parts, whose names will be
# $prefix plus a two-digit numeric suffix.
rule split
  command = mkdir -p $$(dirname $prefix) && split -d -n r/$slices $in $prefix

# wiki2text is a tool I wrote using Nim 0.11, which extracts plain text from
# Wikipedia dumps obtained from dumps.wikimedia.org. The code is at
# https://github.com/rspeer/wiki2text.
rule wiki2text
  command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out

rule wiki2tokens
  command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text -t > $out

rule tokenize_japanese
  command = mkdir -p $$(dirname $out) && mecab < $in | cut -f 1 | grep -v "EOS" > $out

rule tokenize_twitter
  command = mkdir -p $$(dirname $prefix) && python -m wordfreq_builder.cli.pretokenize_twitter $in $prefix

rule format_twitter
  command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.format_twitter $in $out

# To convert the Leeds corpus, look for space-separated lines that start with
# an integer and a decimal. The integer is the rank, which we discard. The
# decimal is the frequency, and the remaining text is the term. Use sed -n
# with /p to output only lines where the match was successful.
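# For example, an input line such as "42 1306.30 example" (rank, frequency,
# term) comes out as "example,1306.30".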
rule convert_leeds
  command = mkdir -p $$(dirname $out) && sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in > $out

# To convert the OpenSubtitles frequency data, simply replace spaces with
# commas.
rule convert_opensubtitles
  command = mkdir -p $$(dirname $out) && tr ' ' ',' < $in > $out

rule count
  command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.count_tokens $in $out

rule merge
  command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.combine_lists -o $out $in

rule freqs2dB
  command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.freqs_to_dB $in $out

rule cat
  command = cat $in > $out


@@ -9,4 +9,13 @@ setup(
    platforms=["any"],
    description="Turns raw data into word frequency lists",
    packages=['wordfreq_builder'],
    install_requires=['msgpack-python'],
    entry_points={
        'console_scripts': [
            'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main',
            'wordfreq-format-twitter = wordfreq_builder.cli.format_twitter:main',
            'wordfreq-tokenize-wikipedia = wordfreq_builder.cli.tokenize_wikipedia:main',
            'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main'
        ]
    }
)


@@ -0,0 +1,15 @@
from wordfreq_builder.ninja import make_ninja_deps
import argparse


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('in_filename', help='filename of rules file')
    args = parser.parse_args()

    # Make the complete ninja file and write it to standard out
    make_ninja_deps(args.in_filename)


if __name__ == '__main__':
    main()


@@ -0,0 +1,19 @@
from wordfreq_builder.word_counts import read_freqs, merge_freqs, write_wordlist
import argparse


def merge_lists(input_names, output_name):
    freq_dicts = []
    for input_name in input_names:
        freq_dicts.append(read_freqs(input_name, cutoff=2))
    merged = merge_freqs(freq_dicts)
    write_wordlist(merged, output_name)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
    args = parser.parse_args()
    merge_lists(args.inputs, args.output)


@@ -0,0 +1,16 @@
from wordfreq_builder.word_counts import count_tokens, write_wordlist
import argparse


def handle_counts(filename_in, filename_out):
    counts = count_tokens(filename_in)
    write_wordlist(counts, filename_out)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('filename_in', help='name of input file containing tokens')
    parser.add_argument('filename_out', help='name of output file')
    args = parser.parse_args()
    handle_counts(args.filename_in, args.filename_out)


@@ -0,0 +1,14 @@
from wordfreq_builder.tokenizers import retokenize_file
import argparse


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('in_filename', help='filename of input file containing one tweet per line')
    parser.add_argument('out_filename', help='filename of output file')
    args = parser.parse_args()
    retokenize_file(args.in_filename, args.out_filename)


if __name__ == '__main__':
    main()


@@ -0,0 +1,11 @@
from wordfreq_builder.word_counts import freqs_to_dBpack
import argparse


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('filename_in', help='name of input file containing tokens')
    parser.add_argument('filename_out', help='name of output file')
    args = parser.parse_args()
    freqs_to_dBpack(args.filename_in, args.filename_out)


@@ -0,0 +1,19 @@
from wordfreq_builder.tokenizers import rosette_surface_tokenizer, pretokenize_file
import argparse


def pretokenize_twitter(in_filename, out_prefix):
    pretokenize_file(in_filename, out_prefix,
                     tokenizer=rosette_surface_tokenizer)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help='filename of input file containing one tweet per line')
    parser.add_argument('outprefix', help='prefix of output filenames')
    args = parser.parse_args()
    pretokenize_twitter(args.filename, args.outprefix)


if __name__ == '__main__':
    main()


@@ -0,0 +1,30 @@
from wordfreq_builder.tokenizers import rosette_surface_tokenizer, monolingual_tokenize_file
import argparse


def tokenize_wikipedia(in_filename, out_filename, language, proportion):
    monolingual_tokenize_file(
        in_filename, out_filename,
        language=language,
        tokenizer=rosette_surface_tokenizer,
        line_reader=strip_headings,
        sample_proportion=proportion
    )


def strip_headings(text):
    return text.strip().strip('=')


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('in_filename', help='filename of input file')
    parser.add_argument('out_filename', help='filename of output file')
    parser.add_argument('language', help='the language code of the text')
    parser.add_argument('-p', '--proportion', help='process 1/n of the lines (default 100)', type=int, default=100)
    args = parser.parse_args()
    tokenize_wikipedia(args.in_filename, args.out_filename, args.language, args.proportion)


if __name__ == '__main__':
    main()


@@ -1,21 +0,0 @@
from wordfreq_builder.word_counts import read_counts, write_counts, merge_counts
from pathlib import Path
import argparse


def merge_lists(input_names, output_name, balance=False):
    count_dicts = []
    for input_name in input_names:
        count_dicts.append(read_counts(Path(input_name)))
    merged = merge_counts(count_dicts, balance=balance)
    write_counts(merged, Path(output_name))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
    parser.add_argument('-b', '--balance', action='store_true', help='Automatically balance unequally-sampled word frequencies')
    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
    args = parser.parse_args()
    merge_lists(args.inputs, args.output, balance=args.balance)


@@ -1,27 +0,0 @@
from wordfreq_builder.word_counts import WordCountBuilder
from wordfreq_builder.tokenizers import rosette_tokenizer, rosette_surface_tokenizer
from pathlib import Path
import argparse


def count_twitter(pathname, offset=0, nsplit=1, surface=True):
    path = Path(pathname)
    if surface == True:
        tokenizer = rosette_surface_tokenizer
    else:
        tokenizer = rosette_tokenizer
    builder = WordCountBuilder(tokenizer=tokenizer)
    save_filename = 'twitter-counts-%d.csv' % offset
    save_pathname = path.parent / save_filename
    builder.count_twitter(path, offset, nsplit)
    builder.save_wordlist(save_pathname)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help='filename of input file containing one tweet per line')
    parser.add_argument('offset', type=int)
    parser.add_argument('nsplit', type=int)
    args = parser.parse_args()
    count_twitter(args.filename, args.offset, args.nsplit, surface=True)


@@ -1,23 +0,0 @@
from wordfreq_builder.word_counts import WordCountBuilder
from wordfreq_builder.tokenizers import rosette_tokenizer, rosette_surface_tokenizer
from pathlib import Path
import argparse


def count_wikipedia(filename, surface=True):
    path = Path(filename)
    if surface == True:
        tokenizer = rosette_surface_tokenizer
    else:
        tokenizer = rosette_tokenizer
    builder = WordCountBuilder(tokenizer=tokenizer, unique_docs=False)
    builder.count_wikipedia(path)
    builder.save_wordlist(path.parent / 'counts.csv')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help='flat text file containing extracted Wikipedia text')
    args = parser.parse_args()
    count_wikipedia(args.filename, surface=True)


@@ -0,0 +1,69 @@
import os

CONFIG = {
    'version': '0.9.0',
    # data_dir is a relative or absolute path to where the wordlist data
    # is stored
    'data_dir': 'data',
    'sources': {
        # A list of language codes (possibly un-standardized) that we'll
        # look up in filenames for these various data sources.
        'twitter': [
            'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
            'pt', 'ru',
            # can be added later: 'th', 'tr'
        ],
        'wikipedia': [
            'ar', 'de', 'en', 'es', 'fr', 'id', 'it', 'ja', 'ko', 'ms', 'nl',
            'pt', 'ru'
        ],
        'opensubtitles': [
            # All languages where the most common word in OpenSubtitles
            # appears at least 5000 times
            'ar', 'bg', 'bs', 'ca', 'cs', 'da', 'de', 'el', 'en', 'es', 'et',
            'fa', 'fi', 'fr', 'he', 'hr', 'hu', 'id', 'is', 'it', 'lt', 'lv',
            'mk', 'ms', 'nb', 'nl', 'pl', 'pt', 'ro', 'sk', 'sl', 'sq', 'sr',
            'sv', 'tr', 'uk', 'zh'
        ],
        'leeds': [
            'ar', 'de', 'el', 'en', 'es', 'fr', 'it', 'ja', 'pt', 'ru', 'zh'
        ]
    },
    'wordlist_paths': {
        'twitter': 'generated/twitter/tweets-2014.{lang}.{ext}',
        'wikipedia': 'generated/wikipedia/wikipedia_{lang}.{ext}',
        'opensubtitles': 'generated/opensubtitles/opensubtitles_{lang}.{ext}',
        'leeds': 'generated/leeds/leeds_internet_{lang}.{ext}',
        'combined': 'generated/combined/combined_{lang}.{ext}'
    },
    'min_sources': 2
}


def data_filename(filename):
    return os.path.join(CONFIG['data_dir'], filename)


def wordlist_filename(source, language, extension='txt'):
    path = CONFIG['wordlist_paths'][source].format(
        lang=language, ext=extension
    )
    return data_filename(path)


def source_names(language):
    """
    Get the names of data sources that supply data for the given language.
    """
    return sorted([key for key in CONFIG['sources']
                   if language in CONFIG['sources'][key]])


def all_languages():
    languages = set()
    for langlist in CONFIG['sources'].values():
        languages |= set(langlist)
    return [lang for lang in sorted(languages)
            if len(source_names(lang)) >= CONFIG['min_sources']]
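
As a quick illustration (not part of the changed files), the helpers above resolve paths and sources like this, assuming the default 'data' directory:

    >>> from wordfreq_builder.config import wordlist_filename, source_names
    >>> wordlist_filename('wikipedia', 'en', 'counts.txt')
    'data/generated/wikipedia/wikipedia_en.counts.txt'
    >>> source_names('en')
    ['leeds', 'opensubtitles', 'twitter', 'wikipedia']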


@@ -0,0 +1,199 @@
from wordfreq_builder.config import (
    CONFIG, data_filename, wordlist_filename, all_languages, source_names
)
import sys
import pathlib

HEADER = """# This file is automatically generated. Do not edit it.
# You can regenerate it using the 'wordfreq-build-deps' command.
"""
TMPDIR = data_filename('tmp')

# Set this to True to rebuild the Twitter tokenization (which takes days)
PRETOKENIZE_TWITTER = False


def add_dep(lines, rule, input, output, extra=None, params=None):
    if isinstance(output, list):
        output = ' '.join(output)
    if isinstance(input, list):
        input = ' '.join(input)
    if extra:
        if isinstance(extra, list):
            extra = ' '.join(extra)
        extrastr = ' | ' + extra
    else:
        extrastr = ''
    build_rule = "build {output}: {rule} {input}{extra}".format(
        output=output, rule=rule, input=input, extra=extrastr
    )
    lines.append(build_rule)
    if params:
        for key, val in params.items():
            lines.append("  {key} = {val}".format(key=key, val=val))
    lines.append("")
def make_ninja_deps(rules_filename, out=sys.stdout):
    """
    Output a complete Ninja file describing how to build the wordfreq data.
    """
    print(HEADER, file=out)
    # Copy in the rules section
    with open(rules_filename, encoding='utf-8') as rulesfile:
        print(rulesfile.read(), file=out)

    lines = []
    if PRETOKENIZE_TWITTER:
        lines.extend(
            twitter_preprocess_deps(
                data_filename('raw-input/twitter/all-2014.txt'),
                slice_prefix=data_filename('slices/twitter/tweets-2014'),
                combined_prefix=data_filename('intermediate/twitter/tweets-2014'),
                slices=40,
                languages=CONFIG['sources']['twitter']
            )
        )
    lines.extend(
        twitter_deps(
            data_filename('intermediate/twitter/tweets-2014'),
            languages=CONFIG['sources']['twitter']
        )
    )
    lines.extend(
        wikipedia_deps(
            data_filename('raw-input/wikipedia'),
            CONFIG['sources']['wikipedia']
        )
    )
    lines.extend(
        leeds_deps(
            data_filename('source-lists/leeds'),
            CONFIG['sources']['leeds']
        )
    )
    lines.extend(
        opensubtitles_deps(
            data_filename('source-lists/opensubtitles'),
            CONFIG['sources']['opensubtitles']
        )
    )
    lines.extend(combine_lists(all_languages()))

    print('\n'.join(lines), file=out)


def wikipedia_deps(dirname_in, languages):
    lines = []
    path_in = pathlib.Path(dirname_in)
    for language in languages:
        # Find the most recent file for this language
        input_file = max(path_in.glob(
            '{}wiki*.bz2'.format(language)
        ))
        raw_file = wordlist_filename('wikipedia', language, 'txt')
        token_file = wordlist_filename('wikipedia', language, 'tokens.txt')
        count_file = wordlist_filename('wikipedia', language, 'counts.txt')

        add_dep(lines, 'wiki2text', input_file, raw_file)
        add_dep(lines, 'wiki2tokens', input_file, token_file)
        add_dep(lines, 'count', token_file, count_file)
    return lines


def twitter_preprocess_deps(input_filename, slice_prefix,
                            combined_prefix, slices, languages):
    lines = []

    slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num)
                   for num in range(slices)]
    # split the input into slices
    add_dep(lines,
            'split', input_filename, slice_files,
            params={'prefix': '{}.part'.format(slice_prefix),
                    'slices': slices})

    for slicenum in range(slices):
        slice_file = slice_files[slicenum]
        language_outputs = [
            '{prefix}.{lang}.txt'.format(prefix=slice_file, lang=language)
            for language in languages
        ]
        add_dep(lines, 'tokenize_twitter', slice_file, language_outputs,
                params={'prefix': slice_file})

    for language in languages:
        combined_output = '{prefix}.{lang}.txt'.format(prefix=combined_prefix, lang=language)
        language_inputs = [
            '{prefix}.{lang}.txt'.format(prefix=slice_files[slicenum], lang=language)
            for slicenum in range(slices)
        ]
        add_dep(lines, 'cat', language_inputs, combined_output)
    return lines


def twitter_deps(prefix_in, languages):
    lines = []
    for language in languages:
        input_file = '{prefix}.{lang}.txt'.format(prefix=prefix_in, lang=language)
        token_file = wordlist_filename('twitter', language, 'tokens.txt')
        add_dep(lines,
                'format_twitter', input_file, token_file,
                extra='wordfreq_builder/tokenizers.py')

        count_file = wordlist_filename('twitter', language, 'counts.txt')
        add_dep(lines, 'count', token_file, count_file)
    return lines


def leeds_deps(dirname_in, languages):
    lines = []
    for language in languages:
        input_file = '{prefix}/internet-{lang}-forms.num'.format(
            prefix=dirname_in, lang=language
        )
        reformatted_file = wordlist_filename('leeds', language, 'counts.txt')
        add_dep(lines, 'convert_leeds', input_file, reformatted_file)
    return lines


def opensubtitles_deps(dirname_in, languages):
    lines = []
    for language in languages:
        input_file = '{prefix}/{lang}.txt'.format(
            prefix=dirname_in, lang=language
        )
        reformatted_file = wordlist_filename('opensubtitles', language, 'counts.txt')
        add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file)
    return lines


def combine_lists(languages):
    lines = []
    for language in languages:
        sources = source_names(language)
        input_files = [
            wordlist_filename(source, language, 'counts.txt')
            for source in sources
        ]
        output_file = wordlist_filename('combined', language)
        add_dep(lines, 'merge', input_files, output_file,
                extra='wordfreq_builder/word_counts.py')

        output_dBpack = wordlist_filename('combined', language, 'msgpack.gz')
        add_dep(lines, 'freqs2dB', output_file, output_dBpack,
                extra='wordfreq_builder/word_counts.py')
    return lines


def main():
    make_ninja_deps('rules.ninja')


if __name__ == '__main__':
    main()


@@ -1,33 +1,153 @@
from lumi_science.text_readers.rosette_readers import RosetteReader
from html.entities import name2codepoint
import re

ROSETTE = RosetteReader()


def rosette_tokenizer(text):
    analysis, lang = ROSETTE.rosette.analyze(text)
    # I'm aware this doesn't do the right things with multi-word stems.
    # Wordfreq doesn't either. And wordfreq isn't designed to look up
    # multiple words anyway.
    tokens = []
    for (stem, pos, span) in analysis:
        for subtoken in stem.split(' '):
            tokens.append(subtoken + '|' + lang)
    return tokens


# Some of Rosette's language codes are incorrect. For example, 'zh_sc' should
# mean "Chinese as used in Seychelles", which is kind of nonsense. What Rosette
# really means is "Simplified Chinese", whose code is 'zh-Hans'.
ROSETTE_LANG_MAP = {
    'zh_sc': 'zh-Hans',
    'zh_tc': 'zh-Hant',
    'en_uc': 'en',
}

NON_PUNCT_RE = re.compile('[0-9A-Za-z\xc0-\u1fff\u2070-\u2fff\u301f-\ufeff---\uff66-\U0002ffff]')
EMOTICON_RANGE = '\u2600-\u26ff\U0001F000-\U0001F7FF'
RETOKENIZE_RE = re.compile('[{0}#@/]|[^{0}#@/ ]+'.format(EMOTICON_RANGE))
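# RETOKENIZE_RE matches either one character that is an emoticon, '#', '@', or
# '/', or else a run of characters containing none of those and no spaces.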
def last_tab(line):
    """
    Read lines by keeping only the last tab-separated value.
    """
    return line.split('\t')[-1].strip()


def lowercase_text_filter(token):
    if NON_PUNCT_RE.search(token):
        return token.lower()
    else:
        return None


def is_url(token):
    return token.startswith('http:') or token.startswith('https:')


def pretokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
    """
    Process a file by running it through the given tokenizer, sorting the
    results by the language of each line, and inserting spaces into lines
    to mark the token boundaries. This computes the 'hard part' of
    tokenization and allows the results to be saved, so that we can change
    the finer details of the output without re-running everything.
    """
    out_files = {}
    for line in open(in_filename, encoding='utf-8'):
        text = line_reader(line)
        tokens, language = tokenizer(text)
        tokenized = ' '.join(tokens)
        if language is not None:
            out_filename = '%s.%s.txt' % (out_prefix, language)
            if out_filename in out_files:
                out_file = out_files[out_filename]
            else:
                out_file = open(out_filename, 'w', encoding='utf-8')
                out_files[out_filename] = out_file
            print(tokenized, file=out_file)
    for out_file in out_files.values():
        out_file.close()


ENTITY_RE = re.compile(r'& ?(amp|quot|lt|gt) ?;')


def fix_entities(text):
    """
    Fix the few HTML entities that Twitter uses -- even if they've
    already been tokenized.
    """
    def replace_entity(match):
        return chr(name2codepoint[match.group(1)])
    return ENTITY_RE.sub(replace_entity, text)


def retokenize(text):
    text = fix_entities(text)
    tokens = RETOKENIZE_RE.findall(text)
    skip_next = False
    for token in tokens:
        if token == '/' or token == '@':
            # Avoid idiosyncratic tokens such as URLs and
            # usernames
            skip_next = True
        elif skip_next:
            skip_next = False
        else:
            if not is_url(token):
                filtered = lowercase_text_filter(token)
                if filtered:
                    yield filtered
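# Roughly speaking, retokenize("@user check out http://t.co/abc") yields only
# 'check' and 'out': the username following '@' and the pieces of the URL are
# skipped or filtered. (Illustrative input, not taken from the data.)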
def retokenize_file(in_filename, out_filename):
    """
    Process a file that has been tokenized (by inserting spaces) in a
    language-specific way by Rosette.
    """
    with open(in_filename, encoding='utf-8') as in_file:
        with open(out_filename, 'w', encoding='utf-8') as out_file:
            for line in in_file:
                skip_next = False
                for token in retokenize(line.strip()):
                    if skip_next:
                        skip_next = False
                    elif token == '/' or token == '@':
                        # Avoid idiosyncratic tokens such as URLs and
                        # usernames
                        skip_next = True
                    elif lowercase_text_filter(token):
                        print(token, file=out_file)


def monolingual_tokenize_file(in_filename, out_filename, language,
                              tokenizer, line_reader=last_tab,
                              token_filter=lowercase_text_filter,
                              sample_proportion=100):
    with open(in_filename, encoding='utf-8', errors='replace') as in_file:
        with open(out_filename, 'w', encoding='utf-8') as out_file:
            for i, line in enumerate(in_file):
                if i % sample_proportion == 0:
                    text = line_reader(line)
                    tokens, line_language = tokenizer(text)
                    if line_language == language:
                        filtered = [token_filter(t) for t in tokens]
                        filtered = [t for t in filtered if t is not None]
                        for token in filtered:
                            print(token, file=out_file)


def rosette_surface_tokenizer(text):
    analysis, lang = ROSETTE.rosette.analyze(text)
    try:
        analysis, lang = ROSETTE.rosette.analyze(text)
    except (RuntimeError, UnicodeError):
        # Our Rosette interface throws errors given arbitrary data. :(
        return text, None
    language = ROSETTE_LANG_MAP.get(lang, lang)
    tokens = []
    for (stem, pos, span) in analysis:
        surface_text = text[span[0]:span[1]]
        for subtoken in surface_text.split(' '):
            tokens.append(subtoken + '|' + lang)
    return tokens
        tokens.append(surface_text)
    return tokens, language


def treebank_surface_tokenizer(text):
def treebank_surface_tokenizer(text, language='en'):
    """
    This is a simplified version of the Treebank tokenizer in NLTK.
@@ -45,6 +165,10 @@ def treebank_surface_tokenizer(text):
    as a result -- for example, it splits "wanna" into "wan" and "na", which
    are supposed to be considered unusual surface forms of "want" and "to".
    We just leave it as the word "wanna".

    The language will just be returned, as this function isn't doing any
    language detection. It defaults to 'en', as English is the language that
    Treebank tokenization is designed for.
    """
    #starting quotes
    text = re.sub(r'^\"', r'``', text)
@@ -80,4 +204,4 @@ def treebank_surface_tokenizer(text):
    text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ",
                  text)

    return text.split()
    return text.split(), language


@@ -1,116 +1,85 @@
from wordfreq_builder.tokenizers import treebank_surface_tokenizer
from wordfreq_builder.tokenizers import retokenize
from collections import defaultdict
from operator import itemgetter
from pathlib import Path
from unicodedata import normalize
from ftfy import fix_text
import math
import csv
import sys
import msgpack
import gzip


def read_counts(path):
def count_tokens(filename):
    counts = defaultdict(int)
    with path.open(encoding='utf-8', newline='') as infile:
        reader = csv.reader(infile)
        for key, strval in reader:
            val = float(strval)
            # Use += so that, if we give the reader concatenated files with
            # duplicates, it does the right thing
            counts[key] += val
    with open(filename, encoding='utf-8') as infile:
        for line in infile:
            for token in retokenize(line.strip()):
                counts[token] += 1
    return counts


def count_languages(counts):
    langcounts = defaultdict(int)
    for key, strval in counts.items():
        val = int(strval)
        text, lang = key.rsplit('|', 1)
        langcounts[lang] += val
    return langcounts


def read_freqs(filename, cutoff=0):
    raw_counts = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        reader = csv.reader(infile)
        for key, strval in reader:
            val = float(strval)
            if val < cutoff:
                break
            for token in retokenize(key):
                token = fix_text(token)
                total += val
                # Use += so that, if we give the reader concatenated files with
                # duplicates, it does the right thing
                raw_counts[token] += val

    freqs = {key: raw_count / total
             for (key, raw_count) in raw_counts.items()}
    return freqs


def merge_counts(count_dicts, balance=False):
def freqs_to_dBpack(in_filename, out_filename, cutoff=-60):
    freq_cutoff = 10 ** (cutoff / 10.)
    freqs = read_freqs(in_filename, freq_cutoff)
    dBpack = []
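    # dBpack[n] will hold every word whose frequency rounds to -n dB; for
    # example, a word with frequency 0.01 (-20 dB) ends up in dBpack[20].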
    for token, freq in freqs.items():
        dB = round(math.log10(freq) * 10)
        if dB >= cutoff:
            neg_dB = -dB
            while neg_dB >= len(dBpack):
                dBpack.append([])
            dBpack[neg_dB].append(token)

    with gzip.open(out_filename, 'wb') as outfile:
        msgpack.dump(dBpack, outfile)


def merge_freqs(freq_dicts):
    vocab = set()
    for freq_dict in freq_dicts:
        vocab |= set(freq_dict)

    merged = defaultdict(float)
    maxweight = None
    for counts in count_dicts:
        if balance:
            if maxweight is None:
                maxweight = max(counts.values())
            weight = maxweight / max(counts.values()) / len(count_dicts)
        else:
            weight = 1.
        for key, val in counts.items():
            merged[key] += val * weight
    N = len(freq_dicts)
    for term in vocab:
        term_total = 0.
        for freq_dict in freq_dicts:
            term_total += freq_dict.get(term, 0.)
        merged[term] = term_total / N
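        # e.g. a term with frequency 1e-5 in one of three sources and absent
        # from the others gets a merged frequency of 1e-5 / 3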
    return merged


def write_counts(counts, path, cutoff=2):
    print("Writing to %s" % path)
    with path.open('w', encoding='utf-8', newline='') as outfile:
def write_wordlist(freqs, filename, cutoff=1e-8):
    """
    Write a dictionary of either raw counts or frequencies to a file of
    comma-separated values.
    """
    with open(filename, 'w', encoding='utf-8', newline='\n') as outfile:
        writer = csv.writer(outfile)
        items = sorted(counts.items(), key=itemgetter(1), reverse=True)
        for word, count in items:
            if count < cutoff:
                # Don't write all the terms that appeared too infrequently
        items = sorted(freqs.items(), key=itemgetter(1), reverse=True)
        for word, freq in items:
            if freq < cutoff:
                break
            if not ('"' in word or ',' in word):
                writer.writerow([word, str(int(count))])


class WordCountBuilder:
    def __init__(self, unique_docs=True, tokenizer=None):
        self.counts = defaultdict(int)
        self.unique_docs = unique_docs
        if tokenizer is None:
            self.tokenizer = treebank_surface_tokenizer
        else:
            self.tokenizer = tokenizer

    def add_text(self, text):
        text = normalize('NFKC', text).lower()
        try:
            tokens = self.tokenizer(text)
            # print(' '.join(tokens))
        except Exception as e:
            print("Couldn't tokenize due to %r: %s" % (e, text), file=sys.stderr)
            return
        if self.unique_docs:
            tokens = set(tokens)
        for tok in tokens:
            self.counts[tok] += 1

    def count_wikipedia(self, path):
        """
        Read a directory of extracted Wikipedia articles. The articles can be
        grouped together into files, in which case they should be separated by
        lines beginning with ##.
        """
        with path.open(encoding='utf-8') as file:
            article_lines = []
            for line in file:
                line = line.strip()
                if line.startswith('= ') and line.endswith(' ='):
                    # Fake level-1 headings indicate boundaries between articles
                    print(line)
                    self.try_wiki_article(' '.join(article_lines))
                    article_lines.clear()
                else:
                    # Skip other headings, so that "external" doesn't look
                    # ridiculously common, for example
                    if not (line.startswith('==') and line.endswith('==')):
                        article_lines.append(line)
            self.try_wiki_article(' '.join(article_lines))

    def try_wiki_article(self, text):
        if len(text) > 1000:
            self.add_text(text)

    def count_twitter(self, path, offset, nsplit):
        with path.open(encoding='utf-8') as file:
            for i, line in enumerate(file):
                if i % nsplit == offset:
                    line = line.strip()
                    text = line.split('\t')[-1]
                    self.add_text(text)

    def save_wordlist(self, path):
        write_counts(self.counts, path)
                writer.writerow([word, str(freq)])