Merge pull request #19 from LuminosoInsight/code-review-fixes-2015-07-17

Code review fixes 2015 07 17
This commit is contained in:
Rob Speer 2015-07-22 15:09:00 -04:00
commit 32102ba3c2
42 changed files with 151 additions and 158 deletions

View File

@ -23,8 +23,8 @@ install them on Ubuntu:
## Unicode data
The tokenizers used to split non-Japanese phrases use regexes built using the
`unicodedata` module from Python 3.4, which uses Unicode version 6.3.0. To
The tokenizers that split non-Japanese phrases utilize regexes built using the
`unicodedata` module from Python 3.4, which supports Unicode version 6.3.0. To
update these regexes, run `scripts/gen_regex.py`.
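As a hedged sketch of the idea behind regenerating those regexes (this is not the contents of `scripts/gen_regex.py`; the function name and the category set are illustrative), a character class can be derived from the running interpreter's `unicodedata` tables so it tracks that Python's Unicode version:

```python
# Hypothetical sketch, not the actual scripts/gen_regex.py.
import re
import sys
import unicodedata

def char_class(categories):
    """Return a regex character class covering every codepoint whose
    Unicode general category is in `categories`."""
    ranges = []
    start = None
    for cp in range(sys.maxunicode + 1):
        if unicodedata.category(chr(cp)) in categories:
            if start is None:
                start = cp
            end = cp
        elif start is not None:
            ranges.append((start, end))
            start = None
    if start is not None:
        ranges.append((start, end))
    return '[' + ''.join('\\U%08X-\\U%08X' % pair for pair in ranges) + ']'

if __name__ == '__main__':
    # Letters and decimal digits, a rough basis for a "word token" class
    token_class = char_class({'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nd'})
    re.compile(token_class + '+')   # sanity-check that the class compiles
    print(token_class[:60], '...')
```
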
## License
@ -58,4 +58,3 @@ Some additional data was collected by a custom application that watches the
streaming Twitter API, in accordance with Twitter's Developer Agreement &
Policy. This software only gives statistics about words that are very commonly
used on Twitter; it does not display or republish any Twitter content.

View File

@ -1,3 +1,5 @@
""" This file generates a graph of the dependencies for the ninja build."""
import sys
@ -26,4 +28,3 @@ def ninja_to_dot():
if __name__ == '__main__':
ninja_to_dot()

View File

@ -94,7 +94,7 @@ def test_failed_cB_conversion():
def test_tokenization():
# We preserve apostrophes within words, so "can't" is a single word in the
# data, while the fake word "plan't" can't be found.
# data
eq_(tokenize("can't", 'en'), ["can't"])
eq_(tokenize('😂test', 'en'), ['😂', 'test'])
@ -135,12 +135,20 @@ def test_not_enough_ascii():
random_ascii_words(lang='zh')
def test_ar():
# Remove tatweels
eq_(
tokenize('متــــــــعب', 'ar'),
['متعب']
)
# Remove combining marks
eq_(
tokenize('حَرَكَات', 'ar'),
['حركات']
)
eq_(
tokenize('إﻻ', 'ar'),
['إلا']
)

View File

@ -8,6 +8,8 @@ import itertools
import pathlib
import random
import logging
import unicodedata
logger = logging.getLogger(__name__)
@ -66,11 +68,21 @@ def tokenize(text, lang):
return mecab_tokenize(text)
if lang == 'ar':
text = COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
text = standardize_arabic(text)
return simple_tokenize(text)
def standardize_arabic(text):
"""
Standardizes Arabic text by removing tatweels and combining marks, then applying NFKC normalization.
"""
return unicodedata.normalize(
'NFKC',
COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
)
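
For quick reference, this is what the new Arabic handling does on the strings from the tests above; the NFKC step is what folds the lam-alef presentation form back into separate letters:

```python
import unicodedata

# Tatweel stripping: 'متــــــــعب' -> 'متعب'
print('متــــــــعب'.replace('ـ', ''))

# NFKC normalization folds the ligature ﻻ (U+FEFB) into ل + ا,
# which is why tokenize('إﻻ', 'ar') now yields ['إلا'].
print(unicodedata.normalize('NFKC', 'إﻻ'))
```
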
def read_cBpack(filename):
"""
Read a file from an idiosyncratic format that we use for storing
@ -257,6 +269,9 @@ def word_frequency(word, lang, wordlist='combined', minimum=0.):
If a word decomposes into multiple tokens, we'll return a smoothed estimate
of the word frequency that is no greater than the frequency of any of its
individual tokens.
Note that the current tokenizer does not support multi-word Chinese phrases.
"""
args = (word, lang, wordlist, minimum)
try:
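
The docstring above only promises an upper bound; the exact combination rule is not shown in this hunk. As a hedged sketch, one rule with that property (the helper name here is hypothetical) adds the reciprocals of the token frequencies:

```python
# Illustrative only: a combination rule whose result is never greater
# than any individual token's frequency, as the docstring requires.
def combined_frequency(token_freqs):
    if not token_freqs or 0.0 in token_freqs:
        return 0.0
    return 1.0 / sum(1.0 / f for f in token_freqs)

print(combined_frequency([0.01, 0.001]))   # ~0.00091, below both inputs
```
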

28 binary files changed; contents not shown.

View File

@ -47,8 +47,7 @@ Start the build, and find something else to do for a few hours:
ninja -v
You can copy the results into wordfreq with this command (supposing that
$WORDFREQ points to your wordfreq repo):
You can copy the results into wordfreq with this command:
cp data/dist/*.msgpack.gz ../wordfreq/data/
@ -83,6 +82,19 @@ The specific rules are described by the comments in `rules.ninja`.
## Data sources
### Wikipedia
Wikipedia is a "free-access, free-content Internet encyclopedia".
The dump files can be downloaded from the [Wikimedia dumps site][wikipedia].
The original files are in `data/raw-input/wikipedia`, and they're processed
by the `wiki2text` rule in `rules.ninja`. Parsing Wikipedia requires the
[wiki2text][] package.
[wikipedia]: https://dumps.wikimedia.org/backup-index.html
[wiki2text]: https://github.com/rspeer/wiki2text
### Leeds Internet Corpus
Also known as the "Web as Corpus" project, this is a University of Leeds
@ -102,7 +114,7 @@ by the `convert_leeds` rule in `rules.ninja`.
The file `data/raw-input/twitter/all-2014.txt` contains about 72 million tweets
collected by the `ftfy.streamtester` package in 2014.
It's not possible to distribute the text of tweets. However, this process could
We are not allowed to distribute the text of tweets. However, this process could
be reproduced by running `ftfy.streamtester`, part of the [ftfy][] package, for
a couple of weeks.
@ -162,4 +174,3 @@ longer represents the words 'don' and 'won', as we assume most of their
frequency comes from "don't" and "won't". Words that turned into similarly
common words, however, were left alone: this list doesn't represent "can't"
because the word was left as "can".

View File

@ -29,12 +29,12 @@ rule split
# Wikipedia dumps obtained from dumps.wikimedia.org. The code is at
# https://github.com/rspeer/wiki2text.
rule wiki2text
command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out
command = bunzip2 -c $in | wiki2text > $out
# To tokenize Japanese, we run it through Mecab and take the first column.
# We don't have a plan for tokenizing Chinese yet.
rule tokenize_japanese
command = mkdir -p $$(dirname $out) && mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out
command = mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out
# Tokenizing text from Twitter requires us to language-detect and tokenize
# in the same step.
@ -49,12 +49,12 @@ rule tokenize_twitter
# Grep out the term "EOS", an indication that Leeds used MeCab and didn't
# strip out the EOS lines.
rule convert_leeds
command = mkdir -p $$(dirname $out) && sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in | grep -v 'EOS,' > $out
command = sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in | grep -v 'EOS,' > $out
# To convert the OpenSubtitles frequency data, simply replace spaces with
# commas.
rule convert_opensubtitles
command = mkdir -p $$(dirname $out) && tr ' ' ',' < $in > $out
command = tr ' ' ',' < $in > $out
# Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
# the input files, keep only the single words and their counts, and only keep
@ -65,16 +65,16 @@ rule convert_opensubtitles
# source data was already filtered to only show words in roles with at least
# two-digit counts of occurrences.)
rule convert_google_syntactic_ngrams
command = mkdir -p $$(dirname $out) && zcat $in | cut -f 1,3 | grep -v '[,"]' | sed -rn 's/(.*)\s(...+)/\1,\2/p' > $out
command = zcat $in | cut -f 1,3 | grep -v '[,"]' | sed -rn 's/(.*)\s(...+)/\1,\2/p' > $out
rule count
command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.count_tokens $in $out
command = python -m wordfreq_builder.cli.count_tokens $in $out
rule merge
command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.combine_lists -o $out $in
command = python -m wordfreq_builder.cli.combine_lists -o $out $in
rule freqs2cB
command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.freqs_to_cB $in $out
command = python -m wordfreq_builder.cli.freqs_to_cB $lang $in $out
rule cat
command = cat $in > $out

View File

@ -9,12 +9,5 @@ setup(
platforms=["any"],
description="Turns raw data into word frequency lists",
packages=['wordfreq_builder'],
install_requires=['msgpack-python', 'pycld2'],
entry_points={
'console_scripts': [
'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main',
'wordfreq-format-twitter = wordfreq_builder.cli.format_twitter:main',
'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main'
]
}
install_requires=['msgpack-python', 'pycld2']
)

View File

@ -13,4 +13,3 @@ if __name__ == '__main__':
parser.add_argument('filename_out', help='name of output file')
args = parser.parse_args()
handle_counts(args.filename_in, args.filename_out)

View File

@ -4,8 +4,8 @@ import argparse
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('language', help='language of the input file')
parser.add_argument('filename_in', help='name of input file containing tokens')
parser.add_argument('filename_out', help='name of output file')
args = parser.parse_args()
freqs_to_cBpack(args.filename_in, args.filename_out)
freqs_to_cBpack(args.filename_in, args.filename_out, lang=args.language)

View File

@ -1,18 +1,13 @@
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_file
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_twitter
import argparse
def tokenize_twitter(in_filename, out_prefix):
tokenize_file(in_filename, out_prefix,
tokenizer=cld2_surface_tokenizer)
def main():
parser = argparse.ArgumentParser()
parser.add_argument('filename', help='filename of input file containing one tweet per line')
parser.add_argument('outprefix', help='prefix of output filenames')
args = parser.parse_args()
tokenize_twitter(args.filename, args.outprefix)
tokenize_twitter(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
if __name__ == '__main__':

View File

@ -10,10 +10,6 @@ HEADER = """# This file is automatically generated. Do not edit it.
TMPDIR = data_filename('tmp')
# Set this to True to rebuild the Twitter tokenization (which takes days)
TOKENIZE_TWITTER = True
def add_dep(lines, rule, input, output, extra=None, params=None):
if isinstance(output, list):
output = ' '.join(output)
@ -48,17 +44,15 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
# The first dependency is to make sure the build file is up to date.
add_dep(lines, 'build_deps', 'rules.ninja', 'build.ninja',
extra='wordfreq_builder/ninja.py')
if TOKENIZE_TWITTER:
lines.extend(
twitter_deps(
data_filename('raw-input/twitter/all-2014.txt'),
slice_prefix=data_filename('slices/twitter/tweets-2014'),
combined_prefix=data_filename('generated/twitter/tweets-2014'),
slices=40,
languages=CONFIG['sources']['twitter']
)
lines.extend(
twitter_deps(
data_filename('raw-input/twitter/all-2014.txt'),
slice_prefix=data_filename('slices/twitter/tweets-2014'),
combined_prefix=data_filename('generated/twitter/tweets-2014'),
slices=40,
languages=CONFIG['sources']['twitter']
)
)
lines.extend(
wikipedia_deps(
data_filename('raw-input/wikipedia'),
@ -92,17 +86,18 @@ def wikipedia_deps(dirname_in, languages):
path_in = pathlib.Path(dirname_in)
for language in languages:
# Find the most recent file for this language
# Skip over files that do not exist
input_file = max(path_in.glob(
'{}wiki*.bz2'.format(language)
))
input_file = max(path_in.glob('{}wiki*.bz2'.format(language)))
plain_text_file = wordlist_filename('wikipedia', language, 'txt')
count_file = wordlist_filename('wikipedia', language, 'counts.txt')
add_dep(lines, 'wiki2text', input_file, plain_text_file)
if language == 'ja':
mecab_token_file = wordlist_filename('wikipedia', language, 'mecab-tokens.txt')
add_dep(lines, 'tokenize_japanese', plain_text_file, mecab_token_file)
mecab_token_file = wordlist_filename(
'wikipedia', language, 'mecab-tokens.txt'
)
add_dep(
lines, 'tokenize_japanese', plain_text_file, mecab_token_file
)
add_dep(lines, 'count', mecab_token_file, count_file)
else:
add_dep(lines, 'count', plain_text_file, count_file)
@ -126,17 +121,18 @@ def google_books_deps(dirname_in):
return lines
def twitter_deps(input_filename, slice_prefix,
combined_prefix, slices, languages):
def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
languages):
lines = []
slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num)
slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix,
num=num)
for num in range(slices)]
# split the input into slices
add_dep(lines,
'split', input_filename, slice_files,
add_dep(lines, 'split', input_filename, slice_files,
params={'prefix': '{}.part'.format(slice_prefix),
'slices': slices})
'slices': slices})
for slicenum in range(slices):
slice_file = slice_files[slicenum]
@ -151,7 +147,9 @@ def twitter_deps(input_filename, slice_prefix,
combined_output = wordlist_filename('twitter', language, 'tokens.txt')
language_inputs = [
'{prefix}.{lang}.txt'.format(prefix=slice_files[slicenum], lang=language)
'{prefix}.{lang}.txt'.format(
prefix=slice_files[slicenum], lang=language
)
for slicenum in range(slices)
]
@ -160,11 +158,14 @@ def twitter_deps(input_filename, slice_prefix,
count_file = wordlist_filename('twitter', language, 'counts.txt')
if language == 'ja':
mecab_token_file = wordlist_filename('twitter', language, 'mecab-tokens.txt')
add_dep(lines, 'tokenize_japanese', combined_output, mecab_token_file)
add_dep(lines, 'count', mecab_token_file, count_file, extra='wordfreq_builder/tokenizers.py')
else:
add_dep(lines, 'count', combined_output, count_file, extra='wordfreq_builder/tokenizers.py')
mecab_token_file = wordlist_filename(
'twitter', language, 'mecab-tokens.txt')
add_dep(
lines, 'tokenize_japanese', combined_output, mecab_token_file)
combined_output = mecab_token_file
add_dep(lines, 'count', combined_output, count_file,
extra='wordfreq_builder/tokenizers.py')
return lines
@ -187,7 +188,8 @@ def opensubtitles_deps(dirname_in, languages):
input_file = '{prefix}/{lang}.txt'.format(
prefix=dirname_in, lang=language
)
reformatted_file = wordlist_filename('opensubtitles', language, 'counts.txt')
reformatted_file = wordlist_filename(
'opensubtitles', language, 'counts.txt')
add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file)
return lines
@ -205,18 +207,22 @@ def combine_lists(languages):
add_dep(lines, 'merge', input_files, output_file,
extra='wordfreq_builder/word_counts.py')
output_cBpack = wordlist_filename('combined-dist', language, 'msgpack.gz')
output_cBpack = wordlist_filename(
'combined-dist', language, 'msgpack.gz')
add_dep(lines, 'freqs2cB', output_file, output_cBpack,
extra='wordfreq_builder/word_counts.py')
extra='wordfreq_builder/word_counts.py',
params={'lang': language})
lines.append('default {}'.format(output_cBpack))
# Write standalone lists for Twitter frequency
if language in CONFIG['sources']['twitter']:
input_file = wordlist_filename('twitter', language, 'counts.txt')
output_cBpack = wordlist_filename('twitter-dist', language, 'msgpack.gz')
output_cBpack = wordlist_filename(
'twitter-dist', language, 'msgpack.gz')
add_dep(lines, 'freqs2cB', input_file, output_cBpack,
extra='wordfreq_builder/word_counts.py')
extra='wordfreq_builder/word_counts.py',
params={'lang': language})
lines.append('default {}'.format(output_cBpack))

View File

@ -1,63 +1,56 @@
from html.entities import name2codepoint
from wordfreq import tokenize, TOKEN_RE, NON_PUNCT_RANGE
from ftfy.fixes import unescape_html
import re
import pycld2
CLD2_BAD_CHAR_RANGE = "".join([
'[',
'\x00-\x08',
'\x0b',
'\x0e-\x1f',
'\x7f-\x9f',
'\ud800-\udfff',
'\ufdd0-\ufdef'] +
[chr(65534+65536*x+y) for x in range(17) for y in range(2)] +
[']'])
CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
[
'\x00-\x08',
'\x0b',
'\x0e-\x1f',
'\x7f-\x9f',
'\ud800-\udfff',
'\ufdd0-\ufdef'
] +
[chr(65534+65536*x+y) for x in range(17) for y in range(2)]
)
CLD2_BAD_CHARS_RE = re.compile(CLD2_BAD_CHAR_RANGE)
TWITTER_HANDLE_RE = re.compile('@{0}+'.format(NON_PUNCT_RANGE))
TCO_RE = re.compile('http(?:s)?://t.co/[a-zA-Z0-9]+'.format(NON_PUNCT_RANGE))
TCO_RE = re.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
def cld2_surface_tokenizer(text):
"""
Uses CLD2 to detect the language and the wordfreq tokenizer to create tokens.
"""
text = remove_handles_and_urls(text)
text = unescape_html(text)
text = TWITTER_HANDLE_RE.sub('', text)
text = TCO_RE.sub('', text)
lang = cld2_detect_language(text)
tokens = tokenize(text, lang)
return lang, tokens
def cld2_detect_language(text):
"""
Uses CLD2 to detect the language
"""
# Format of pycld2.detect:
#   (Whether the result is reliable: bool,
#    Number of bytes of text: int,
#    Detected languages, in order of certainty, as tuples of:
#      (Language name: str,
#       Language code: str,
#       Percent of text in this language: float,
#       Confidence score: float))
text = CLD2_BAD_CHARS_RE.sub('', text)
return pycld2.detect(text)[2][0][1]
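
A minimal usage sketch of `pycld2.detect`, matching the comment above (the sample sentence and the example numbers are illustrative):

```python
import pycld2

is_reliable, bytes_found, details = pycld2.detect(
    "Le renard brun saute par-dessus le chien paresseux."
)
# `details` holds up to three entries, e.g. ('FRENCH', 'fr', 99, 1024.0);
# cld2_detect_language takes the language code of the top entry.
print(is_reliable, details[0][1])
```
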
def remove_handles_and_urls(text):
text = fix_entities(text)
text = TWITTER_HANDLE_RE.sub('', text)
text = TCO_RE.sub('', text)
return text
def last_tab(line):
"""
Read lines by keeping only the last tab-separated value.
"""
return line.split('\t')[-1].strip()
def lowercase_text_filter(token):
"""
If this looks like a token that we want to count, return it, lowercased.
If not, filter it out by returning None.
"""
if TOKEN_RE.search(token):
return token.lower()
else:
return None
def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
def tokenize_twitter(in_filename, out_prefix, tokenizer):
"""
Process a file by running it through the given tokenizer, sorting the
results by the language of each line, and inserting newlines
@ -66,7 +59,7 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
out_files = {}
with open(in_filename, encoding='utf-8') as in_file:
for line in in_file:
text = line_reader(line)
text = line.split('\t')[-1].strip()
language, tokens = tokenizer(text)
if language != 'un':
tokenized = '\n'.join(tokens)
@ -79,37 +72,3 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
print(tokenized, file=out_file)
for out_file in out_files.values():
out_file.close()
ENTITY_RE = re.compile(r'& ?(amp|quot|lt|gt) ?;')
def fix_entities(text):
"""
Fix the few HTML entities that Twitter uses -- even if they've
already been tokenized.
"""
def replace_entity(match):
return chr(name2codepoint[match.group(1)])
return ENTITY_RE.sub(replace_entity, text)
def monolingual_tokenize_file(in_filename, out_filename, language,
tokenizer, line_reader=last_tab,
sample_proportion=1):
"""
Process a file by running it through the given tokenizer, only keeping
lines of the language we're asking for, and inserting newlines
to mark the token boundaries.
`line_reader` is applied to each line before it is given to the tokenizer.
Only the first line out of every `sample_proportion` lines is run through
the tokenizer.
"""
with open(in_filename, encoding='utf-8', errors='replace') as in_file:
with open(out_filename, 'w', encoding='utf-8') as out_file:
for i, line in enumerate(in_file):
if i % sample_proportion == 0:
text = line_reader(line)
tokens, line_language = tokenizer(text)
if line_language == language:
for token in tokens:
print(token, file=out_file)

View File

@ -1,4 +1,4 @@
from wordfreq import simple_tokenize
from wordfreq import simple_tokenize, tokenize
from collections import defaultdict
from operator import itemgetter
from ftfy import fix_text
@ -18,41 +18,49 @@ def count_tokens(filename):
counts = defaultdict(int)
with open(filename, encoding='utf-8', errors='replace') as infile:
for line in infile:
for token in simple_tokenize(line.strip()):
for token in simple_tokenize(line):
counts[token] += 1
return counts
def read_freqs(filename, cutoff=0):
def read_freqs(filename, cutoff=0, lang=None):
"""
Read words and their frequencies from a CSV file.
Only words with a frequency greater than `cutoff` are returned.
Only words with a frequency greater than or equal to `cutoff` are returned.
If `cutoff` is greater than 0, the CSV file must be sorted by frequency
in descending order.
If `lang` is given, read_freqs will apply language-specific preprocessing
operations.
"""
raw_counts = defaultdict(float)
total = 0.
with open(filename, encoding='utf-8', newline='') as infile:
reader = csv.reader(infile)
for key, strval in reader:
val = float(strval)
if val < cutoff:
break
for token in simple_tokenize(key):
tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
for token in tokens:
token = fix_text(token)
total += val
# Use += so that, if we give the reader concatenated files with
# duplicates, it does the right thing
raw_counts[token] += val
freqs = {key: raw_count / total
for (key, raw_count) in raw_counts.items()}
return freqs
for word in raw_counts:
raw_counts[word] /= total
return raw_counts
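
A usage sketch for the new `lang` parameter (the file name and its contents are made up for illustration); the input is the `word,count` CSV produced by the earlier build steps, sorted by descending count when a cutoff is used:

```python
from wordfreq_builder.word_counts import read_freqs

# Suppose 'counts.csv' (hypothetical) contains, sorted by count:
#   the,15034
#   of,9020
#   zyzzyva,1
freqs = read_freqs('counts.csv', cutoff=2, lang='en')
# 'zyzzyva' falls below the cutoff and is dropped; the surviving
# frequencies are normalized so that they sum to 1.0.
print(sum(freqs.values()))
```
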
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
"""
Convert a CSV file of words and their frequencies to a file in the
idiosyncratic 'cBpack' format.
@ -61,15 +69,14 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
written to the new file.
"""
freq_cutoff = 10 ** (cutoff / 100.)
freqs = read_freqs(in_filename, freq_cutoff)
freqs = read_freqs(in_filename, freq_cutoff, lang=lang)
cBpack = []
for token, freq in freqs.items():
cB = round(math.log10(freq) * 100)
if cB >= cutoff:
neg_cB = -cB
while neg_cB >= len(cBpack):
cBpack.append([])
cBpack[neg_cB].append(token)
neg_cB = -cB
while neg_cB >= len(cBpack):
cBpack.append([])
cBpack[neg_cB].append(token)
for sublist in cBpack:
sublist.sort()
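
A worked example of the centibel bucketing above: a word whose relative frequency is about 10^-2.5 ends up in the sorted list at `cBpack[250]`.

```python
import math

freq = 0.00316                       # roughly 10 ** -2.5
cB = round(math.log10(freq) * 100)   # -250 centibels
neg_cB = -cB                         # bucket index 250
print(cB, neg_cB)
# freqs_to_cBpack appends empty lists until index 250 exists, then
# places the word in cBpack[250] alongside other words at that level.
```
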
@ -88,7 +95,7 @@ def merge_freqs(freq_dicts):
"""
vocab = set()
for freq_dict in freq_dicts:
vocab |= set(freq_dict)
vocab.update(freq_dict)
merged = defaultdict(float)
N = len(freq_dicts)