Merge pull request #19 from LuminosoInsight/code-review-fixes-2015-07-17

Code review fixes 2015 07 17
Rob Speer 2015-07-22 15:09:00 -04:00
commit 32102ba3c2
42 changed files with 151 additions and 158 deletions

View File

@@ -23,8 +23,8 @@ install them on Ubuntu:
 ## Unicode data
-The tokenizers used to split non-Japanese phrases use regexes built using the
-`unicodedata` module from Python 3.4, which uses Unicode version 6.3.0. To
+The tokenizers that split non-Japanese phrases utilize regexes built using the
+`unicodedata` module from Python 3.4, which supports Unicode version 6.3.0. To
 update these regexes, run `scripts/gen_regex.py`.
 ## License
@@ -58,4 +58,3 @@ Some additional data was collected by a custom application that watches the
 streaming Twitter API, in accordance with Twitter's Developer Agreement &
 Policy. This software only gives statistics about words that are very commonly
 used on Twitter; it does not display or republish any Twitter content.

View File

@@ -1,3 +1,5 @@
+""" This file generates a graph of the dependencies for the ninja build."""
 import sys
@@ -26,4 +28,3 @@ def ninja_to_dot():
 if __name__ == '__main__':
     ninja_to_dot()

View File

@@ -94,7 +94,7 @@ def test_failed_cB_conversion():
 def test_tokenization():
     # We preserve apostrophes within words, so "can't" is a single word in the
-    # data, while the fake word "plan't" can't be found.
+    # data
     eq_(tokenize("can't", 'en'), ["can't"])
     eq_(tokenize('😂test', 'en'), ['😂', 'test'])
@@ -135,12 +135,20 @@ def test_not_enough_ascii():
     random_ascii_words(lang='zh')
 def test_ar():
+    # Remove tatweels
     eq_(
         tokenize('متــــــــعب', 'ar'),
         ['متعب']
     )
+    # Remove combining marks
     eq_(
         tokenize('حَرَكَات', 'ar'),
         ['حركات']
     )
+    eq_(
+        tokenize('إﻻ', 'ar'),
+        ['إلا']
+    )

View File

@@ -8,6 +8,8 @@ import itertools
 import pathlib
 import random
 import logging
+import unicodedata
 logger = logging.getLogger(__name__)
@@ -66,11 +68,21 @@ def tokenize(text, lang):
         return mecab_tokenize(text)
     if lang == 'ar':
-        text = COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
+        text = standardize_arabic(text)
     return simple_tokenize(text)
+def standardize_arabic(text):
+    """
+    Standardizes arabic text by removing combining marks and tatweels.
+    """
+    return unicodedata.normalize(
+        'NFKC',
+        COMBINING_MARK_RE.sub('', text.replace('ـ', ''))
+    )
 def read_cBpack(filename):
     """
     Read a file from an idiosyncratic format that we use for storing
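For reference, a minimal standalone sketch of what this standardization does. The real `COMBINING_MARK_RE` is defined elsewhere in wordfreq; the character range below is only an approximation, and the example strings come from the tests in this commit:

    import re
    import unicodedata

    # Rough stand-in for wordfreq's COMBINING_MARK_RE: common Arabic
    # diacritics (fathatan through sukun, plus superscript alef).
    COMBINING_MARK_RE = re.compile('[\u064b-\u0652\u0670]')

    def standardize_arabic(text):
        # Drop tatweels (U+0640), drop combining marks, then NFKC-normalize
        # so presentation forms such as the lam-alef ligature decompose.
        return unicodedata.normalize(
            'NFKC', COMBINING_MARK_RE.sub('', text.replace('\u0640', ''))
        )

    print(standardize_arabic('متــــــــعب'))  # 'متعب'  (tatweels removed)
    print(standardize_arabic('حَرَكَات'))        # 'حركات' (diacritics removed)
    print(standardize_arabic('إﻻ'))            # 'إلا'   (ligature decomposed)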
@@ -257,6 +269,9 @@ def word_frequency(word, lang, wordlist='combined', minimum=0.):
     If a word decomposes into multiple tokens, we'll return a smoothed estimate
     of the word frequency that is no greater than the frequency of any of its
     individual tokens.
+    It should be noted that the current tokenizer does not support
+    multi-word Chinese phrases.
     """
     args = (word, lang, wordlist, minimum)
     try:
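For context, this is how the docstring's guarantee plays out in a call; the example phrase is mine, not taken from the commit:

    from wordfreq import word_frequency

    # A phrase that tokenizes into several words gets a smoothed estimate
    # that never exceeds the frequency of any one of its tokens.
    print(word_frequency('new york', 'en'))
    print(word_frequency('york', 'en'))  # at least as large as the line above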

28 binary files not shown.

View File

@@ -47,8 +47,7 @@ Start the build, and find something else to do for a few hours:
     ninja -v
-You can copy the results into wordfreq with this command (supposing that
-$WORDFREQ points to your wordfreq repo):
+You can copy the results into wordfreq with this command:
     cp data/dist/*.msgpack.gz ../wordfreq/data/
@@ -83,6 +82,19 @@ The specific rules are described by the comments in `rules.ninja`.
 ## Data sources
+### Wikipedia
+Wikipedia is a "free-access, free-content Internet encyclopedia".
+These files can be downloaded from [wikimedia dump][wikipedia]
+The original files are in `data/raw-input/wikipedia`, and they're processed
+by the `wiki2text` rule in `rules.ninja`. Parsing wikipedia requires the
+[wiki2text][] package.
+[wikipedia]: https://dumps.wikimedia.org/backup-index.html
+[wiki2text]: https://github.com/rspeer/wiki2text
 ### Leeds Internet Corpus
 Also known as the "Web as Corpus" project, this is a University of Leeds
@@ -102,7 +114,7 @@ by the `convert_leeds` rule in `rules.ninja`.
 The file `data/raw-input/twitter/all-2014.txt` contains about 72 million tweets
 collected by the `ftfy.streamtester` package in 2014.
-It's not possible to distribute the text of tweets. However, this process could
+We are not allowed to distribute the text of tweets. However, this process could
 be reproduced by running `ftfy.streamtester`, part of the [ftfy][] package, for
 a couple of weeks.
@@ -162,4 +174,3 @@ longer represents the words 'don' and 'won', as we assume most of their
 frequency comes from "don't" and "won't". Words that turned into similarly
 common words, however, were left alone: this list doesn't represent "can't"
 because the word was left as "can".

View File

@@ -29,12 +29,12 @@ rule split
 # Wikipedia dumps obtained from dumps.wikimedia.org. The code is at
 # https://github.com/rspeer/wiki2text.
 rule wiki2text
-  command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out
+  command = bunzip2 -c $in | wiki2text > $out
 # To tokenize Japanese, we run it through Mecab and take the first column.
 # We don't have a plan for tokenizing Chinese yet.
 rule tokenize_japanese
-  command = mkdir -p $$(dirname $out) && mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out
+  command = mecab -b 1048576 < $in | cut -f 1 | grep -v "EOS" > $out
 # Tokenizing text from Twitter requires us to language-detect and tokenize
 # in the same step.
@@ -49,12 +49,12 @@ rule tokenize_twitter
 # Grep out the term "EOS", an indication that Leeds used MeCab and didn't
 # strip out the EOS lines.
 rule convert_leeds
-  command = mkdir -p $$(dirname $out) && sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in | grep -v 'EOS,' > $out
+  command = sed -rn 's/([0-9]+) ([0-9.]+) (.*)/\3,\2/p' < $in | grep -v 'EOS,' > $out
 # To convert the OpenSubtitles frequency data, simply replace spaces with
 # commas.
 rule convert_opensubtitles
-  command = mkdir -p $$(dirname $out) && tr ' ' ',' < $in > $out
+  command = tr ' ' ',' < $in > $out
 # Convert and clean up the Google Books Syntactic N-grams data. Concatenate all
 # the input files, keep only the single words and their counts, and only keep
@@ -65,16 +65,16 @@ rule convert_opensubtitles
 # source data was already filtered to only show words in roles with at least
 # two-digit counts of occurences.)
 rule convert_google_syntactic_ngrams
-  command = mkdir -p $$(dirname $out) && zcat $in | cut -f 1,3 | grep -v '[,"]' | sed -rn 's/(.*)\s(...+)/\1,\2/p' > $out
+  command = zcat $in | cut -f 1,3 | grep -v '[,"]' | sed -rn 's/(.*)\s(...+)/\1,\2/p' > $out
 rule count
-  command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.count_tokens $in $out
+  command = python -m wordfreq_builder.cli.count_tokens $in $out
 rule merge
-  command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.combine_lists -o $out $in
+  command = python -m wordfreq_builder.cli.combine_lists -o $out $in
 rule freqs2cB
-  command = mkdir -p $$(dirname $out) && python -m wordfreq_builder.cli.freqs_to_cB $in $out
+  command = python -m wordfreq_builder.cli.freqs_to_cB $lang $in $out
 rule cat
   command = cat $in > $out
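The `$lang` variable added to the `freqs2cB` rule is supplied per build statement by the generator in `wordfreq_builder/ninja.py` (see the `params={'lang': language}` change later in this commit). A rough sketch, with made-up file paths, of the text such a parameterized build statement comes out as:

    # Not the builder's actual code: a toy illustration of a ninja build
    # statement that binds $lang for the freqs2cB rule.
    def build_statement(rule, input, output, params):
        lines = ['build {}: {} {}'.format(output, rule, input)]
        lines += ['  {} = {}'.format(key, value) for key, value in params.items()]
        return '\n'.join(lines)

    print(build_statement('freqs2cB',
                          'data/generated/combined/en.csv',
                          'data/dist/combined_en.msgpack.gz',
                          {'lang': 'en'}))
    # build data/dist/combined_en.msgpack.gz: freqs2cB data/generated/combined/en.csv
    #   lang = en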

View File

@@ -9,12 +9,5 @@ setup(
     platforms=["any"],
     description="Turns raw data into word frequency lists",
     packages=['wordfreq_builder'],
-    install_requires=['msgpack-python', 'pycld2'],
-    entry_points={
-        'console_scripts': [
-            'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main',
-            'wordfreq-format-twitter = wordfreq_builder.cli.format_twitter:main',
-            'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main'
-        ]
-    }
+    install_requires=['msgpack-python', 'pycld2']
 )

View File

@@ -13,4 +13,3 @@ if __name__ == '__main__':
     parser.add_argument('filename_out', help='name of output file')
     args = parser.parse_args()
     handle_counts(args.filename_in, args.filename_out)

View File

@@ -4,8 +4,8 @@ import argparse
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
+    parser.add_argument('language', help='language of the input file')
     parser.add_argument('filename_in', help='name of input file containing tokens')
     parser.add_argument('filename_out', help='name of output file')
     args = parser.parse_args()
-    freqs_to_cBpack(args.filename_in, args.filename_out)
+    freqs_to_cBpack(args.filename_in, args.filename_out, lang=args.language)

View File

@@ -1,18 +1,13 @@
-from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_file
+from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_twitter
 import argparse
-def tokenize_twitter(in_filename, out_prefix):
-    tokenize_file(in_filename, out_prefix,
-                  tokenizer=cld2_surface_tokenizer)
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('filename', help='filename of input file containing one tweet per line')
     parser.add_argument('outprefix', help='prefix of output filenames')
     args = parser.parse_args()
-    tokenize_twitter(args.filename, args.outprefix)
+    tokenize_twitter(args.filename, args.outprefix, tokenizer=cld2_surface_tokenizer)
 if __name__ == '__main__':
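In other words, the helper that used to live in this script is now imported from `wordfreq_builder.tokenizers`, and calling it directly is equivalent to running this CLI; the file paths below are hypothetical:

    from wordfreq_builder.tokenizers import cld2_surface_tokenizer, tokenize_twitter

    # Sorts tweets by detected language, writing one output file per
    # language under the given prefix.
    tokenize_twitter('data/raw-input/twitter/all-2014.txt',
                     'data/slices/twitter/tweets-2014',
                     tokenizer=cld2_surface_tokenizer)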

View File

@@ -10,10 +10,6 @@ HEADER = """# This file is automatically generated. Do not edit it.
 TMPDIR = data_filename('tmp')
-# Set this to True to rebuild the Twitter tokenization (which takes days)
-TOKENIZE_TWITTER = True
 def add_dep(lines, rule, input, output, extra=None, params=None):
     if isinstance(output, list):
         output = ' '.join(output)
@@ -48,17 +44,15 @@ def make_ninja_deps(rules_filename, out=sys.stdout):
     # The first dependency is to make sure the build file is up to date.
     add_dep(lines, 'build_deps', 'rules.ninja', 'build.ninja',
             extra='wordfreq_builder/ninja.py')
-    if TOKENIZE_TWITTER:
-        lines.extend(
-            twitter_deps(
-                data_filename('raw-input/twitter/all-2014.txt'),
-                slice_prefix=data_filename('slices/twitter/tweets-2014'),
-                combined_prefix=data_filename('generated/twitter/tweets-2014'),
-                slices=40,
-                languages=CONFIG['sources']['twitter']
-            )
-        )
+    lines.extend(
+        twitter_deps(
+            data_filename('raw-input/twitter/all-2014.txt'),
+            slice_prefix=data_filename('slices/twitter/tweets-2014'),
+            combined_prefix=data_filename('generated/twitter/tweets-2014'),
+            slices=40,
+            languages=CONFIG['sources']['twitter']
+        )
+    )
     lines.extend(
         wikipedia_deps(
             data_filename('raw-input/wikipedia'),
@@ -92,17 +86,18 @@ def wikipedia_deps(dirname_in, languages):
     path_in = pathlib.Path(dirname_in)
     for language in languages:
         # Find the most recent file for this language
-        # Skip over files that do not exist
-        input_file = max(path_in.glob(
-            '{}wiki*.bz2'.format(language)
-        ))
+        input_file = max(path_in.glob('{}wiki*.bz2'.format(language)))
         plain_text_file = wordlist_filename('wikipedia', language, 'txt')
         count_file = wordlist_filename('wikipedia', language, 'counts.txt')
         add_dep(lines, 'wiki2text', input_file, plain_text_file)
         if language == 'ja':
-            mecab_token_file = wordlist_filename('wikipedia', language, 'mecab-tokens.txt')
-            add_dep(lines, 'tokenize_japanese', plain_text_file, mecab_token_file)
+            mecab_token_file = wordlist_filename(
+                'wikipedia', language, 'mecab-tokens.txt'
+            )
+            add_dep(
+                lines, 'tokenize_japanese', plain_text_file, mecab_token_file
+            )
             add_dep(lines, 'count', mecab_token_file, count_file)
         else:
             add_dep(lines, 'count', plain_text_file, count_file)
@@ -126,17 +121,18 @@ def google_books_deps(dirname_in):
     return lines
-def twitter_deps(input_filename, slice_prefix,
-                 combined_prefix, slices, languages):
+def twitter_deps(input_filename, slice_prefix, combined_prefix, slices,
+                 languages):
     lines = []
-    slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix, num=num)
+    slice_files = ['{prefix}.part{num:0>2d}'.format(prefix=slice_prefix,
+                                                    num=num)
                    for num in range(slices)]
     # split the input into slices
-    add_dep(lines,
-            'split', input_filename, slice_files,
+    add_dep(lines, 'split', input_filename, slice_files,
            params={'prefix': '{}.part'.format(slice_prefix),
                    'slices': slices})
     for slicenum in range(slices):
         slice_file = slice_files[slicenum]
@@ -151,7 +147,9 @@ def twitter_deps(input_filename, slice_prefix,
         combined_output = wordlist_filename('twitter', language, 'tokens.txt')
         language_inputs = [
-            '{prefix}.{lang}.txt'.format(prefix=slice_files[slicenum], lang=language)
+            '{prefix}.{lang}.txt'.format(
+                prefix=slice_files[slicenum], lang=language
+            )
             for slicenum in range(slices)
         ]
@@ -160,11 +158,14 @@ def twitter_deps(input_filename, slice_prefix,
         count_file = wordlist_filename('twitter', language, 'counts.txt')
         if language == 'ja':
-            mecab_token_file = wordlist_filename('twitter', language, 'mecab-tokens.txt')
-            add_dep(lines, 'tokenize_japanese', combined_output, mecab_token_file)
-            add_dep(lines, 'count', mecab_token_file, count_file, extra='wordfreq_builder/tokenizers.py')
-        else:
-            add_dep(lines, 'count', combined_output, count_file, extra='wordfreq_builder/tokenizers.py')
+            mecab_token_file = wordlist_filename(
+                'twitter', language, 'mecab-tokens.txt')
+            add_dep(
+                lines, 'tokenize_japanese', combined_output, mecab_token_file)
+            combined_output = mecab_token_file
+        add_dep(lines, 'count', combined_output, count_file,
+                extra='wordfreq_builder/tokenizers.py')
     return lines
@@ -187,7 +188,8 @@ def opensubtitles_deps(dirname_in, languages):
         input_file = '{prefix}/{lang}.txt'.format(
             prefix=dirname_in, lang=language
         )
-        reformatted_file = wordlist_filename('opensubtitles', language, 'counts.txt')
+        reformatted_file = wordlist_filename(
+            'opensubtitles', language, 'counts.txt')
         add_dep(lines, 'convert_opensubtitles', input_file, reformatted_file)
     return lines
@@ -205,18 +207,22 @@ def combine_lists(languages):
         add_dep(lines, 'merge', input_files, output_file,
                 extra='wordfreq_builder/word_counts.py')
-        output_cBpack = wordlist_filename('combined-dist', language, 'msgpack.gz')
+        output_cBpack = wordlist_filename(
+            'combined-dist', language, 'msgpack.gz')
         add_dep(lines, 'freqs2cB', output_file, output_cBpack,
-                extra='wordfreq_builder/word_counts.py')
+                extra='wordfreq_builder/word_counts.py',
+                params={'lang': language})
         lines.append('default {}'.format(output_cBpack))
         # Write standalone lists for Twitter frequency
         if language in CONFIG['sources']['twitter']:
            input_file = wordlist_filename('twitter', language, 'counts.txt')
-            output_cBpack = wordlist_filename('twitter-dist', language, 'msgpack.gz')
+            output_cBpack = wordlist_filename(
+                'twitter-dist', language, 'msgpack.gz')
            add_dep(lines, 'freqs2cB', input_file, output_cBpack,
-                    extra='wordfreq_builder/word_counts.py')
+                    extra='wordfreq_builder/word_counts.py',
+                    params={'lang': language})
            lines.append('default {}'.format(output_cBpack))

View File

@@ -1,63 +1,56 @@
 from html.entities import name2codepoint
 from wordfreq import tokenize, TOKEN_RE, NON_PUNCT_RANGE
+from ftfy.fixes import unescape_html
 import re
 import pycld2
-CLD2_BAD_CHAR_RANGE = "".join([
-    '[',
+CLD2_BAD_CHAR_RANGE = "[%s]" % "".join(
+    [
     '\x00-\x08',
     '\x0b',
     '\x0e-\x1f',
     '\x7f-\x9f',
     '\ud800-\udfff',
-    '\ufdd0-\ufdef'] +
-    [chr(65534+65536*x+y) for x in range(17) for y in range(2)] +
-    [']'])
+    '\ufdd0-\ufdef'
+    ] +
+    [chr(65534+65536*x+y) for x in range(17) for y in range(2)]
+)
 CLD2_BAD_CHARS_RE = re.compile(CLD2_BAD_CHAR_RANGE)
 TWITTER_HANDLE_RE = re.compile('@{0}+'.format(NON_PUNCT_RANGE))
-TCO_RE = re.compile('http(?:s)?://t.co/[a-zA-Z0-9]+'.format(NON_PUNCT_RANGE))
+TCO_RE = re.compile('http(?:s)?://t.co/[a-zA-Z0-9]+')
 def cld2_surface_tokenizer(text):
     """
     Uses CLD2 to detect the language and wordfreq tokenizer to create tokens
     """
-    text = remove_handles_and_urls(text)
+    text = unescape_html(text)
+    text = TWITTER_HANDLE_RE.sub('', text)
+    text = TCO_RE.sub('', text)
     lang = cld2_detect_language(text)
     tokens = tokenize(text, lang)
     return lang, tokens
 def cld2_detect_language(text):
     """
     Uses CLD2 to detect the language
     """
+    # Format of pycld2.detect:
+    #    (Confident in result: bool,
+    #    Number of bytes of text: Int,
+    #    Triples of detected languages in order of certainty:
+    #        (Language name: str,
+    #        Language code: str
+    #        Percent of text in this language: float
+    #        Confidence score: float))
     text = CLD2_BAD_CHARS_RE.sub('', text)
     return pycld2.detect(text)[2][0][1]
-def remove_handles_and_urls(text):
-    text = fix_entities(text)
-    text = TWITTER_HANDLE_RE.sub('', text)
-    text = TCO_RE.sub('', text)
-    return text
-def last_tab(line):
-    """
-    Read lines by keeping only the last tab-separated value.
-    """
-    return line.split('\t')[-1].strip()
-def lowercase_text_filter(token):
-    """
-    If this looks like a token that we want to count, return it, lowercased.
-    If not, filter it out by returning None.
-    """
-    if TOKEN_RE.search(token):
-        return token.lower()
-    else:
-        return None
-def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
+def tokenize_twitter(in_filename, out_prefix, tokenizer):
     """
     Process a file by running it through the given tokenizer, sorting the
     results by the language of each line, and inserting newlines
@@ -66,7 +59,7 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
     out_files = {}
     with open(in_filename, encoding='utf-8') as in_file:
         for line in in_file:
-            text = line_reader(line)
+            text = line.split('\t')[-1].strip()
             language, tokens = tokenizer(text)
             if language != 'un':
                 tokenized = '\n'.join(tokens)
@@ -79,37 +72,3 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
                 print(tokenized, file=out_file)
     for out_file in out_files.values():
         out_file.close()
-ENTITY_RE = re.compile(r'& ?(amp|quot|lt|gt) ?;')
-def fix_entities(text):
-    """
-    Fix the few HTML entities that Twitter uses -- even if they've
-    already been tokenized.
-    """
-    def replace_entity(match):
-        return chr(name2codepoint[match.group(1)])
-    return ENTITY_RE.sub(replace_entity, text)
-def monolingual_tokenize_file(in_filename, out_filename, language,
-                              tokenizer, line_reader=last_tab,
-                              sample_proportion=1):
-    """
-    Process a file by running it through the given tokenizer, only keeping
-    lines of the language we're asking for, and inserting newlines
-    to mark the token boundaries.
-    `line_reader` is applied to each line before it given to the tokenizer
-    Only the first line out of every `sample_proportion` lines are run through
-    then tokenizer.
-    """
-    with open(in_filename, encoding='utf-8', errors='replace') as in_file:
-        with open(out_filename, 'w', encoding='utf-8') as out_file:
-            for i, line in enumerate(in_file):
-                if i % sample_proportion == 0:
-                    text = line_reader(line)
-                    tokens, line_language = tokenizer(text)
-                    if line_language == language:
-                        for token in tokens:
-                            print(token, file=out_file)
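For context, a small hedged example of how the reworked tokenizer entry point behaves end to end; the tweet text is invented, and the exact token output depends on wordfreq's tokenizer:

    from wordfreq_builder.tokenizers import cld2_surface_tokenizer

    # HTML entities are unescaped, @handles and t.co links are stripped,
    # then the language is detected with CLD2 and the text is tokenized
    # by wordfreq.
    lang, tokens = cld2_surface_tokenizer(
        "@somebody I can&#39;t believe it&amp;s out http://t.co/abcde"
    )
    print(lang)    # a CLD2 language code such as 'en'
    print(tokens)  # wordfreq tokens with the handle and URL removed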

View File

@@ -1,4 +1,4 @@
-from wordfreq import simple_tokenize
+from wordfreq import simple_tokenize, tokenize
 from collections import defaultdict
 from operator import itemgetter
 from ftfy import fix_text
@@ -18,41 +18,49 @@ def count_tokens(filename):
     counts = defaultdict(int)
     with open(filename, encoding='utf-8', errors='replace') as infile:
         for line in infile:
-            for token in simple_tokenize(line.strip()):
+            for token in simple_tokenize(line):
                 counts[token] += 1
     return counts
-def read_freqs(filename, cutoff=0):
+def read_freqs(filename, cutoff=0, lang=None):
     """
     Read words and their frequencies from a CSV file.
-    Only words with a frequency greater than `cutoff` are returned.
+    Only words with a frequency greater than or equal to `cutoff` are returned.
     If `cutoff` is greater than 0, the csv file must be sorted by frequency
     in descending order.
+    If lang is given, read_freqs will apply language specific preprocessing
+    operations.
     """
     raw_counts = defaultdict(float)
     total = 0.
     with open(filename, encoding='utf-8', newline='') as infile:
         reader = csv.reader(infile)
         for key, strval in reader:
             val = float(strval)
             if val < cutoff:
                 break
-            for token in simple_tokenize(key):
+            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
+            for token in tokens:
                 token = fix_text(token)
                 total += val
                 # Use += so that, if we give the reader concatenated files with
                 # duplicates, it does the right thing
                 raw_counts[token] += val
-    freqs = {key: raw_count / total
-             for (key, raw_count) in raw_counts.items()}
-    return freqs
+    for word in raw_counts:
+        raw_counts[word] /= total
+    return raw_counts
-def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
+def freqs_to_cBpack(in_filename, out_filename, cutoff=-600, lang=None):
     """
     Convert a csv file of words and their frequencies to a file in the
     idiosyncratic 'cBpack' format.
@@ -61,15 +69,14 @@ def freqs_to_cBpack(in_filename, out_filename, cutoff=-600):
     written to the new file.
     """
     freq_cutoff = 10 ** (cutoff / 100.)
-    freqs = read_freqs(in_filename, freq_cutoff)
+    freqs = read_freqs(in_filename, freq_cutoff, lang=lang)
     cBpack = []
     for token, freq in freqs.items():
         cB = round(math.log10(freq) * 100)
-        if cB >= cutoff:
-            neg_cB = -cB
-            while neg_cB >= len(cBpack):
-                cBpack.append([])
-            cBpack[neg_cB].append(token)
+        neg_cB = -cB
+        while neg_cB >= len(cBpack):
+            cBpack.append([])
+        cBpack[neg_cB].append(token)
     for sublist in cBpack:
         sublist.sort()
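For reference, a minimal sketch (not the package's own `read_cBpack`) of how this list-of-lists maps back to frequencies; the example token is made up:

    # Bucket i of a cBpack holds every token whose frequency rounds to
    # -i centibels, i.e. roughly 10 ** (-i / 100).
    def frequency_at(cBpack, token):
        for neg_cB, bucket in enumerate(cBpack):
            if token in bucket:
                return 10 ** (-neg_cB / 100)
        return 0.0

    example = [[] for _ in range(301)]
    example[300].append('example')           # a token stored at -300 cB
    print(frequency_at(example, 'example'))  # 0.001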
@@ -88,7 +95,7 @@ def merge_freqs(freq_dicts):
     """
     vocab = set()
     for freq_dict in freq_dicts:
-        vocab |= set(freq_dict)
+        vocab.update(freq_dict)
     merged = defaultdict(float)
     N = len(freq_dicts)