add and adjust some build steps

- more build steps for Wikipedia
- rename 'tokenize_twitter' to 'pretokenize_twitter' to indicate that
  the results are preliminary
Robyn Speer 2015-05-05 13:59:21 -04:00
parent 33c5f78c07
commit 59409266ca
6 changed files with 101 additions and 14 deletions

View File

@@ -20,15 +20,30 @@ DATA = ./data
rule split
  command = mkdir -p $$(dirname $prefix) && split -d -n r/$slices $in $prefix
# wiki2text is a tool I wrote using the development version of Nim, which
# extracts plain text from Wikipedia dumps obtained from dumps.wikimedia.org.
# The code is at https://github.com/rspeer/wiki2text, but right now it'll
# take a bit of setup to get it to run.
# wiki2text is a tool I wrote using Nim 0.11, which extracts plain text from
# Wikipedia dumps obtained from dumps.wikimedia.org. The code is at
# https://github.com/rspeer/wiki2text.
rule wiki2text
  command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out
rule wiki2tokens
  command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text -t > $out
rule tokenize_twitter
  command = mkdir -p $$(dirname $prefix) && wordfreq-tokenize-twitter $in $prefix
# This rule uses command-line tools to take in a file with one token per line,
# and output a comma-separated file with the token counts:
#
# * 'sort $in | uniq -c' does the actual counting.
# * 'sort -nrk 1' sorts the result in reverse numeric order by the first field
# (the count).
# * The 'sed' command rearranges the lines to be comma-separated values with
# the count coming second, instead of the count being a right-justified
# number at the start of the line.
#
rule count
  command = sort $in | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)$/\2,\1/' > $out
rule cat
  command = cat $in > $out
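
For illustration, here is a rough Python equivalent of what the 'count' rule computes. This is only a sketch with placeholder file names; the build itself uses the shell pipeline above.

from collections import Counter

def count_tokens(in_filename, out_filename):
    # Count one token per input line, then write 'token,count' rows,
    # most frequent first, matching the output format of the 'count' rule.
    with open(in_filename, encoding='utf-8') as in_file:
        counts = Counter(line.strip() for line in in_file)
    with open(out_filename, 'w', encoding='utf-8') as out_file:
        for token, count in counts.most_common():
            print('%s,%s' % (token, count), file=out_file)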

View File

@@ -11,7 +11,8 @@ setup(
    packages=['wordfreq_builder'],
    entry_points={
        'console_scripts': [
            'wordfreq-tokenize-twitter = wordfreq_builder.cli.tokenize_twitter:main',
            'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main',
            'wordfreq-tokenize-wikipedia = wordfreq_builder.cli.tokenize_wikipedia:main',
            'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main'
        ]
    }

View File

@@ -1,10 +1,10 @@
from wordfreq_builder.tokenizers import rosette_surface_tokenizer, tokenize_file
from wordfreq_builder.tokenizers import rosette_surface_tokenizer, pretokenize_file
import argparse
def tokenize_twitter(in_filename, out_prefix):
    tokenize_file(in_filename, out_prefix,
                  tokenizer=rosette_surface_tokenizer)
def pretokenize_twitter(in_filename, out_prefix):
    pretokenize_file(in_filename, out_prefix,
                     tokenizer=rosette_surface_tokenizer)
def main():
@@ -12,7 +12,7 @@ def main():
    parser.add_argument('filename', help='filename of input file containing one tweet per line')
    parser.add_argument('outprefix', help='prefix of output filenames')
    args = parser.parse_args()
    tokenize_twitter(args.filename, args.outprefix)
    pretokenize_twitter(args.filename, args.outprefix)
if __name__ == '__main__':

View File

@@ -0,0 +1,30 @@
from wordfreq_builder.tokenizers import rosette_surface_tokenizer, monolingual_tokenize_file
import argparse


def tokenize_wikipedia(in_filename, out_filename, language, proportion):
    monolingual_tokenize_file(
        in_filename, out_filename,
        language=language,
        tokenizer=rosette_surface_tokenizer,
        line_reader=strip_headings,
        sample_proportion=proportion
    )


def strip_headings(text):
    return text.strip().strip('=')


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('in_filename', help='filename of input file')
    parser.add_argument('out_filename', help='filename of output file')
    parser.add_argument('language', help='the language code of the text')
    parser.add_argument('-p', '--proportion', help='process 1/n of the lines (default 100)', type=int, default=100)
    args = parser.parse_args()
    tokenize_wikipedia(args.in_filename, args.out_filename, args.language, args.proportion)


if __name__ == '__main__':
    main()
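
With the 'wordfreq-tokenize-wikipedia' entry point registered in setup.py, this script can be run on a plain-text Wikipedia extract. A sketch of the equivalent call from Python, with placeholder file names:

from wordfreq_builder.cli.tokenize_wikipedia import tokenize_wikipedia

# Same as: wordfreq-tokenize-wikipedia wikipedia_en.txt wikipedia_en.tokens.txt en -p 100
# (the file names here are placeholders, not paths from the repository)
tokenize_wikipedia('wikipedia_en.txt', 'wikipedia_en.tokens.txt',
                   language='en', proportion=100)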

View File

@@ -46,6 +46,11 @@ def wiki_parse_deps(dirname_in, dirname_out, languages):
            outs=output_file, ins=input_file
        )
        lines.append(build_rule)

        output_file = path_out / 'wikipedia_{}.tokens.txt'.format(language)
        build_rule = "build {outs}: wiki2tokens {ins}".format(
            outs=output_file, ins=input_file
        )
        lines.append(build_rule)

    return lines
@@ -69,7 +74,7 @@ def language_detect_and_tokenize_deps(input_filename, slice_prefix,
            '{prefix}.{lang}.txt'.format(prefix=slice_file, lang=language)
            for language in CONFIG['languages']
        ]
        build_rule = "build {outs}: tokenize_twitter {ins} | wordfreq_builder/tokenizers.py".format(
        build_rule = "build {outs}: tokenize_twitter {ins}".format(
            outs=' '.join(language_outputs), ins=slice_file
        )
        lines.append(build_rule)
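
As an illustration of what the new wiki2tokens rule generates, code like the above produces one ninja build statement per language. The paths below are made up for the example:

output_file = 'data/wikipedia/wikipedia_en.tokens.txt'   # placeholder path
input_file = 'data/raw/enwiki-pages-articles.xml.bz2'    # placeholder path
build_rule = "build {outs}: wiki2tokens {ins}".format(
    outs=output_file, ins=input_file
)
# build_rule is now:
# 'build data/wikipedia/wikipedia_en.tokens.txt: wiki2tokens data/raw/enwiki-pages-articles.xml.bz2'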

View File

@@ -1,5 +1,6 @@
from lumi_science.text_readers.rosette_readers import RosetteReader
import re
import unicodedata
ROSETTE = RosetteReader()
@@ -15,6 +16,9 @@ ROSETTE_LANG_MAP = {
}
NON_PUNCT_RE = re.compile('[0-9A-Za-z\xc0-\u1fff\u2070-\u2fff\u301f-\ufeff---\uff66-\U0002ffff]')
def last_tab(line):
    """
    Read lines by keeping only the last tab-separated value.
@@ -22,11 +26,26 @@ def last_tab(line):
    return line.split('\t')[-1].strip()
def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
def non_punct_filter(token):
    if NON_PUNCT_RE.search(token):
        return token.lower()
    else:
        return None
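# Annotation (not part of the commit): non_punct_filter keeps a token only if it
# contains at least one non-punctuation character, and lowercases it. For example:
#   non_punct_filter('Hello') -> 'hello'   (letters match NON_PUNCT_RE)
#   non_punct_filter('2015')  -> '2015'    (digits count as non-punctuation)
#   non_punct_filter('!!!')   -> None      (punctuation only, so it is dropped)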
def pretokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
    """
    Process a file by running it through the given tokenizer, sorting the
    results by the language of each line, and inserting spaces into lines
    to mark the token boundaries. This computes the 'hard part' of
    tokenization and allows the results to be saved, so that we can change
    the finer details of the output without re-running everything.
    """
    out_files = {}
    for line in open(in_filename, encoding='utf-8'):
        text = line_reader(line)
        tokenized, language = tokenizer(text)
        tokens, language = tokenizer(text)
        tokenized = ' '.join(tokens)
        if language is not None:
            out_filename = '%s.%s.txt' % (out_prefix, language)
            if out_filename in out_files:
@@ -39,6 +58,23 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
        out_file.close()
def monolingual_tokenize_file(in_filename, out_filename, language,
                              tokenizer, line_reader=last_tab,
                              token_filter=non_punct_filter,
                              sample_proportion=100):
    with open(in_filename, encoding='utf-8', errors='replace') as in_file:
        with open(out_filename, 'w', encoding='utf-8') as out_file:
            for i, line in enumerate(in_file):
                if i % sample_proportion == 0:
                    text = line_reader(line)
                    tokens, line_language = tokenizer(text)
                    if line_language == language:
                        filtered = [token_filter(t) for t in tokens]
                        filtered = [t for t in filtered if t is not None]
                        for token in filtered:
                            print(token, file=out_file)
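# Annotation (not part of the commit): 'i % sample_proportion == 0' keeps one line
# out of every sample_proportion lines (1/100 by default), and the surviving tokens
# are written one per line, which is the format the ninja 'count' rule expects.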
def rosette_surface_tokenizer(text):
    try:
        analysis, lang = ROSETTE.rosette.analyze(text)
@@ -50,7 +86,7 @@ def rosette_surface_tokenizer(text):
    for (stem, pos, span) in analysis:
        surface_text = text[span[0]:span[1]]
        tokens.append(surface_text)
    return ' '.join(tokens), language
    return tokens, language
def treebank_surface_tokenizer(text, language='en'):