Mirror of https://github.com/rspeer/wordfreq.git, synced 2024-12-24 01:41:39 +00:00

add and adjust some build steps

- more build steps for Wikipedia
- rename 'tokenize_twitter' to 'pretokenize_twitter' to indicate that the results are preliminary

This commit is contained in:
parent 33c5f78c07
commit 59409266ca

@@ -20,15 +20,30 @@ DATA = ./data
rule split
  command = mkdir -p $$(dirname $prefix) && split -d -n r/$slices $in $prefix
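
For context, the split rule distributes the input lines round-robin into $slices numerically suffixed files starting at $prefix. A rough Python sketch of the same behavior (the function name and two-digit suffixes are illustrative assumptions):

def split_round_robin(in_filename, prefix, slices):
    # Distribute lines round-robin into files prefix00, prefix01, ...
    out_files = [open('%s%02d' % (prefix, i), 'w', encoding='utf-8')
                 for i in range(slices)]
    for i, line in enumerate(open(in_filename, encoding='utf-8')):
        out_files[i % slices].write(line)
    for out_file in out_files:
        out_file.close()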

# wiki2text is a tool I wrote using the development version of Nim, which
# extracts plain text from Wikipedia dumps obtained from dumps.wikimedia.org.
# The code is at https://github.com/rspeer/wiki2text, but right now it'll
# take a bit of setup to get it to run.
# wiki2text is a tool I wrote using Nim 0.11, which extracts plain text from
# Wikipedia dumps obtained from dumps.wikimedia.org. The code is at
# https://github.com/rspeer/wiki2text.
rule wiki2text
  command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out

rule wiki2tokens
  command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text -t > $out

rule tokenize_twitter
  command = mkdir -p $$(dirname $prefix) && wordfreq-tokenize-twitter $in $prefix

# This rule uses command-line tools to take in a file with one token per line,
# and output a comma-separated file with the token counts:
#
# * 'sort $in | uniq -c' does the actual counting.
# * 'sort -nrk 1' sorts the result in reverse numeric order by the first field
#   (the count).
# * The 'sed' command rearranges the lines to be comma-separated values with
#   the count coming second, instead of the count being a right-justified
#   number at the start of the line.
#
rule count
  command = sort $in | uniq -c | sort -nrk 1 | sed -r 's/\s*([0-9]+)\s+(.*)$/\2,\1/' > $out
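
For readers who want the counting step spelled out, here is a rough Python equivalent of the pipeline above; it is only a sketch of the same 'token,count' output, not part of the build:

from collections import Counter

def count_tokens(in_filename, out_filename):
    # One token per line in; 'token,count' per line out, ordered by
    # descending count, mirroring sort | uniq -c | sort -nrk 1 | sed.
    with open(in_filename, encoding='utf-8') as in_file:
        counts = Counter(line.strip() for line in in_file)
    with open(out_filename, 'w', encoding='utf-8') as out_file:
        for token, count in counts.most_common():
            print('%s,%d' % (token, count), file=out_file)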

rule cat
  command = cat $in > $out

@@ -11,7 +11,8 @@ setup(
    packages=['wordfreq_builder'],
    entry_points={
        'console_scripts': [
            'wordfreq-tokenize-twitter = wordfreq_builder.cli.tokenize_twitter:main',
            'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main',
            'wordfreq-tokenize-wikipedia = wordfreq_builder.cli.tokenize_wikipedia:main',
            'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main'
        ]
    }

@@ -1,10 +1,10 @@
from wordfreq_builder.tokenizers import rosette_surface_tokenizer, tokenize_file
from wordfreq_builder.tokenizers import rosette_surface_tokenizer, pretokenize_file
import argparse


def tokenize_twitter(in_filename, out_prefix):
    tokenize_file(in_filename, out_prefix,
                  tokenizer=rosette_surface_tokenizer)
def pretokenize_twitter(in_filename, out_prefix):
    pretokenize_file(in_filename, out_prefix,
                     tokenizer=rosette_surface_tokenizer)


def main():
@@ -12,7 +12,7 @@ def main():
    parser.add_argument('filename', help='filename of input file containing one tweet per line')
    parser.add_argument('outprefix', help='prefix of output filenames')
    args = parser.parse_args()
    tokenize_twitter(args.filename, args.outprefix)
    pretokenize_twitter(args.filename, args.outprefix)


if __name__ == '__main__':

wordfreq_builder/wordfreq_builder/cli/tokenize_wikipedia.py (new file, 30 lines)
@@ -0,0 +1,30 @@
from wordfreq_builder.tokenizers import rosette_surface_tokenizer, monolingual_tokenize_file
import argparse


def tokenize_wikipedia(in_filename, out_filename, language, proportion):
    monolingual_tokenize_file(
        in_filename, out_filename,
        language=language,
        tokenizer=rosette_surface_tokenizer,
        line_reader=strip_headings,
        sample_proportion=proportion
    )


def strip_headings(text):
    return text.strip().strip('=')


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('in_filename', help='filename of input file')
    parser.add_argument('out_filename', help='filename of output file')
    parser.add_argument('language', help='the language code of the text')
    parser.add_argument('-p', '--proportion', help='process 1/n of the lines (default 100)', type=int, default=100)
    args = parser.parse_args()
    tokenize_wikipedia(args.in_filename, args.out_filename, args.language, args.proportion)


if __name__ == '__main__':
    main()
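
For orientation, the new script boils down to a single call to monolingual_tokenize_file; a minimal usage sketch, with placeholder file names, would look like this:

from wordfreq_builder.cli.tokenize_wikipedia import tokenize_wikipedia

# Tokenize 1 out of every 100 lines of a hypothetical extracted-text file,
# keeping only the lines whose detected language is English.
tokenize_wikipedia('wikipedia_en.txt', 'wikipedia_en.tokens.txt', 'en', 100)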

@@ -46,6 +46,11 @@ def wiki_parse_deps(dirname_in, dirname_out, languages):
            outs=output_file, ins=input_file
        )
        lines.append(build_rule)
        output_file = path_out / 'wikipedia_{}.tokens.txt'.format(language)
        build_rule = "build {outs}: wiki2tokens {ins}".format(
            outs=output_file, ins=input_file
        )
        lines.append(build_rule)
    return lines


@@ -69,7 +74,7 @@ def language_detect_and_tokenize_deps(input_filename, slice_prefix,
        '{prefix}.{lang}.txt'.format(prefix=slice_file, lang=language)
        for language in CONFIG['languages']
    ]
    build_rule = "build {outs}: tokenize_twitter {ins} | wordfreq_builder/tokenizers.py".format(
    build_rule = "build {outs}: tokenize_twitter {ins}".format(
        outs=' '.join(language_outputs), ins=slice_file
    )
    lines.append(build_rule)
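
To make the generated dependencies concrete, the new wiki2tokens rule produces ninja build statements of the following shape; the paths here are placeholders, not the actual configuration:

output_file = 'data/wikipedia/wikipedia_en.tokens.txt'   # placeholder path
input_file = 'data/raw/enwiki-pages-articles.xml.bz2'    # placeholder path
build_rule = "build {outs}: wiki2tokens {ins}".format(
    outs=output_file, ins=input_file
)
# -> 'build data/wikipedia/wikipedia_en.tokens.txt: wiki2tokens data/raw/enwiki-pages-articles.xml.bz2'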

@@ -1,5 +1,6 @@
from lumi_science.text_readers.rosette_readers import RosetteReader
import re
import unicodedata


ROSETTE = RosetteReader()
@@ -15,6 +16,9 @@ ROSETTE_LANG_MAP = {
}


NON_PUNCT_RE = re.compile('[0-9A-Za-z\xc0-\u1fff\u2070-\u2fff\u301f-\ufeff0-9A-Za-z\uff66-\U0002ffff]')


def last_tab(line):
    """
    Read lines by keeping only the last tab-separated value.
@@ -22,11 +26,26 @@ def last_tab(line):
    return line.split('\t')[-1].strip()


def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
def non_punct_filter(token):
    if NON_PUNCT_RE.search(token):
        return token.lower()
    else:
        return None
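
In practice, the new filter keeps any token containing at least one letter, digit, or other word character matched by NON_PUNCT_RE, lowercasing it, and drops punctuation-only tokens; for example:

non_punct_filter('Wikipedia')   # -> 'wikipedia'
non_punct_filter("don't")       # -> "don't" (contains letters, so it is kept)
non_punct_filter('...')         # -> None (punctuation only)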


def pretokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
    """
    Process a file by running it through the given tokenizer, sorting the
    results by the language of each line, and inserting spaces into lines
    to mark the token boundaries. This computes the 'hard part' of
    tokenization and allows the results to be saved, so that we can change
    the finer details of the output without re-running everything.
    """
    out_files = {}
    for line in open(in_filename, encoding='utf-8'):
        text = line_reader(line)
        tokenized, language = tokenizer(text)
        tokens, language = tokenizer(text)
        tokenized = ' '.join(tokens)
        if language is not None:
            out_filename = '%s.%s.txt' % (out_prefix, language)
            if out_filename in out_files:
@@ -39,6 +58,23 @@ def tokenize_file(in_filename, out_prefix, tokenizer, line_reader=last_tab):
        out_file.close()
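
A minimal sketch of how this pretokenization step can be driven, assuming a toy tokenizer in place of rosette_surface_tokenizer and placeholder file names:

def toy_tokenizer(text):
    # Stand-in for rosette_surface_tokenizer: returns (tokens, language).
    return text.split(), 'en'

# Reads one tweet per line (taking the last tab-separated field) and writes
# space-joined tokens to one output file per detected language, e.g.
# 'tweets_pretokenized.en.txt'.
pretokenize_file('tweets.txt', 'tweets_pretokenized', tokenizer=toy_tokenizer)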


def monolingual_tokenize_file(in_filename, out_filename, language,
                              tokenizer, line_reader=last_tab,
                              token_filter=non_punct_filter,
                              sample_proportion=100):
    with open(in_filename, encoding='utf-8', errors='replace') as in_file:
        with open(out_filename, 'w', encoding='utf-8') as out_file:
            for i, line in enumerate(in_file):
                if i % sample_proportion == 0:
                    text = line_reader(line)
                    tokens, line_language = tokenizer(text)
                    if line_language == language:
                        filtered = [token_filter(t) for t in tokens]
                        filtered = [t for t in filtered if t is not None]
                        for token in filtered:
                            print(token, file=out_file)


def rosette_surface_tokenizer(text):
    try:
        analysis, lang = ROSETTE.rosette.analyze(text)
@@ -50,7 +86,7 @@ def rosette_surface_tokenizer(text):
    for (stem, pos, span) in analysis:
        surface_text = text[span[0]:span[1]]
        tokens.append(surface_text)
    return ' '.join(tokens), language
    return tokens, language


def treebank_surface_tokenizer(text, language='en'):