mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-26 02:28:50 +00:00
remove wiki2tokens and tokenize_wikipedia
These components are no longer necessary. Wikipedia output can and should be tokenized with the standard tokenizer, instead of the almost-equivalent one in the Nim code.
This commit is contained in:
parent
9a2855394d
commit
4771c12814
@@ -31,16 +31,6 @@ rule split
|
|||||||
rule wiki2text
|
rule wiki2text
|
||||||
command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out
|
command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text > $out
|
||||||
|
|
||||||
# The wiki2tokens rule is the same as the wiki2text rule, but uses the -t
|
|
||||||
# flag to tell the Nim code to output one token per line (according to its
|
|
||||||
# language-agnostic tokenizer, which splits on punctuation and whitespace in
|
|
||||||
# basically the same way as wordfreq).
|
|
||||||
#
|
|
||||||
# The fact that this uses a language-agnostic tokenizer means it should not
|
|
||||||
# be applied to Chinese or Japanese.
|
|
||||||
rule wiki2tokens
|
|
||||||
command = mkdir -p $$(dirname $out) && bunzip2 -c $in | wiki2text -t > $out
|
|
||||||
|
|
||||||
# To tokenize Japanese, we run it through Mecab and take the first column.
|
# To tokenize Japanese, we run it through Mecab and take the first column.
|
||||||
# We don't have a plan for tokenizing Chinese yet.
|
# We don't have a plan for tokenizing Chinese yet.
|
||||||
rule tokenize_japanese
|
rule tokenize_japanese
|
||||||
|
@@ -14,7 +14,6 @@ setup(
|
|||||||
'console_scripts': [
|
'console_scripts': [
|
||||||
'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main',
|
'wordfreq-pretokenize-twitter = wordfreq_builder.cli.pretokenize_twitter:main',
|
||||||
'wordfreq-format-twitter = wordfreq_builder.cli.format_twitter:main',
|
'wordfreq-format-twitter = wordfreq_builder.cli.format_twitter:main',
|
||||||
'wordfreq-tokenize-wikipedia = wordfreq_builder.cli.tokenize_wikipedia:main',
|
|
||||||
'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main'
|
'wordfreq-build-deps = wordfreq_builder.cli.build_deps:main'
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
@@ -1,30 +0,0 @@
|
|||||||
from wordfreq_builder.tokenizers import cld2_surface_tokenizer, monolingual_tokenize_file
|
|
||||||
import argparse
|
|
||||||
|
|
||||||
|
|
||||||
def tokenize_wikipedia(in_filename, out_filename, language, proportion):
|
|
||||||
monolingual_tokenize_file(
|
|
||||||
in_filename, out_filename,
|
|
||||||
language=language,
|
|
||||||
tokenizer=cld2_surface_tokenizer,
|
|
||||||
line_reader=strip_headings,
|
|
||||||
sample_proportion=proportion
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def strip_headings(text):
|
|
||||||
return text.strip().strip('=')
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
parser = argparse.ArgumentParser()
|
|
||||||
parser.add_argument('in_filename', help='filename of input file')
|
|
||||||
parser.add_argument('out_filename', help='filename of output file')
|
|
||||||
parser.add_argument('language', help='the language code of the text')
|
|
||||||
parser.add_argument('-p', '--proportion', help='process 1/n of the lines (default 100)', type=int, default=100)
|
|
||||||
args = parser.parse_args()
|
|
||||||
tokenize_wikipedia(args.in_filename, args.out_filename, args.language, args.proportion)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
@@ -96,16 +96,16 @@ def wikipedia_deps(dirname_in, languages):
|
|||||||
input_file = max(path_in.glob(
|
input_file = max(path_in.glob(
|
||||||
'{}wiki*.bz2'.format(language)
|
'{}wiki*.bz2'.format(language)
|
||||||
))
|
))
|
||||||
token_file = wordlist_filename('wikipedia', language, 'tokens.txt')
|
plain_text_file = wordlist_filename('wikipedia', language, 'txt')
|
||||||
count_file = wordlist_filename('wikipedia', language, 'counts.txt')
|
count_file = wordlist_filename('wikipedia', language, 'counts.txt')
|
||||||
|
|
||||||
add_dep(lines, 'wiki2tokens', input_file, token_file)
|
add_dep(lines, 'wiki2text', input_file, plain_text_file)
|
||||||
if language == 'ja':
|
if language == 'ja':
|
||||||
mecab_token_file = wordlist_filename('wikipedia', language, 'mecab-tokens.txt')
|
mecab_token_file = wordlist_filename('wikipedia', language, 'mecab-tokens.txt')
|
||||||
add_dep(lines, 'tokenize_japanese', token_file, mecab_token_file)
|
add_dep(lines, 'tokenize_japanese', plain_text_file, mecab_token_file)
|
||||||
add_dep(lines, 'count', mecab_token_file, count_file)
|
add_dep(lines, 'count', mecab_token_file, count_file)
|
||||||
else:
|
else:
|
||||||
add_dep(lines, 'count', token_file, count_file)
|
add_dep(lines, 'count', plain_text_file, count_file)
|
||||||
|
|
||||||
return lines
|
return lines
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user