mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-24 18:01:38 +00:00
parent
d924c8e2a5
commit
6f11256ed1
@ -236,13 +236,6 @@ def jieba_deps(dirname_in, languages):
|
|||||||
return lines
|
return lines
|
||||||
|
|
||||||
|
|
||||||
def reddit_base_filename(path):
|
|
||||||
"""
|
|
||||||
Get the base name of a Reddit input file, without its path or extension.
|
|
||||||
"""
|
|
||||||
return path.stem
|
|
||||||
|
|
||||||
|
|
||||||
def reddit_deps(dirname_in, languages):
|
def reddit_deps(dirname_in, languages):
|
||||||
lines = []
|
lines = []
|
||||||
path_in = pathlib.Path(dirname_in)
|
path_in = pathlib.Path(dirname_in)
|
||||||
@ -252,7 +245,7 @@ def reddit_deps(dirname_in, languages):
|
|||||||
# Extract text from the Reddit comment dumps, and write them to
|
# Extract text from the Reddit comment dumps, and write them to
|
||||||
# .txt.gz files
|
# .txt.gz files
|
||||||
for filepath in path_in.glob('*/*.bz2'):
|
for filepath in path_in.glob('*/*.bz2'):
|
||||||
base = reddit_base_filename(filepath)
|
base = filepath.stem
|
||||||
transformed_file = wordlist_filename('reddit', base + '.all', 'txt')
|
transformed_file = wordlist_filename('reddit', base + '.all', 'txt')
|
||||||
slices[base] = transformed_file
|
slices[base] = transformed_file
|
||||||
add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
|
add_dep(lines, 'extract_reddit', str(filepath), transformed_file)
|
||||||
|
Loading…
Reference in New Issue
Block a user