mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
Merge remote-tracking branch 'origin/master' into big-list
Conflicts: wordfreq_builder/wordfreq_builder/cli/merge_counts.py
This commit is contained in:
commit
164a5b1a05
3
setup.py
3
setup.py
@ -24,7 +24,8 @@ classifiers = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
current_dir = os.path.dirname(__file__)
|
current_dir = os.path.dirname(__file__)
|
||||||
README_contents = open(os.path.join(current_dir, 'README.md')).read()
|
README_contents = open(os.path.join(current_dir, 'README.md'),
|
||||||
|
encoding='utf-8').read()
|
||||||
doclines = README_contents.split("\n")
|
doclines = README_contents.split("\n")
|
||||||
dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes', 'regex >= 2015']
|
dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes', 'regex >= 2015']
|
||||||
if sys.version_info < (3, 4):
|
if sys.version_info < (3, 4):
|
||||||
|
@ -100,7 +100,7 @@ def test_tokenization():
|
|||||||
# data
|
# data
|
||||||
eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
|
eq_(tokenize("I don't split at apostrophes, you see.", 'en'),
|
||||||
['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
|
['i', "don't", 'split', 'at', 'apostrophes', 'you', 'see'])
|
||||||
|
|
||||||
eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
|
eq_(tokenize("I don't split at apostrophes, you see.", 'en', include_punctuation=True),
|
||||||
['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
|
['i', "don't", 'split', 'at', 'apostrophes', ',', 'you', 'see', '.'])
|
||||||
|
|
||||||
@ -180,3 +180,10 @@ def test_ideographic_fallback():
|
|||||||
tokenize(ja_text, 'en'),
|
tokenize(ja_text, 'en'),
|
||||||
['ひらがな', 'カタカナ', 'romaji']
|
['ひらがな', 'カタカナ', 'romaji']
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Test that we leave Thai letters stuck together. If we had better Thai support,
|
||||||
|
# we would actually split this into a three-word phrase.
|
||||||
|
eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
|
||||||
|
eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
|
||||||
|
['การเล่นดนตรี', 'means', 'playing', 'music'])
|
||||||
|
|
||||||
|
@ -3,23 +3,24 @@ import unicodedata
|
|||||||
|
|
||||||
|
|
||||||
TOKEN_RE = regex.compile(r"""
|
TOKEN_RE = regex.compile(r"""
|
||||||
# Case 1: a special case for Chinese and Japanese
|
# Case 1: a special case for non-spaced languages
|
||||||
# -----------------------------------------------
|
# -----------------------------------------------
|
||||||
|
|
||||||
# When we see characters that are Han ideographs (\p{IsIdeo}) or hiragana
|
# When we see characters that are Han ideographs (\p{IsIdeo}), hiragana
|
||||||
# (\p{Script=Hiragana}), we allow a sequence of those characters to be
|
# (\p{Script=Hiragana}), or Thai (\p{Script=Thai}), we allow a sequence
|
||||||
# glued together as a single token. Without this case, the standard rule
|
# of those characters to be glued together as a single token.
|
||||||
# (case 2) would make each character a separate token. This would be the
|
|
||||||
# correct behavior for word-wrapping, but a messy failure mode for NLP
|
|
||||||
# tokenization.
|
|
||||||
#
|
#
|
||||||
# It is, of course, better to use a tokenizer that is designed for Chinese
|
# Without this case, the standard rule (case 2) would make each character
|
||||||
# or Japanese text. This is effectively a fallback for when the wrong
|
# a separate token. This would be the correct behavior for word-wrapping,
|
||||||
|
# but a messy failure mode for NLP tokenization.
|
||||||
|
#
|
||||||
|
# It is, of course, better to use a tokenizer that is designed for Chinese,
|
||||||
|
# Japanese, or Thai text. This is effectively a fallback for when the wrong
|
||||||
# tokenizer is used.
|
# tokenizer is used.
|
||||||
#
|
#
|
||||||
# This rule is listed first so that it takes precedence.
|
# This rule is listed first so that it takes precedence.
|
||||||
|
|
||||||
[\p{IsIdeo}\p{Script=Hiragana}]+ |
|
[\p{IsIdeo}\p{Script=Hiragana}\p{Script=Thai}]+ |
|
||||||
|
|
||||||
# Case 2: standard Unicode segmentation
|
# Case 2: standard Unicode segmentation
|
||||||
# -------------------------------------
|
# -------------------------------------
|
||||||
|
@ -13,10 +13,14 @@ def merge_lists(input_names, output_name, cutoff=0, max_size=1000000):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
|
parser.add_argument('-o', '--output', default='combined-counts.csv',
|
||||||
parser.add_argument('-c', '--cutoff', type=int, default=0, help='minimum count to read from an input file')
|
help='filename to write the output to')
|
||||||
parser.add_argument('-m', '--max-words', type=int, default=1000000, help='maximum number of words to read from each list')
|
parser.add_argument('-c', '--cutoff', type=int, default=0,
|
||||||
parser.add_argument('inputs', help='names of input files to merge', nargs='+')
|
help='minimum count to read from an input file')
|
||||||
|
parser.add_argument('-m', '--max-words', type=int, default=1000000,
|
||||||
|
help='maximum number of words to read from each list')
|
||||||
|
parser.add_argument('inputs', nargs='+',
|
||||||
|
help='names of input files to merge')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
merge_lists(args.inputs, args.output, cutoff=args.cutoff, max_size=args.max_words)
|
merge_lists(args.inputs, args.output, cutoff=args.cutoff, max_size=args.max_words)
|
||||||
|
|
||||||
|
@ -18,10 +18,14 @@ def merge_lists(input_names, output_name, cutoff, lang):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
|
parser.add_argument('-o', '--output', default='combined-freqs.csv',
|
||||||
parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
|
help='filename to write the output to')
|
||||||
parser.add_argument('-l', '--language', help='language code for which language the words are in', default=None)
|
parser.add_argument('-c', '--cutoff', type=int, default=2,
|
||||||
parser.add_argument('inputs', help='names of input files to merge', nargs='+')
|
help='stop after seeing a count below this')
|
||||||
|
parser.add_argument('-l', '--language', default=None,
|
||||||
|
help='language code for which language the words are in')
|
||||||
|
parser.add_argument('inputs', nargs='+',
|
||||||
|
help='names of input files to merge')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
merge_lists(args.inputs, args.output, args.cutoff, args.language)
|
merge_lists(args.inputs, args.output, args.cutoff, args.language)
|
||||||
|
|
||||||
|
@ -54,11 +54,17 @@ KEEP_THESE_LANGUAGES = {
|
|||||||
|
|
||||||
|
|
||||||
def cld2_reddit_tokenizer(text):
|
def cld2_reddit_tokenizer(text):
|
||||||
|
"""
|
||||||
|
A language-detecting tokenizer with special cases for handling text from
|
||||||
|
Reddit.
|
||||||
|
"""
|
||||||
text = URL_RE.sub('', text)
|
text = URL_RE.sub('', text)
|
||||||
text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
|
text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
|
||||||
|
|
||||||
lang = cld2_detect_language(text)
|
lang = cld2_detect_language(text)
|
||||||
if lang not in KEEP_THESE_LANGUAGES:
|
if lang not in KEEP_THESE_LANGUAGES:
|
||||||
|
# Reddit is 99.9% English, so if we detected a rare language, it's
|
||||||
|
# much more likely that it's actually English.
|
||||||
lang = 'en'
|
lang = 'en'
|
||||||
|
|
||||||
tokens = tokenize(text, lang, include_punctuation=True)
|
tokens = tokenize(text, lang, include_punctuation=True)
|
||||||
@ -86,7 +92,7 @@ def tokenize_by_language(in_filename, out_prefix, tokenizer):
|
|||||||
"""
|
"""
|
||||||
Process a file by running it through a given tokenizer.
|
Process a file by running it through a given tokenizer.
|
||||||
|
|
||||||
Produces output files that are separated by language, with newlines
|
Produces output files that are separated by language, with spaces
|
||||||
between the tokens.
|
between the tokens.
|
||||||
"""
|
"""
|
||||||
out_files = {}
|
out_files = {}
|
||||||
|
Loading…
Reference in New Issue
Block a user