commit a5fcfd100d
Author: Rob Speer
Date:   2016-03-24 14:11:44 -04:00

Merge remote-tracking branch 'origin/master' into big-list

Conflicts:
	wordfreq_builder/wordfreq_builder/cli/merge_counts.py

Former-commit-id: 164a5b1a05

6 changed files with 44 additions and 21 deletions

View File

@@ -24,7 +24,8 @@ classifiers = [
 ]
 current_dir = os.path.dirname(__file__)
-README_contents = open(os.path.join(current_dir, 'README.md')).read()
+README_contents = open(os.path.join(current_dir, 'README.md'),
+                       encoding='utf-8').read()
 doclines = README_contents.split("\n")
 dependencies = ['ftfy >= 4', 'msgpack-python', 'langcodes', 'regex >= 2015']
 if sys.version_info < (3, 4):
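The hunk above only adds an explicit encoding when reading README.md. As a rough illustration of why (a sketch, not part of the commit): without encoding='utf-8', open() falls back to the platform's preferred locale encoding, which is not always UTF-8.

# Sketch only: the same read with an explicit encoding and a context manager,
# so it does not depend on the locale default (e.g. cp1252 on some Windows
# setups, where non-ASCII text in README.md would raise UnicodeDecodeError).
import os

current_dir = os.path.dirname(__file__)
with open(os.path.join(current_dir, 'README.md'), encoding='utf-8') as readme:
    README_contents = readme.read()
doclines = README_contents.split("\n")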

View File

@@ -180,3 +180,10 @@ def test_ideographic_fallback():
         tokenize(ja_text, 'en'),
         ['ひらがな', 'カタカナ', 'romaji']
     )
+
+    # Test that we leave Thai letters stuck together. If we had better Thai support,
+    # we would actually split this into a three-word phrase.
+    eq_(tokenize('การเล่นดนตรี', 'th'), ['การเล่นดนตรี'])
+    eq_(tokenize('"การเล่นดนตรี" means "playing music"', 'en'),
+        ['การเล่นดนตรี', 'means', 'playing', 'music'])

View File

@@ -3,23 +3,24 @@ import unicodedata
 TOKEN_RE = regex.compile(r"""
-    # Case 1: a special case for Chinese and Japanese
+    # Case 1: a special case for non-spaced languages
     # -----------------------------------------------
-    # When we see characters that are Han ideographs (\p{IsIdeo}) or hiragana
-    # (\p{Script=Hiragana}), we allow a sequence of those characters to be
-    # glued together as a single token. Without this case, the standard rule
-    # (case 2) would make each character a separate token. This would be the
-    # correct behavior for word-wrapping, but a messy failure mode for NLP
-    # tokenization.
+    # When we see characters that are Han ideographs (\p{IsIdeo}), hiragana
+    # (\p{Script=Hiragana}), or Thai (\p{Script=Thai}), we allow a sequence
+    # of those characters to be glued together as a single token.
     #
-    # It is, of course, better to use a tokenizer that is designed for Chinese
-    # or Japanese text. This is effectively a fallback for when the wrong
+    # Without this case, the standard rule (case 2) would make each character
+    # a separate token. This would be the correct behavior for word-wrapping,
+    # but a messy failure mode for NLP tokenization.
+    #
+    # It is, of course, better to use a tokenizer that is designed for Chinese,
+    # Japanese, or Thai text. This is effectively a fallback for when the wrong
     # tokenizer is used.
     #
     # This rule is listed first so that it takes precedence.
-    [\p{IsIdeo}\p{Script=Hiragana}]+ |
+    [\p{IsIdeo}\p{Script=Hiragana}\p{Script=Thai}]+ |
     # Case 2: standard Unicode segmentation
     # -------------------------------------
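For context, here is a small self-contained sketch of how the widened character class behaves. It uses only the one alternation shown above plus a plain \w+ fallback, not the library's full TOKEN_RE, so treat it as an approximation.

# Approximation of the rule above: glue Han, hiragana, and (newly) Thai
# characters into single tokens; everything else falls through to \w+ here.
import regex

glue_re = regex.compile(r'[\p{IsIdeo}\p{Script=Hiragana}\p{Script=Thai}]+|\w+')

print(glue_re.findall('การเล่นดนตรี'))      # ['การเล่นดนตรี'] -- Thai stays glued
print(glue_re.findall('ひらがな romaji'))    # ['ひらがな', 'romaji']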

View File

@@ -13,10 +13,14 @@ def merge_lists(input_names, output_name, cutoff=0, max_size=1000000):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-counts.csv')
-    parser.add_argument('-c', '--cutoff', type=int, default=0, help='minimum count to read from an input file')
-    parser.add_argument('-m', '--max-words', type=int, default=1000000, help='maximum number of words to read from each list')
-    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
+    parser.add_argument('-o', '--output', default='combined-counts.csv',
+                        help='filename to write the output to')
+    parser.add_argument('-c', '--cutoff', type=int, default=0,
+                        help='minimum count to read from an input file')
+    parser.add_argument('-m', '--max-words', type=int, default=1000000,
+                        help='maximum number of words to read from each list')
+    parser.add_argument('inputs', nargs='+',
+                        help='names of input files to merge')
     args = parser.parse_args()
     merge_lists(args.inputs, args.output, cutoff=args.cutoff, max_size=args.max_words)
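This hunk only reflows the add_argument calls; the flags and defaults are unchanged. As a usage sketch (the CSV names below are placeholders, not from the commit), the reformatted parser behaves exactly as before:

# Standalone sketch of the parser built above, driven with an example argv.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-o', '--output', default='combined-counts.csv',
                    help='filename to write the output to')
parser.add_argument('-c', '--cutoff', type=int, default=0,
                    help='minimum count to read from an input file')
parser.add_argument('-m', '--max-words', type=int, default=1000000,
                    help='maximum number of words to read from each list')
parser.add_argument('inputs', nargs='+',
                    help='names of input files to merge')

args = parser.parse_args(['-c', '2', 'counts-a.csv', 'counts-b.csv'])
print(args.output, args.cutoff, args.max_words, args.inputs)
# -> combined-counts.csv 2 1000000 ['counts-a.csv', 'counts-b.csv']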

View File

@@ -18,10 +18,14 @@ def merge_lists(input_names, output_name, cutoff, lang):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('-o', '--output', help='filename to write the output to', default='combined-freqs.csv')
-    parser.add_argument('-c', '--cutoff', type=int, help='stop after seeing a count below this', default=2)
-    parser.add_argument('-l', '--language', help='language code for which language the words are in', default=None)
-    parser.add_argument('inputs', help='names of input files to merge', nargs='+')
+    parser.add_argument('-o', '--output', default='combined-freqs.csv',
+                        help='filename to write the output to')
+    parser.add_argument('-c', '--cutoff', type=int, default=2,
+                        help='stop after seeing a count below this')
+    parser.add_argument('-l', '--language', default=None,
+                        help='language code for which language the words are in')
+    parser.add_argument('inputs', nargs='+',
+                        help='names of input files to merge')
     args = parser.parse_args()
     merge_lists(args.inputs, args.output, args.cutoff, args.language)

View File

@@ -54,11 +54,17 @@ KEEP_THESE_LANGUAGES = {
 def cld2_reddit_tokenizer(text):
+    """
+    A language-detecting tokenizer with special cases for handling text from
+    Reddit.
+    """
     text = URL_RE.sub('', text)
     text = MARKDOWN_URL_RESIDUE_RE.sub(']', text)
     lang = cld2_detect_language(text)
     if lang not in KEEP_THESE_LANGUAGES:
+        # Reddit is 99.9% English, so if we detected a rare language, it's
+        # much more likely that it's actually English.
         lang = 'en'

     tokens = tokenize(text, lang, include_punctuation=True)
@@ -86,7 +92,7 @@ def tokenize_by_language(in_filename, out_prefix, tokenizer):
     """
     Process a file by running it through a given tokenizer.
-    Produces output files that are separated by language, with newlines
+    Produces output files that are separated by language, with spaces
     between the tokens.
     """
     out_files = {}
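The new comment documents the design choice: language detection on short Reddit comments is noisy, and since the corpus is overwhelmingly English, a detection outside the kept set is treated as misdetected English. A minimal sketch of that gate follows; the language set is abbreviated and the detector is a stand-in, not the real cld2_detect_language or KEEP_THESE_LANGUAGES.

# Sketch of the fallback gate only; the set below is abbreviated for the sketch.
KEEP_THESE_LANGUAGES = {'en', 'es', 'fr', 'de', 'pt'}

def gated_language(detected_lang):
    # A detection outside the kept set is more likely misdetected English
    # than a genuinely rare language, so fall back to 'en'.
    if detected_lang not in KEEP_THESE_LANGUAGES:
        return 'en'
    return detected_lang

print(gated_language('fr'))   # 'fr' -- kept as detected
print(gated_language('xx'))   # 'en' -- rare/unknown detection falls back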