From 0711fb3c432488f4c020b856f1fecdb39a806ee1 Mon Sep 17 00:00:00 2001
From: Andrew Lin
Date: Fri, 31 Jul 2015 19:12:59 -0400
Subject: [PATCH 1/4] Remove redundant reference to wikipedia in builder README.

Former-commit-id: 53621c34dfc27a77bf28ecdd88c450585268a3fa
---
 wordfreq_builder/README.md | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/wordfreq_builder/README.md b/wordfreq_builder/README.md
index a17c504..2aedf27 100644
--- a/wordfreq_builder/README.md
+++ b/wordfreq_builder/README.md
@@ -82,19 +82,6 @@ The specific rules are described by the comments in `rules.ninja`.
 
 ## Data sources
 
-### Wikipedia
-
-Wikipedia is a "free-access, free-content Internet encyclopedia".
-
-These files can be downloaded from [wikimedia dump][wikipedia]
-
-The original files are in `data/raw-input/wikipedia`, and they're processed
-by the `wiki2text` rule in `rules.ninja`. Parsing wikipedia requires the
-[wiki2text][] package.
-
-[wikipedia]: https://dumps.wikimedia.org/backup-index.html
-[wiki2text]: https://github.com/rspeer/wiki2text
-
 ### Leeds Internet Corpus
 
 Also known as the "Web as Corpus" project, this is a University of Leeds

From b0fac15f98091abf698a47cf564b361e922fd924 Mon Sep 17 00:00:00 2001
From: Andrew Lin
Date: Fri, 31 Jul 2015 19:23:42 -0400
Subject: [PATCH 2/4] Switch to more explanatory Unicode escapes when testing NFKC normalization.

Former-commit-id: 66c69e6fac30ec69385084ae3cfccf92215ecd74
---
 tests/test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index d38fd14..d68db1e 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -149,6 +149,6 @@ def test_ar():
     )
 
     eq_(
-        tokenize('إﻻ', 'ar'),
-        ['إلا']
+        tokenize('\ufefb', 'ar'),
+        ['\u0644\u0627']
     )

From 77610f57e1dae7797a1cc67cd1129b29f392673b Mon Sep 17 00:00:00 2001
From: Andrew Lin
Date: Fri, 31 Jul 2015 19:26:18 -0400
Subject: [PATCH 3/4] Stylistic cleanups to word_counts.py.

Former-commit-id: 6d40912ef9c36cca0fdec1caaf94221b1c9f5dec
---
 wordfreq_builder/wordfreq_builder/word_counts.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index 8accf2b..5127108 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -39,20 +39,16 @@ def read_freqs(filename, cutoff=0, lang=None):
     raw_counts = defaultdict(float)
     total = 0.
     with open(filename, encoding='utf-8', newline='') as infile:
-        reader = csv.reader(infile)
-        for key, strval in reader:
-
+        for key, strval in csv.reader(infile):
             val = float(strval)
             if val < cutoff:
                 break
-
             tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
             for token in tokens:
-                token = fix_text(token)
-                total += val
                 # Use += so that, if we give the reader concatenated files with
                 # duplicates, it does the right thing
-                raw_counts[token] += val
+                raw_counts[fix_text(token)] += val
+                total += val
 
     for word in raw_counts:
         raw_counts[word] /= total
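A note on the accumulation behavior this cleanup preserves: because read_freqs
adds into raw_counts with +=, concatenated input files that repeat a key merge
their counts instead of overwriting them. A minimal sketch of that behavior,
using made-up rows (the ('the', ...) values are hypothetical, not real data):

    from collections import defaultdict

    raw_counts = defaultdict(float)
    # Simulate two concatenated files that both contain the key 'the'.
    for key, val in [('the', 100.0), ('cat', 10.0), ('the', 50.0)]:
        raw_counts[key] += val

    assert raw_counts['the'] == 150.0  # counts merge rather than overwrite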
From e88cf3fdaf51c5af4b35e9b3261849800fa93ced Mon Sep 17 00:00:00 2001
From: Andrew Lin
Date: Mon, 3 Aug 2015 11:09:44 -0400
Subject: [PATCH 4/4] Document the NFKC-normalized ligature in the Arabic test.

Former-commit-id: 41e1dd41d82358fd44f972e501c8586d0bbd64a2
---
 tests/test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index d68db1e..679811c 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -149,6 +149,6 @@ def test_ar():
     )
 
     eq_(
-        tokenize('\ufefb', 'ar'),
-        ['\u0644\u0627']
+        tokenize('\ufefb', 'ar'),  # An Arabic ligature...
+        ['\u0644\u0627']  # ...that is affected by NFKC normalization
     )
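For reference, the normalization this test documents can be verified directly
with Python's standard library; the snippet below is illustrative and not part
of the patch. U+FEFB is ARABIC LIGATURE LAM WITH ALEF ISOLATED FORM, and NFKC
compatibility normalization decomposes it into LAM (U+0644) followed by ALEF
(U+0627):

    import unicodedata

    # The lam-alef ligature decomposes under NFKC into its two letters.
    assert unicodedata.normalize('NFKC', '\ufefb') == '\u0644\u0627'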