From 0711fb3c432488f4c020b856f1fecdb39a806ee1 Mon Sep 17 00:00:00 2001
From: Andrew Lin
Date: Fri, 31 Jul 2015 19:12:59 -0400
Subject: [PATCH 1/4] Remove redundant reference to wikipedia in builder README.

Former-commit-id: 53621c34dfc27a77bf28ecdd88c450585268a3fa
---
 wordfreq_builder/README.md | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/wordfreq_builder/README.md b/wordfreq_builder/README.md
index a17c504..2aedf27 100644
--- a/wordfreq_builder/README.md
+++ b/wordfreq_builder/README.md
@@ -82,19 +82,6 @@ The specific rules are described by the comments in `rules.ninja`.
 
 ## Data sources
 
-### Wikipedia
-
-Wikipedia is a "free-access, free-content Internet encyclopedia".
-
-These files can be downloaded from [wikimedia dump][wikipedia]
-
-The original files are in `data/raw-input/wikipedia`, and they're processed
-by the `wiki2text` rule in `rules.ninja`. Parsing wikipedia requires the
-[wiki2text][] package.
-
-[wikipedia]: https://dumps.wikimedia.org/backup-index.html
-[wiki2text]: https://github.com/rspeer/wiki2text
-
 ### Leeds Internet Corpus
 
 Also known as the "Web as Corpus" project, this is a University of Leeds

From b0fac15f98091abf698a47cf564b361e922fd924 Mon Sep 17 00:00:00 2001
From: Andrew Lin
Date: Fri, 31 Jul 2015 19:23:42 -0400
Subject: [PATCH 2/4] Switch to more explanatory Unicode escapes when testing NFKC normalization.

Former-commit-id: 66c69e6fac30ec69385084ae3cfccf92215ecd74
---
 tests/test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index d38fd14..d68db1e 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -149,6 +149,6 @@ def test_ar():
     )
 
     eq_(
-        tokenize('إﻻ', 'ar'),
-        ['إلا']
+        tokenize('\ufefb', 'ar'),
+        ['\u0644\u0627']
     )

From 77610f57e1dae7797a1cc67cd1129b29f392673b Mon Sep 17 00:00:00 2001
From: Andrew Lin
Date: Fri, 31 Jul 2015 19:26:18 -0400
Subject: [PATCH 3/4] Stylistic cleanups to word_counts.py.

Former-commit-id: 6d40912ef9c36cca0fdec1caaf94221b1c9f5dec
---
 wordfreq_builder/wordfreq_builder/word_counts.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/wordfreq_builder/wordfreq_builder/word_counts.py b/wordfreq_builder/wordfreq_builder/word_counts.py
index 8accf2b..5127108 100644
--- a/wordfreq_builder/wordfreq_builder/word_counts.py
+++ b/wordfreq_builder/wordfreq_builder/word_counts.py
@@ -39,20 +39,16 @@ def read_freqs(filename, cutoff=0, lang=None):
     raw_counts = defaultdict(float)
     total = 0.
     with open(filename, encoding='utf-8', newline='') as infile:
-        reader = csv.reader(infile)
-        for key, strval in reader:
-
+        for key, strval in csv.reader(infile):
             val = float(strval)
             if val < cutoff:
                 break
-
             tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
             for token in tokens:
-                token = fix_text(token)
-                total += val
                 # Use += so that, if we give the reader concatenated files with
                 # duplicates, it does the right thing
-                raw_counts[token] += val
+                raw_counts[fix_text(token)] += val
+                total += val
 
     for word in raw_counts:
         raw_counts[word] /= total
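A note on the accumulation behavior this cleanup preserves: because read_freqs
adds into raw_counts with +=, concatenated input files that repeat a key merge
their counts instead of overwriting them. A minimal sketch of that behavior,
using made-up rows (the ('the', ...) values are hypothetical, not real data):

    from collections import defaultdict

    raw_counts = defaultdict(float)
    # Simulate two concatenated files that both contain the key 'the'.
    for key, val in [('the', 100.0), ('cat', 10.0), ('the', 50.0)]:
        raw_counts[key] += val

    assert raw_counts['the'] == 150.0  # counts merge rather than overwrite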
From e88cf3fdaf51c5af4b35e9b3261849800fa93ced Mon Sep 17 00:00:00 2001
From: Andrew Lin
Date: Mon, 3 Aug 2015 11:09:44 -0400
Subject: [PATCH 4/4] Document the NFKC-normalized ligature in the Arabic test.

Former-commit-id: 41e1dd41d82358fd44f972e501c8586d0bbd64a2
---
 tests/test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test.py b/tests/test.py
index d68db1e..679811c 100644
--- a/tests/test.py
+++ b/tests/test.py
@@ -149,6 +149,6 @@ def test_ar():
     )
 
     eq_(
-        tokenize('\ufefb', 'ar'),
-        ['\u0644\u0627']
+        tokenize('\ufefb', 'ar'),  # An Arabic ligature...
+        ['\u0644\u0627']  # ...that is affected by NFKC normalization
     )
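For reference, the normalization this test documents can be verified directly
with Python's standard library; the snippet below is illustrative and not part
of the patch. U+FEFB is ARABIC LIGATURE LAM WITH ALEF ISOLATED FORM, and NFKC
compatibility normalization decomposes it into LAM (U+0644) followed by ALEF
(U+0627):

    import unicodedata

    # The lam-alef ligature decomposes under NFKC into its two letters.
    assert unicodedata.normalize('NFKC', '\ufefb') == '\u0644\u0627'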