Merge pull request #21 from LuminosoInsight/review-notes

Review notes

Former-commit-id: 2b8089e2b1
Robyn Speer 2015-08-03 14:48:15 -04:00
commit e15fc14b8e
3 changed files with 5 additions and 22 deletions


@@ -149,6 +149,6 @@ def test_ar():
     )
     eq_(
-        tokenize('إﻻ', 'ar'),
-        ['إلا']
+        tokenize('\ufefb', 'ar'),    # An Arabic ligature...
+        ['\u0644\u0627']             # ...that is affected by NFKC normalization
     )
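
The new test pins down the exact codepoints: U+FEFB is the isolated lam-alef ligature, and under NFKC it decomposes into the two letters lam (U+0644) and alef (U+0627), which is what the comment on the added lines refers to. A quick standard-library check (not part of the change, and not necessarily how `tokenize` normalizes internally) shows the same mapping:

```python
import unicodedata

ligature = '\ufefb'  # ARABIC LIGATURE LAM WITH ALEF ISOLATED FORM
normalized = unicodedata.normalize('NFKC', ligature)

print(normalized == '\u0644\u0627')            # True: the ligature becomes LAM + ALEF
print([unicodedata.name(c) for c in normalized])
# ['ARABIC LETTER LAM', 'ARABIC LETTER ALEF']
```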


@@ -82,19 +82,6 @@ The specific rules are described by the comments in `rules.ninja`.
## Data sources
### Wikipedia
Wikipedia is a "free-access, free-content Internet encyclopedia".
These files can be downloaded from the [Wikimedia dump site][wikipedia].
The original files are in `data/raw-input/wikipedia`, and they're processed
by the `wiki2text` rule in `rules.ninja`. Parsing Wikipedia requires the
[wiki2text][] package.
[wikipedia]: https://dumps.wikimedia.org/backup-index.html
[wiki2text]: https://github.com/rspeer/wiki2text
### Leeds Internet Corpus
Also known as the "Web as Corpus" project, this is a University of Leeds


@@ -39,20 +39,16 @@ def read_freqs(filename, cutoff=0, lang=None):
     raw_counts = defaultdict(float)
     total = 0.
     with open(filename, encoding='utf-8', newline='') as infile:
-        reader = csv.reader(infile)
-        for key, strval in reader:
+        for key, strval in csv.reader(infile):
             val = float(strval)
             if val < cutoff:
                 break
             tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
             for token in tokens:
-                token = fix_text(token)
-                total += val
                 # Use += so that, if we give the reader concatenated files with
                 # duplicates, it does the right thing
-                raw_counts[token] += val
+                raw_counts[fix_text(token)] += val
+                total += val
     for word in raw_counts:
         raw_counts[word] /= total
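
For reference, here is `read_freqs` as it reads after this change, written out as a self-contained sketch. The import locations (`fix_text` from ftfy, `tokenize`/`simple_tokenize` from wordfreq) and the trailing `return raw_counts` are assumptions made to keep the sketch runnable; the hunk does not show them, and the builder's own modules may differ.

```python
import csv
from collections import defaultdict

from ftfy import fix_text                       # assumed import location
from wordfreq import tokenize, simple_tokenize  # assumed import location


def read_freqs(filename, cutoff=0, lang=None):
    """Read rows of (token, count) from a CSV file and return a dict of
    frequencies normalized so that they sum to 1."""
    raw_counts = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        for key, strval in csv.reader(infile):
            val = float(strval)
            # The break assumes the file is sorted by descending count, so
            # everything past the cutoff can be skipped entirely.
            if val < cutoff:
                break
            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
            for token in tokens:
                # Use += so that, if we give the reader concatenated files with
                # duplicates, it does the right thing
                raw_counts[fix_text(token)] += val
                total += val
    for word in raw_counts:
        raw_counts[word] /= total
    return raw_counts  # assumed: the return falls outside the displayed hunk
```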