Merge pull request #21 from LuminosoInsight/review-notes

Review notes

Former-commit-id: 2b8089e2b1
commit e15fc14b8e
Author: Robyn Speer
Date: 2015-08-03 14:48:15 -04:00
3 changed files with 5 additions and 22 deletions


@@ -149,6 +149,6 @@ def test_ar():
     )
     eq_(
-        tokenize('إﻻ', 'ar'),
-        ['إلا']
+        tokenize('\ufefb', 'ar'),  # An Arabic ligature...
+        ['\u0644\u0627']  # ...that is affected by NFKC normalization
     )
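The reason for the change: U+FEFB is the Arabic presentation-form ligature "lam with alef", and NFKC normalization decomposes it into the two separate letters lam (U+0644) and alef (U+0627). A minimal check with Python's standard library:

    import unicodedata

    ligature = '\ufefb'  # ARABIC LIGATURE LAM WITH ALEF ISOLATED FORM
    assert unicodedata.normalize('NFKC', ligature) == '\u0644\u0627'  # lam + alef

Writing the test with explicit escapes makes the intent visible, since the ligature and its decomposed form render almost identically.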


@@ -82,19 +82,6 @@ The specific rules are described by the comments in `rules.ninja`.
 ## Data sources
-### Wikipedia
-
-Wikipedia is a "free-access, free-content Internet encyclopedia".
-These files can be downloaded from the [wikimedia dump][wikipedia].
-
-The original files are in `data/raw-input/wikipedia`, and they're processed
-by the `wiki2text` rule in `rules.ninja`. Parsing Wikipedia requires the
-[wiki2text][] package.
-
-[wikipedia]: https://dumps.wikimedia.org/backup-index.html
-[wiki2text]: https://github.com/rspeer/wiki2text
-
 ### Leeds Internet Corpus
 Also known as the "Web as Corpus" project, this is a University of Leeds


@@ -39,20 +39,16 @@ def read_freqs(filename, cutoff=0, lang=None):
     raw_counts = defaultdict(float)
     total = 0.
     with open(filename, encoding='utf-8', newline='') as infile:
-        reader = csv.reader(infile)
-        for key, strval in reader:
+        for key, strval in csv.reader(infile):
             val = float(strval)
             if val < cutoff:
                 break
             tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
             for token in tokens:
-                token = fix_text(token)
-                total += val
                 # Use += so that, if we give the reader concatenated files with
                 # duplicates, it does the right thing
-                raw_counts[token] += val
+                raw_counts[fix_text(token)] += val
+                total += val

     for word in raw_counts:
         raw_counts[word] /= total
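Putting the hunk together, here is a sketch of read_freqs after this change. The imports are assumptions inferred from the names the diff uses (fix_text presumably comes from ftfy; tokenize and simple_tokenize from this package), and the return statement is not shown in the diff:

    from collections import defaultdict
    import csv

    from ftfy import fix_text                        # assumed source of fix_text
    from wordfreq import simple_tokenize, tokenize   # assumed import path


    def read_freqs(filename, cutoff=0, lang=None):
        raw_counts = defaultdict(float)
        total = 0.
        with open(filename, encoding='utf-8', newline='') as infile:
            for key, strval in csv.reader(infile):
                val = float(strval)
                # `break`, not `continue`: the file is assumed to be sorted
                # by descending value, so everything past the cutoff can be
                # skipped at once
                if val < cutoff:
                    break
                tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
                for token in tokens:
                    # Use += so that, if we give the reader concatenated files
                    # with duplicates, it does the right thing
                    raw_counts[fix_text(token)] += val
                    total += val

        # normalize the raw counts into frequencies that sum to 1
        for word in raw_counts:
            raw_counts[word] /= total
        return raw_counts  # assumed return value; not shown in the diff

The behavioral footprint of the change is small: fix_text is now applied inline at the point of use instead of rebinding the loop variable, and csv.reader is consumed directly rather than being bound to a temporary name.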