Mirror of https://github.com/rspeer/wordfreq.git
Commit 2b8089e2b1
@@ -149,6 +149,6 @@ def test_ar():
     )
     eq_(
-        tokenize('إﻻ', 'ar'),
-        ['إلا']
+        tokenize('\ufefb', 'ar'), # An Arabic ligature...
+        ['\u0644\u0627'] # ...that is affected by NFKC normalization
     )
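For reference, the new expectation follows directly from Unicode: U+FEFB (ARABIC LIGATURE LAM WITH ALEF ISOLATED FORM) decomposes under NFKC compatibility normalization into U+0644 LAM followed by U+0627 ALEF. A minimal standard-library sketch, independent of wordfreq's own tokenizer, that checks this:

```python
import unicodedata

# U+FEFB is a presentation-form ligature of LAM + ALEF.
ligature = '\ufefb'

# NFKC compatibility normalization expands it into the two underlying letters.
normalized = unicodedata.normalize('NFKC', ligature)

assert normalized == '\u0644\u0627'  # LAM (ل) followed by ALEF (ا)
print(normalized)                    # prints: لا
```

The test's comment ("affected by NFKC normalization") indicates that `tokenize()` applies this same normalization, which is why the ligature input and the two-letter expected output are equivalent.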
@@ -82,19 +82,6 @@ The specific rules are described by the comments in `rules.ninja`.
 
 ## Data sources
 
-### Wikipedia
-
-Wikipedia is a "free-access, free-content Internet encyclopedia".
-
-These files can be downloaded from [wikimedia dump][wikipedia]
-
-The original files are in `data/raw-input/wikipedia`, and they're processed
-by the `wiki2text` rule in `rules.ninja`. Parsing wikipedia requires the
-[wiki2text][] package.
-
-[wikipedia]: https://dumps.wikimedia.org/backup-index.html
-[wiki2text]: https://github.com/rspeer/wiki2text
-
 ### Leeds Internet Corpus
 
 Also known as the "Web as Corpus" project, this is a University of Leeds
@@ -39,20 +39,16 @@ def read_freqs(filename, cutoff=0, lang=None):
     raw_counts = defaultdict(float)
     total = 0.
     with open(filename, encoding='utf-8', newline='') as infile:
-        reader = csv.reader(infile)
-        for key, strval in reader:
+        for key, strval in csv.reader(infile):
             val = float(strval)
             if val < cutoff:
                 break
 
             tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
             for token in tokens:
-                token = fix_text(token)
-                total += val
                 # Use += so that, if we give the reader concatenated files with
                 # duplicates, it does the right thing
-                raw_counts[token] += val
+                raw_counts[fix_text(token)] += val
+                total += val
 
     for word in raw_counts:
         raw_counts[word] /= total
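Read as a whole, the updated loop consumes one `word,count` row at a time, stops at the first count below `cutoff` (which implies the file is sorted by descending count), folds `fix_text` into the dictionary key so that variant encodings of the same normalized token are merged, and finally divides every count by the running total. Below is a hedged, self-contained sketch of the function as it looks after this change; the import locations and the trailing `return raw_counts` are assumptions, since the hunk ends before them:

```python
import csv
from collections import defaultdict

from ftfy import fix_text                        # assumed import location
from wordfreq import tokenize, simple_tokenize   # assumed import location


def read_freqs(filename, cutoff=0, lang=None):
    raw_counts = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        for key, strval in csv.reader(infile):
            val = float(strval)
            # Counts are read in descending order, so the first value below
            # the cutoff ends the scan.
            if val < cutoff:
                break

            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
            for token in tokens:
                # Use += so that, if we give the reader concatenated files with
                # duplicates, it does the right thing
                raw_counts[fix_text(token)] += val
                total += val

    # Rescale the accumulated counts into frequencies that sum to 1.
    for word in raw_counts:
        raw_counts[word] /= total
    return raw_counts  # assumed; the hunk ends before the function's return
```

One consequence worth noting: `total` is incremented once per token rather than once per row, so a multi-token key contributes its count once for each of its tokens, keeping the denominator consistent with the per-token numerators.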