mirror of
https://github.com/rspeer/wordfreq.git
synced 2024-12-23 09:21:37 +00:00
Normalize words when storing them or looking them up.
This commit is contained in:
parent
ea5de7cb2a
commit
8f00846117
1
setup.py
1
setup.py
@@ -49,6 +49,7 @@ class SimpleCommand(Command):
|
||||
def finalize_options(self):
|
||||
pass
|
||||
|
||||
|
||||
class BuildDatabaseCommand(SimpleCommand):
|
||||
description = "Build the word frequency database from raw data"
|
||||
def run(self):
|
||||
|
@@ -4,17 +4,17 @@ import codecs
|
||||
import re
|
||||
import os
|
||||
import logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from ftfy import ftfy
|
||||
from wordfreq import config, schema
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
from wordfreq.util import standardize_word
|
||||
|
||||
|
||||
def read_csv(filename):
|
||||
"""
|
||||
Load word frequencies from a file of comma-separated values, where
|
||||
each line contains a term, a comma, and its frequency.
|
||||
each line contains a word, a comma, and its frequency.
|
||||
|
||||
Scale the frequencies so they add up to 1.0, and return them as a
|
||||
dictionary.
|
||||
@@ -27,16 +27,21 @@ def read_multilingual_csv(filename):
|
||||
Load word frequencies from a file of comma-separated values, where
|
||||
each line is of the form:
|
||||
|
||||
term|lang,freq
|
||||
word|lang,freq
|
||||
|
||||
Scale the frequencies so they add up to 1.0 *for each language*,
|
||||
and return a dictionary from language -> (word -> freq).
|
||||
"""
|
||||
unscaled = defaultdict(dict)
|
||||
raw_freqs = _read_csv_basic(filename)
|
||||
for termlang in raw_freqs:
|
||||
term, lang = termlang.rsplit('|', 1)
|
||||
unscaled[lang][term] = raw_freqs[termlang]
|
||||
for wordlang in raw_freqs:
|
||||
word, lang = wordlang.rsplit('|', 1)
|
||||
word = standardize_word(word)
|
||||
|
||||
# The CSV reader has standardized everything to uppercase.
|
||||
# Fix that for the language codes, which should be lowercase.
|
||||
lang = lang.lower()
|
||||
unscaled[lang][word] = raw_freqs[wordlang]
|
||||
|
||||
scaled = {}
|
||||
for key in unscaled:
|
||||
@@ -52,7 +57,7 @@ def _read_csv_basic(filename):
|
||||
line = line.rstrip(u'\n')
|
||||
word, count = line.rsplit(u',', 1)
|
||||
count = float(count)
|
||||
counts[word] = count
|
||||
counts[standardize_word(word)] = count
|
||||
return counts
|
||||
|
||||
|
||||
@@ -73,7 +78,7 @@ def read_leeds_corpus(filename):
|
||||
rank = line.split(u' ')[0]
|
||||
if NUMBER_RE.match(rank) and line.count(u' ') == 2:
|
||||
_, freq, token = line.split(u' ')
|
||||
token = ftfy(token).lower()
|
||||
token = standardize_word(ftfy(token))
|
||||
freq = float(freq)
|
||||
counts[token] += freq
|
||||
|
||||
|
@@ -5,7 +5,7 @@ DB_DIR = (os.environ.get('WORDFREQ_DATA')
|
||||
or os.path.expanduser('~/.cache/wordfreq'))
|
||||
|
||||
# When the minor version number increments, the data may change.
|
||||
VERSION = '0.1.1'
|
||||
VERSION = '0.2.0'
|
||||
MINOR_VERSION = '.'.join(VERSION.split('.')[:2])
|
||||
|
||||
# Put these options together to make a database filename.
|
||||
|
@@ -1,4 +1,5 @@
|
||||
from wordfreq.config import DB_FILENAME, CACHE_SIZE
|
||||
from wordfreq.util import standardize_word
|
||||
import sqlite3
|
||||
import sys
|
||||
|
||||
@@ -35,7 +36,7 @@ def word_frequency(word, lang, wordlist='multi', offset=0.):
|
||||
"""
|
||||
c = CONN.cursor()
|
||||
c.execute("SELECT freq from words where word=? and lang=? and wordlist=?",
|
||||
(word, lang, wordlist))
|
||||
(standardize_word(word), lang, wordlist))
|
||||
row = c.fetchone()
|
||||
if row is None:
|
||||
return offset
|
||||
|
Loading…
Reference in New Issue
Block a user