From 8f00846117738f154b10e04ea237df7a9736d45d Mon Sep 17 00:00:00 2001 From: Robyn Speer Date: Wed, 30 Oct 2013 14:59:26 -0400 Subject: [PATCH] Normalize words when storing them or looking them up. --- setup.py | 1 + wordfreq/build.py | 23 ++++++++++++++--------- wordfreq/config.py | 2 +- wordfreq/query.py | 3 ++- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/setup.py b/setup.py index d87903e..0ef1848 100755 --- a/setup.py +++ b/setup.py @@ -49,6 +49,7 @@ class SimpleCommand(Command): def finalize_options(self): pass + class BuildDatabaseCommand(SimpleCommand): description = "Build the word frequency database from raw data" def run(self): diff --git a/wordfreq/build.py b/wordfreq/build.py index 1409388..f1604ac 100644 --- a/wordfreq/build.py +++ b/wordfreq/build.py @@ -4,17 +4,17 @@ import codecs import re import os import logging +logger = logging.getLogger(__name__) from ftfy import ftfy from wordfreq import config, schema - -logger = logging.getLogger(__name__) +from wordfreq.util import standardize_word def read_csv(filename): """ Load word frequencies from a file of comma-separated values, where - each line contains a term, a comma, and its frequency. + each line contains a word, a comma, and its frequency. Scale the frequencies so they add up to 1.0, and return them as a dictionary. @@ -27,16 +27,21 @@ def read_multilingual_csv(filename): Load word frequencies from a file of comma-separated values, where each line is of the form: - term|lang,freq + word|lang,freq Scale the frequencies so they add up to 1.0 *for each language*, and return a dictionary from language -> (word -> freq). """ unscaled = defaultdict(dict) raw_freqs = _read_csv_basic(filename) - for termlang in raw_freqs: - term, lang = termlang.rsplit('|', 1) - unscaled[lang][term] = raw_freqs[termlang] + for wordlang in raw_freqs: + word, lang = wordlang.rsplit('|', 1) + word = standardize_word(word) + + # The CSV reader has standardized everything to uppercase. + # Fix that for the language codes, which should be lowercase. + lang = lang.lower() + unscaled[lang][word] = raw_freqs[wordlang] scaled = {} for key in unscaled: @@ -52,7 +57,7 @@ def _read_csv_basic(filename): line = line.rstrip(u'\n') word, count = line.rsplit(u',', 1) count = float(count) - counts[word] = count + counts[standardize_word(word)] = count return counts @@ -73,7 +78,7 @@ def read_leeds_corpus(filename): rank = line.split(u' ')[0] if NUMBER_RE.match(rank) and line.count(u' ') == 2: _, freq, token = line.split(u' ') - token = ftfy(token).lower() + token = standardize_word(ftfy(token)) freq = float(freq) counts[token] += freq diff --git a/wordfreq/config.py b/wordfreq/config.py index 7f38e17..b98d119 100644 --- a/wordfreq/config.py +++ b/wordfreq/config.py @@ -5,7 +5,7 @@ DB_DIR = (os.environ.get('WORDFREQ_DATA') or os.path.expanduser('~/.cache/wordfreq')) # When the minor version number increments, the data may change. -VERSION = '0.1.1' +VERSION = '0.2.0' MINOR_VERSION = '.'.join(VERSION.split('.')[:2]) # Put these options together to make a database filename. diff --git a/wordfreq/query.py b/wordfreq/query.py index d2cffbd..14c18c5 100644 --- a/wordfreq/query.py +++ b/wordfreq/query.py @@ -1,4 +1,5 @@ from wordfreq.config import DB_FILENAME, CACHE_SIZE +from wordfreq.util import standardize_word import sqlite3 import sys @@ -35,7 +36,7 @@ def word_frequency(word, lang, wordlist='multi', offset=0.): """ c = CONN.cursor() c.execute("SELECT freq from words where word=? and lang=? and wordlist=?", - (word, lang, wordlist)) + (standardize_word(word), lang, wordlist)) row = c.fetchone() if row is None: return offset