From 8f00846117738f154b10e04ea237df7a9736d45d Mon Sep 17 00:00:00 2001
From: Robyn Speer <rspeer@luminoso.com>
Date: Wed, 30 Oct 2013 14:59:26 -0400
Subject: [PATCH] Normalize words when storing them or looking them up.

---
 setup.py           |  1 +
 wordfreq/build.py  | 23 ++++++++++++++---------
 wordfreq/config.py |  2 +-
 wordfreq/query.py  |  3 ++-
 4 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/setup.py b/setup.py
index d87903e..0ef1848 100755
--- a/setup.py
+++ b/setup.py
@@ -49,6 +49,7 @@ class SimpleCommand(Command):
     def finalize_options(self):
         pass
 
+
 class BuildDatabaseCommand(SimpleCommand):
     description = "Build the word frequency database from raw data"
     def run(self):
diff --git a/wordfreq/build.py b/wordfreq/build.py
index 1409388..f1604ac 100644
--- a/wordfreq/build.py
+++ b/wordfreq/build.py
@@ -4,17 +4,17 @@ import codecs
 import re
 import os
 import logging
+logger = logging.getLogger(__name__)
 
 from ftfy import ftfy
 from wordfreq import config, schema
-
-logger = logging.getLogger(__name__)
+from wordfreq.util import standardize_word
 
 
 def read_csv(filename):
     """
     Load word frequencies from a file of comma-separated values, where
-    each line contains a term, a comma, and its frequency.
+    each line contains a word, a comma, and its frequency.
 
     Scale the frequencies so they add up to 1.0, and return them as a
     dictionary.
@@ -27,16 +27,21 @@ def read_multilingual_csv(filename):
     Load word frequencies from a file of comma-separated values, where
     each line is of the form:
 
-        term|lang,freq
+        word|lang,freq
 
     Scale the frequencies so they add up to 1.0 *for each language*,
     and return a dictionary from language -> (word -> freq).
     """
     unscaled = defaultdict(dict)
     raw_freqs = _read_csv_basic(filename)
-    for termlang in raw_freqs:
-        term, lang = termlang.rsplit('|', 1)
-        unscaled[lang][term] = raw_freqs[termlang]
+    for wordlang in raw_freqs:
+        word, lang = wordlang.rsplit('|', 1)
+        word = standardize_word(word)
+
+        # _read_csv_basic ran standardize_word over the whole 'word|lang'
+        # key, uppercasing the language code too; restore it to lowercase.
+        lang = lang.lower()
+        unscaled[lang][word] = raw_freqs[wordlang]
 
     scaled = {}
     for key in unscaled:
@@ -52,7 +57,7 @@ def _read_csv_basic(filename):
         line = line.rstrip(u'\n')
         word, count = line.rsplit(u',', 1)
         count = float(count)
-        counts[word] = count
+        counts[standardize_word(word)] = count
     return counts
 
 
@@ -73,7 +78,7 @@ def read_leeds_corpus(filename):
             rank = line.split(u' ')[0]
             if NUMBER_RE.match(rank) and line.count(u' ') == 2:
                 _, freq, token = line.split(u' ')
-                token = ftfy(token).lower()
+                token = standardize_word(ftfy(token))
                 freq = float(freq)
                 counts[token] += freq
 
diff --git a/wordfreq/config.py b/wordfreq/config.py
index 7f38e17..b98d119 100644
--- a/wordfreq/config.py
+++ b/wordfreq/config.py
@@ -5,7 +5,7 @@ DB_DIR = (os.environ.get('WORDFREQ_DATA')
           or os.path.expanduser('~/.cache/wordfreq'))
 
 # When the minor version number increments, the data may change.
-VERSION = '0.1.1'
+VERSION = '0.2.0'
 MINOR_VERSION = '.'.join(VERSION.split('.')[:2])
 
 # Put these options together to make a database filename.
diff --git a/wordfreq/query.py b/wordfreq/query.py
index d2cffbd..14c18c5 100644
--- a/wordfreq/query.py
+++ b/wordfreq/query.py
@@ -1,4 +1,5 @@
 from wordfreq.config import DB_FILENAME, CACHE_SIZE
+from wordfreq.util import standardize_word
 import sqlite3
 import sys
 
@@ -35,7 +36,7 @@ def word_frequency(word, lang, wordlist='multi', offset=0.):
     """
     c = CONN.cursor()
     c.execute("SELECT freq from words where word=? and lang=? and wordlist=?",
-              (word, lang, wordlist))
+              (standardize_word(word), lang, wordlist))
     row = c.fetchone()
     if row is None:
         return offset