changeset 3:56c58a177cde

First attempt at optimising levenshtein distance
author Jordi Gutiérrez Hermoso <jordigh@gmail.com>
date Thu, 30 Dec 2010 01:26:07 -0600
parents 0748d902784c
children de7bbee608ee
files levenshtein.c++ main.c++
diffstat 2 files changed, 71 insertions(+), 21 deletions(-) [+]
line wrap: on
line diff
--- a/levenshtein.c++
+++ b/levenshtein.c++
@@ -1,35 +1,84 @@
 #include <string>
 #include <vector>
 #include <iostream>
-
+#include <algorithm>
 
-size_t min(size_t x, size_t y)
-{
-  return x < y ? x : y;
-}
-
-
-size_t levenshtein(const std::string& a,
-                   const std::string& b,
-                   size_t m)
+size_t levenshtein(std::string& a,
+                   std::string& b,
+                   size_t m = 100) //the cutoff maximum
 {
   using namespace std;
-  vector<vector<size_t> > matrix(a.size()+1, vector<size_t>(b.size()+1, 0));
+  size_t asize=a.size(), bsize = b.size();
+  bool swapped = false;
 
-  for(size_t i = 0; i < a.size()+1; i++)
+  if( asize > bsize)
+  {
+    swapped = true;
+    swap(a,b);
+    swap(asize,bsize);
+  }
+
+  //A bit of a dumb heuristic, but seems to help a little
+  if( long(bsize)  - long(asize) > m)
   {
-    matrix[i][0] = i;
-    for(size_t j = 1; j < b.size()+1; j++)
+    if(swapped)
+      swap(a,b); //unswap
+    return m;
+  }
+
+  //Narrowing won't help, let's do the full matrix, easier to handle
+  //this special case here.
+  if( bsize <= m/2)
+  {
+    //Just need two rows at a time
+    vector<size_t> prevrow(bsize+1), thisrow(bsize+1);
+
+    for(size_t i = 0; i <= bsize; i++)
+      prevrow[i] = i;
+
+    for(size_t i = 1; i <= asize; i ++)
     {
-      if(i == 0)
+      thisrow[0] = i;
+      for(size_t j = 1; j <= bsize; j++)
       {
-        matrix[0][j] = j;
-        continue;
+        thisrow[j] = min(prevrow[j-1] + size_t(a[i-1] != b[j-1]),
+                         1 + min(prevrow[j],thisrow[j-1]) );
       }
-      matrix[i][j] = min( matrix[i-1][j-1] + size_t(a[i-1] != b[j-1]),
-                          1 + min(matrix[i-1][j],
-                                  matrix[i][j-1]));
+      swap(thisrow,prevrow);
     }
+
+    if(swapped)
+      swap(a,b); //unswap
+
+    return prevrow[bsize];
   }
-  return matrix[a.size()][b.size()];
+  //Otherwise, we do the narrower algorithm...
+
+  //Only need to look at the diagonal of width 2m+1.
+  size_t diag = 2*m+1;
+
+  //Pad on each side in order to make edge cases easier, fill
+  //everything with our max allowed value, can only go down from
+  //there.
+  vector<size_t> thisrow(diag+2,100), prevrow(diag+2,100);
+
+  for(size_t i = m+1, j = 0; i < diag+2; i++, j++)
+    prevrow[i] = j;
+
+  for(size_t i = 1; i <= asize; i++)
+  {
+    for(size_t j = max(size_t(1), m + 1 - i);
+        j <= min(diag, diag + bsize - m - i); j++)
+    {
+      //This one handles all possible cases, thanks to padding.
+      thisrow[j] = min(prevrow[j-1] + size_t(a[i-1] != b[j-1-m-i]),
+                       1 + min(prevrow[j], thisrow[j-1]) );
+    }
+    swap(thisrow,prevrow);
+  }
+
+  if(swapped)
+    swap(a,b); //unswap
+
+  return min(prevrow[diag+1],m);
 }
--- a/main.c++
+++ b/main.c++
@@ -33,6 +33,7 @@
       if(m == 0)
         break;
     }
+    //cout << word << ": " << m << endl;
     total_distance += m;
   }
   cout << total_distance << endl;