# HG changeset patch # User Jordi GutiƩrrez Hermoso # Date 1293693967 21600 # Node ID 56c58a177cdef85dc7b1d0caf2a5aaac75219b32 # Parent 0748d902784cbca93a703496abfe5d6734dfa1c7 First attempt at optimising levenshtein distance diff --git a/levenshtein.c++ b/levenshtein.c++ --- a/levenshtein.c++ +++ b/levenshtein.c++ @@ -1,35 +1,84 @@ #include #include #include - +#include -size_t min(size_t x, size_t y) -{ - return x < y ? x : y; -} - - -size_t levenshtein(const std::string& a, - const std::string& b, - size_t m) +size_t levenshtein(std::string& a, + std::string& b, + size_t m = 100) //the cutoff maximum { using namespace std; - vector > matrix(a.size()+1, vector(b.size()+1, 0)); + size_t asize=a.size(), bsize = b.size(); + bool swapped = false; - for(size_t i = 0; i < a.size()+1; i++) + if( asize > bsize) + { + swapped = true; + swap(a,b); + swap(asize,bsize); + } + + //A bit of a dumb heuristic, but seems to help a little + if( long(bsize) - long(asize) > m) { - matrix[i][0] = i; - for(size_t j = 1; j < b.size()+1; j++) + if(swapped) + swap(a,b); //unswap + return m; + } + + //Narrowing won't help, let's do the full matrix, easier to handle + //this special case here. + if( bsize <= m/2) + { + //Just need two rows at a time + vector prevrow(bsize+1), thisrow(bsize+1); + + for(size_t i = 0; i <= bsize; i++) + prevrow[i] = i; + + for(size_t i = 1; i <= asize; i ++) { - if(i == 0) + thisrow[0] = i; + for(size_t j = 1; j <= bsize; j++) { - matrix[0][j] = j; - continue; + thisrow[j] = min(prevrow[j-1] + size_t(a[i-1] != b[j-1]), + 1 + min(prevrow[j],thisrow[j-1]) ); } - matrix[i][j] = min( matrix[i-1][j-1] + size_t(a[i-1] != b[j-1]), - 1 + min(matrix[i-1][j], - matrix[i][j-1])); + swap(thisrow,prevrow); } + + if(swapped) + swap(a,b); //unswap + + return prevrow[bsize]; } - return matrix[a.size()][b.size()]; + //Otherwise, we do the narrower algorithm... + + //Only need to look at the diagonal of width 2m+1. + size_t diag = 2*m+1; + + //Pad on each side in order to make edge cases easier, fill + //everything with our max allowed value, can only go down from + //there. + vector thisrow(diag+2,100), prevrow(diag+2,100); + + for(size_t i = m+1, j = 0; i < diag+2; i++, j++) + prevrow[i] = j; + + for(size_t i = 1; i <= asize; i++) + { + for(size_t j = max(size_t(1), m + 1 - i); + j <= min(diag, diag + bsize - m - i); j++) + { + //This one handles all possible cases, thanks to padding. + thisrow[j] = min(prevrow[j-1] + size_t(a[i-1] != b[j-1-m-i]), + 1 + min(prevrow[j], thisrow[j-1]) ); + } + swap(thisrow,prevrow); + } + + if(swapped) + swap(a,b); //unswap + + return min(prevrow[diag+1],m); } diff --git a/main.c++ b/main.c++ --- a/main.c++ +++ b/main.c++ @@ -33,6 +33,7 @@ if(m == 0) break; } + //cout << word << ": " << m << endl; total_distance += m; } cout << total_distance << endl;