# HG changeset patch
# User Jaroslav Hajek <highegg@gmail.com>
# Date 1267607592 -3600
# Node ID 5af0b4bb384d14efd7e449fdb7caae553cfd2f82
# Parent  eb9b2674501eefeea61f08c68c655c23fe385df4
rewrite convn optimizations based on xAXPY

diff --git a/libcruft/ChangeLog b/libcruft/ChangeLog
--- a/libcruft/ChangeLog
+++ b/libcruft/ChangeLog
@@ -1,3 +1,10 @@
+2010-03-03  Jaroslav Hajek  <highegg@gmail.com>
+
+	* blas-xtra/cconv2.f, blas-xtra/csconv2.f, blas-xtra/dconv2.f,
+	blas-xtra/sconv2.f, blas-xtra/zconv2.f, blas-xtra/zdconv2.f:
+	New sources.
+	* blas-xtra/module.mk: Add them here.
+
 2010-03-02  John W. Eaton  <jwe@octave.org>
 
 	* misc/cquit.c (octave_restore_signal_mask): Assume we have
diff --git a/libcruft/blas-xtra/cconv2.f b/libcruft/blas-xtra/cconv2.f
new file mode 100644
--- /dev/null
+++ b/libcruft/blas-xtra/cconv2.f
@@ -0,0 +1,77 @@
+c Copyright (C) 2010  VZLU Prague, a.s., Czech Republic
+c
+c Author: Jaroslav Hajek <highegg@gmail.com>
+c
+c This file is part of Octave.
+c
+c Octave is free software; you can redistribute it and/or modify
+c it under the terms of the GNU General Public License as published by
+c the Free Software Foundation; either version 3 of the License, or
+c (at your option) any later version.
+c
+c This program is distributed in the hope that it will be useful,
+c but WITHOUT ANY WARRANTY; without even the implied warranty of
+c MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+c GNU General Public License for more details.
+c
+c You should have received a copy of the GNU General Public License
+c along with this software; see the file COPYING.  If not, see
+c <http://www.gnu.org/licenses/>.
+c
+      subroutine cconv2o(ma,na,a,mb,nb,b,c)
+c purpose:      a 2-dimensional outer additive convolution.
+c               equivalent to the following:
+c                 for i = 1:ma
+c                   for j = 1:na
+c                     c(i:i+mb-1,j:j+mb-1) += a(i,j)*b
+c                   endfor
+c                 endfor
+c arguments:
+c ma,na (in)    dimensions of a
+c a (in)        1st matrix
+c mb,nb (in)    dimensions of b
+c b (in)        2nd matrix
+c c (inout)     accumulator matrix, size (ma+mb-1, na+nb-1)
+c
+      integer ma,na,mb,nb
+      complex a(ma,na),b(mb,nb)
+      complex c(ma+mb-1,na+nb-1)
+      integer i,j,k
+      external caxpy
+      do k = 1,na
+        do j = 1,nb
+          do i = 1,mb
+            call caxpy(ma,b(i,j),a(1,k),1,c(i,j+k-1),1)
+          end do
+        end do
+      end do
+      end subroutine
+
+      subroutine cconv2i(ma,na,a,mb,nb,b,c)
+c purpose:      a 2-dimensional inner additive convolution.
+c               equivalent to the following:
+c                 for i = 1:ma-mb+1
+c                   for j = 1:na-nb+1
+c                     c(i,j) = sum (sum (a(i:i+mb-1,j:j+nb-1) .* b))
+c                   endfor
+c                 endfor
+c arguments:
+c ma,na (in)    dimensions of a
+c a (in)        1st matrix
+c mb,nb (in)    dimensions of b
+c b (in)        2nd matrix
+c c (inout)     accumulator matrix, size (ma+mb-1, na+nb-1)
+c
+      integer ma,na,mb,nb
+      complex a(ma,na),b(mb,nb)
+      complex c(ma-mb+1,na-nb+1)
+      integer i,j,k
+      external caxpy
+      do k = 1,na-nb+1
+        do j = 1,nb
+          do i = 1,mb
+            call caxpy(ma-mb+1,b(i,j),a(i,k+j-1),1,c(1,k),1)
+          end do
+        end do
+      end do
+      end subroutine
diff --git a/libcruft/blas-xtra/csconv2.f b/libcruft/blas-xtra/csconv2.f
new file mode 100644
--- /dev/null
+++ b/libcruft/blas-xtra/csconv2.f
@@ -0,0 +1,83 @@
+c Copyright (C) 2010  VZLU Prague, a.s., Czech Republic
+c
+c Author: Jaroslav Hajek <highegg@gmail.com>
+c
+c This file is part of Octave.
+c
+c Octave is free software; you can redistribute it and/or modify
+c it under the terms of the GNU General Public License as published by
+c the Free Software Foundation; either version 3 of the License, or
+c (at your option) any later version.
+c
+c This program is distributed in the hope that it will be useful,
+c but WITHOUT ANY WARRANTY; without even the implied warranty of
+c MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+c GNU General Public License for more details.
+c
+c You should have received a copy of the GNU General Public License
+c along with this software; see the file COPYING.  If not, see
+c <http://www.gnu.org/licenses/>.
+c
+      subroutine csconv2o(ma,na,a,mb,nb,b,c)
+c purpose:      a 2-dimensional outer additive convolution.
+c               equivalent to the following:
+c                 for i = 1:ma
+c                   for j = 1:na
+c                     c(i:i+mb-1,j:j+mb-1) += a(i,j)*b
+c                   endfor
+c                 endfor
+c arguments:
+c ma,na (in)    dimensions of a
+c a (in)        1st matrix
+c mb,nb (in)    dimensions of b
+c b (in)        2nd matrix
+c c (inout)     accumulator matrix, size (ma+mb-1, na+nb-1)
+c
+      integer ma,na,mb,nb
+      complex a(ma,na)
+      real b(mb,nb)
+      complex c(ma+mb-1,na+nb-1)
+      complex btmp
+      integer i,j,k
+      external caxpy
+      do k = 1,na
+        do j = 1,nb
+          do i = 1,mb
+            btmp = b(i,j)
+            call caxpy(ma,btmp,a(1,k),1,c(i,j+k-1),1)
+          end do
+        end do
+      end do
+      end subroutine
+
+      subroutine csconv2i(ma,na,a,mb,nb,b,c)
+c purpose:      a 2-dimensional inner additive convolution.
+c               equivalent to the following:
+c                 for i = 1:ma-mb+1
+c                   for j = 1:na-nb+1
+c                     c(i,j) = sum (sum (a(i:i+mb-1,j:j+nb-1) .* b))
+c                   endfor
+c                 endfor
+c arguments:
+c ma,na (in)    dimensions of a
+c a (in)        1st matrix
+c mb,nb (in)    dimensions of b
+c b (in)        2nd matrix
+c c (inout)     accumulator matrix, size (ma+mb-1, na+nb-1)
+c
+      integer ma,na,mb,nb
+      complex a(ma,na)
+      real b(mb,nb)
+      complex c(ma-mb+1,na-nb+1)
+      complex btmp
+      integer i,j,k
+      external caxpy
+      do k = 1,na-nb+1
+        do j = 1,nb
+          do i = 1,mb
+            btmp = b(i,j)
+            call caxpy(ma-mb+1,btmp,a(i,k+j-1),1,c(1,k),1)
+          end do
+        end do
+      end do
+      end subroutine
diff --git a/libcruft/blas-xtra/dconv2.f b/libcruft/blas-xtra/dconv2.f
new file mode 100644
--- /dev/null
+++ b/libcruft/blas-xtra/dconv2.f
@@ -0,0 +1,77 @@
+c Copyright (C) 2010  VZLU Prague, a.s., Czech Republic
+c
+c Author: Jaroslav Hajek <highegg@gmail.com>
+c
+c This file is part of Octave.
+c
+c Octave is free software; you can redistribute it and/or modify
+c it under the terms of the GNU General Public License as published by
+c the Free Software Foundation; either version 3 of the License, or
+c (at your option) any later version.
+c
+c This program is distributed in the hope that it will be useful,
+c but WITHOUT ANY WARRANTY; without even the implied warranty of
+c MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+c GNU General Public License for more details.
+c
+c You should have received a copy of the GNU General Public License
+c along with this software; see the file COPYING.  If not, see
+c <http://www.gnu.org/licenses/>.
+c
+      subroutine dconv2o(ma,na,a,mb,nb,b,c)
+c purpose:      a 2-dimensional outer additive convolution.
+c               equivalent to the following:
+c                 for i = 1:ma
+c                   for j = 1:na
+c                     c(i:i+mb-1,j:j+mb-1) += a(i,j)*b
+c                   endfor
+c                 endfor
+c arguments:
+c ma,na (in)    dimensions of a
+c a (in)        1st matrix
+c mb,nb (in)    dimensions of b
+c b (in)        2nd matrix
+c c (inout)     accumulator matrix, size (ma+mb-1, na+nb-1)
+c
+      integer ma,na,mb,nb
+      double precision a(ma,na),b(mb,nb)
+      double precision c(ma+mb-1,na+nb-1)
+      integer i,j,k
+      external daxpy
+      do k = 1,na
+        do j = 1,nb
+          do i = 1,mb
+            call daxpy(ma,b(i,j),a(1,k),1,c(i,j+k-1),1)
+          end do
+        end do
+      end do
+      end subroutine
+
+      subroutine dconv2i(ma,na,a,mb,nb,b,c)
+c purpose:      a 2-dimensional inner additive convolution.
+c               equivalent to the following:
+c                 for i = 1:ma-mb+1
+c                   for j = 1:na-nb+1
+c                     c(i,j) = sum (sum (a(i:i+mb-1,j:j+nb-1) .* b))
+c                   endfor
+c                 endfor
+c arguments:
+c ma,na (in)    dimensions of a
+c a (in)        1st matrix
+c mb,nb (in)    dimensions of b
+c b (in)        2nd matrix
+c c (inout)     accumulator matrix, size (ma+mb-1, na+nb-1)
+c
+      integer ma,na,mb,nb
+      double precision a(ma,na),b(mb,nb)
+      double precision c(ma-mb+1,na-nb+1)
+      integer i,j,k
+      external daxpy
+      do k = 1,na-nb+1
+        do j = 1,nb
+          do i = 1,mb
+            call daxpy(ma-mb+1,b(i,j),a(i,k+j-1),1,c(1,k),1)
+          end do
+        end do
+      end do
+      end subroutine
diff --git a/libcruft/blas-xtra/module.mk b/libcruft/blas-xtra/module.mk
--- a/libcruft/blas-xtra/module.mk
+++ b/libcruft/blas-xtra/module.mk
@@ -19,4 +19,10 @@
   blas-xtra/xscnrm2.f \
   blas-xtra/xcdotc.f \
   blas-xtra/xcdotu.f \
-  blas-xtra/xerbla.f
+  blas-xtra/xerbla.f \
+  blas-xtra/cconv2.f \
+  blas-xtra/csconv2.f \
+  blas-xtra/dconv2.f \
+  blas-xtra/sconv2.f \
+  blas-xtra/zconv2.f \
+  blas-xtra/zdconv2.f
diff --git a/libcruft/blas-xtra/sconv2.f b/libcruft/blas-xtra/sconv2.f
new file mode 100644
--- /dev/null
+++ b/libcruft/blas-xtra/sconv2.f
@@ -0,0 +1,77 @@
+c Copyright (C) 2010  VZLU Prague, a.s., Czech Republic
+c
+c Author: Jaroslav Hajek <highegg@gmail.com>
+c
+c This file is part of Octave.
+c
+c Octave is free software; you can redistribute it and/or modify
+c it under the terms of the GNU General Public License as published by
+c the Free Software Foundation; either version 3 of the License, or
+c (at your option) any later version.
+c
+c This program is distributed in the hope that it will be useful,
+c but WITHOUT ANY WARRANTY; without even the implied warranty of
+c MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+c GNU General Public License for more details.
+c
+c You should have received a copy of the GNU General Public License
+c along with this software; see the file COPYING.  If not, see
+c <http://www.gnu.org/licenses/>.
+c
+      subroutine sconv2o(ma,na,a,mb,nb,b,c)
+c purpose:      a 2-dimensional outer additive convolution.
+c               equivalent to the following:
+c                 for i = 1:ma
+c                   for j = 1:na
+c                     c(i:i+mb-1,j:j+mb-1) += a(i,j)*b
+c                   endfor
+c                 endfor
+c arguments:
+c ma,na (in)    dimensions of a
+c a (in)        1st matrix
+c mb,nb (in)    dimensions of b
+c b (in)        2nd matrix
+c c (inout)     accumulator matrix, size (ma+mb-1, na+nb-1)
+c
+      integer ma,na,mb,nb
+      real a(ma,na),b(mb,nb)
+      real c(ma+mb-1,na+nb-1)
+      integer i,j,k
+      external saxpy
+      do k = 1,na
+        do j = 1,nb
+          do i = 1,mb
+            call saxpy(ma,b(i,j),a(1,k),1,c(i,j+k-1),1)
+          end do
+        end do
+      end do
+      end subroutine
+
+      subroutine sconv2i(ma,na,a,mb,nb,b,c)
+c purpose:      a 2-dimensional inner additive convolution.
+c               equivalent to the following:
+c                 for i = 1:ma-mb+1
+c                   for j = 1:na-nb+1
+c                     c(i,j) = sum (sum (a(i:i+mb-1,j:j+nb-1) .* b))
+c                   endfor
+c                 endfor
+c arguments:
+c ma,na (in)    dimensions of a
+c a (in)        1st matrix
+c mb,nb (in)    dimensions of b
+c b (in)        2nd matrix
+c c (inout)     accumulator matrix, size (ma+mb-1, na+nb-1)
+c
+      integer ma,na,mb,nb
+      real a(ma,na),b(mb,nb)
+      real c(ma-mb+1,na-nb+1)
+      integer i,j,k
+      external saxpy
+      do k = 1,na-nb+1
+        do j = 1,nb
+          do i = 1,mb
+            call saxpy(ma-mb+1,b(i,j),a(i,k+j-1),1,c(1,k),1)
+          end do
+        end do
+      end do
+      end subroutine
diff --git a/libcruft/blas-xtra/zconv2.f b/libcruft/blas-xtra/zconv2.f
new file mode 100644
--- /dev/null
+++ b/libcruft/blas-xtra/zconv2.f
@@ -0,0 +1,77 @@
+c Copyright (C) 2010  VZLU Prague, a.s., Czech Republic
+c
+c Author: Jaroslav Hajek <highegg@gmail.com>
+c
+c This file is part of Octave.
+c
+c Octave is free software; you can redistribute it and/or modify
+c it under the terms of the GNU General Public License as published by
+c the Free Software Foundation; either version 3 of the License, or
+c (at your option) any later version.
+c
+c This program is distributed in the hope that it will be useful,
+c but WITHOUT ANY WARRANTY; without even the implied warranty of
+c MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+c GNU General Public License for more details.
+c
+c You should have received a copy of the GNU General Public License
+c along with this software; see the file COPYING.  If not, see
+c <http://www.gnu.org/licenses/>.
+c
+      subroutine zconv2o(ma,na,a,mb,nb,b,c)
+c purpose:      a 2-dimensional outer additive convolution.
+c               equivalent to the following:
+c                 for i = 1:ma
+c                   for j = 1:na
+c                     c(i:i+mb-1,j:j+mb-1) += a(i,j)*b
+c                   endfor
+c                 endfor
+c arguments:
+c ma,na (in)    dimensions of a
+c a (in)        1st matrix
+c mb,nb (in)    dimensions of b
+c b (in)        2nd matrix
+c c (inout)     accumulator matrix, size (ma+mb-1, na+nb-1)
+c
+      integer ma,na,mb,nb
+      double complex a(ma,na),b(mb,nb)
+      double complex c(ma+mb-1,na+nb-1)
+      integer i,j,k
+      external zaxpy
+      do k = 1,na
+        do j = 1,nb
+          do i = 1,mb
+            call zaxpy(ma,b(i,j),a(1,k),1,c(i,j+k-1),1)
+          end do
+        end do
+      end do
+      end subroutine
+
+      subroutine zconv2i(ma,na,a,mb,nb,b,c)
+c purpose:      a 2-dimensional inner additive convolution.
+c               equivalent to the following:
+c                 for i = 1:ma-mb+1
+c                   for j = 1:na-nb+1
+c                     c(i,j) = sum (sum (a(i:i+mb-1,j:j+nb-1) .* b))
+c                   endfor
+c                 endfor
+c arguments:
+c ma,na (in)    dimensions of a
+c a (in)        1st matrix
+c mb,nb (in)    dimensions of b
+c b (in)        2nd matrix
+c c (inout)     accumulator matrix, size (ma+mb-1, na+nb-1)
+c
+      integer ma,na,mb,nb
+      double complex a(ma,na),b(mb,nb)
+      double complex c(ma-mb+1,na-nb+1)
+      integer i,j,k
+      external zaxpy
+      do k = 1,na-nb+1
+        do j = 1,nb
+          do i = 1,mb
+            call zaxpy(ma-mb+1,b(i,j),a(i,k+j-1),1,c(1,k),1)
+          end do
+        end do
+      end do
+      end subroutine
diff --git a/libcruft/blas-xtra/zdconv2.f b/libcruft/blas-xtra/zdconv2.f
new file mode 100644
--- /dev/null
+++ b/libcruft/blas-xtra/zdconv2.f
@@ -0,0 +1,83 @@
+c Copyright (C) 2010  VZLU Prague, a.s., Czech Republic
+c
+c Author: Jaroslav Hajek <highegg@gmail.com>
+c
+c This file is part of Octave.
+c
+c Octave is free software; you can redistribute it and/or modify
+c it under the terms of the GNU General Public License as published by
+c the Free Software Foundation; either version 3 of the License, or
+c (at your option) any later version.
+c
+c This program is distributed in the hope that it will be useful,
+c but WITHOUT ANY WARRANTY; without even the implied warranty of
+c MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+c GNU General Public License for more details.
+c
+c You should have received a copy of the GNU General Public License
+c along with this software; see the file COPYING.  If not, see
+c <http://www.gnu.org/licenses/>.
+c
+      subroutine zdconv2o(ma,na,a,mb,nb,b,c)
+c purpose:      a 2-dimensional outer additive convolution.
+c               equivalent to the following:
+c                 for i = 1:ma
+c                   for j = 1:na
+c                     c(i:i+mb-1,j:j+mb-1) += a(i,j)*b
+c                   endfor
+c                 endfor
+c arguments:
+c ma,na (in)    dimensions of a
+c a (in)        1st matrix
+c mb,nb (in)    dimensions of b
+c b (in)        2nd matrix
+c c (inout)     accumulator matrix, size (ma+mb-1, na+nb-1)
+c
+      integer ma,na,mb,nb
+      double complex a(ma,na)
+      double precision b(mb,nb)
+      double complex c(ma+mb-1,na+nb-1)
+      double complex btmp
+      integer i,j,k
+      external zaxpy
+      do k = 1,na
+        do j = 1,nb
+          do i = 1,mb
+            btmp = b(i,j)
+            call zaxpy(ma,btmp,a(1,k),1,c(i,j+k-1),1)
+          end do
+        end do
+      end do
+      end subroutine
+
+      subroutine zdconv2i(ma,na,a,mb,nb,b,c)
+c purpose:      a 2-dimensional inner additive convolution.
+c               equivalent to the following:
+c                 for i = 1:ma-mb+1
+c                   for j = 1:na-nb+1
+c                     c(i,j) = sum (sum (a(i:i+mb-1,j:j+nb-1) .* b))
+c                   endfor
+c                 endfor
+c arguments:
+c ma,na (in)    dimensions of a
+c a (in)        1st matrix
+c mb,nb (in)    dimensions of b
+c b (in)        2nd matrix
+c c (inout)     accumulator matrix, size (ma+mb-1, na+nb-1)
+c
+      integer ma,na,mb,nb
+      double complex a(ma,na)
+      double precision b(mb,nb)
+      double complex c(ma-mb+1,na-nb+1)
+      double complex btmp
+      integer i,j,k
+      external zaxpy
+      do k = 1,na-nb+1
+        do j = 1,nb
+          do i = 1,mb
+            btmp = b(i,j)
+            call zaxpy(ma-mb+1,btmp,a(i,k+j-1),1,c(1,k),1)
+          end do
+        end do
+      end do
+      end subroutine
diff --git a/liboctave/ChangeLog b/liboctave/ChangeLog
--- a/liboctave/ChangeLog
+++ b/liboctave/ChangeLog
@@ -1,3 +1,10 @@
+2010-03-03  Jaroslav Hajek  <highegg@gmail.com>
+
+	* oct-convn.cc (convolve_2d_axpy_kernel, convolve_2d_axpy): Remove.
+	(convolve_2d): Forward to Fortran implementations, add inner flag.
+	(convolve_nd): Handle inner-convolution case.
+	(convolve): Ditto.
+
 2010-03-02  Jaroslav Hajek  <highegg@gmail.com>
 
 	* oct-convn.h, oct-convn.cc: New sources.
diff --git a/liboctave/oct-convn.cc b/liboctave/oct-convn.cc
--- a/liboctave/oct-convn.cc
+++ b/liboctave/oct-convn.cc
@@ -26,133 +26,77 @@
 
 #include <iostream>
 #include <algorithm>
+
+#include "f77-fcn.h"
+
 #include "oct-convn.h"
 #include "oct-locbuf.h"
 
-
-// FIXME: Only the axpy-form accumulation is used. This is natural for outer
-// convolution as it requires no boundary treatment.
-// This typically requires one more memory store per operation, but as the
-// memory access pattern is equivalent (switching ro and rw parts), I wouldn't
-// expect a significant difference. cf. for instance sum(), where row-wise sum
-// (axpy-based) is no slower than column-wise (dot-based).
-// It would be nice, however, if the inner convolution was computed directly by
-// dot-based accumulation.
-
-// FIXME: Specifying the kernel as outer product should probably get special
-// treatment.
-
-// All kernels smaller than 7x5 get specialized code.
-#define MAX_KERNEL_SIZE_M 7
-#define MAX_KERNEL_SIZE_N 5
-
-template <class T, class R, int mb, int nb>
-static void 
-convolve_2d_kernel_axpy (const T *a, octave_idx_type ma, octave_idx_type na,
-                         const R *b, T *c, octave_idx_type ldc)
-{
-  for (octave_idx_type ja = 0; ja < na; ja++)
-    for (octave_idx_type ia = 0; ia < ma; ia++)
-      {
-        T aij = a[ma*ja + ia];
-        // The following double loop is a candidate for complete unrolling.
-        for (int jb = 0; jb < nb; jb++)
-          for (int ib = 0; ib < mb; ib++)
-            c[(ja+jb)*ldc + (ia+ib)] += aij * b[mb*jb+ib];
-      }
-}
-
-// Kernel dispatcher.
-template <class T, class R>
-static void
-convolve_2d_axpy (const T *a, octave_idx_type ma, octave_idx_type na,
-                  const R *b, octave_idx_type mb, octave_idx_type nb,
-                  T *c, octave_idx_type ldc)
-{
-  // Table of kernels.
-  static void (*table[MAX_KERNEL_SIZE_M][MAX_KERNEL_SIZE_N]) 
-    (const T *, octave_idx_type, octave_idx_type, const R *, T *, octave_idx_type)
-    = {
-        // This must be repeated MAX_KERNEL_SIZE-times. I do not see a way to
-        // automate this.
-#define STORE_KERNEL_POINTER(M,N) \
-        convolve_2d_kernel_axpy<T,R,M,N>
-#define STORE_KERNEL_ROW(M) \
-          { \
-            STORE_KERNEL_POINTER(M,1), \
-            STORE_KERNEL_POINTER(M,2), \
-            STORE_KERNEL_POINTER(M,3), \
-            STORE_KERNEL_POINTER(M,4), \
-            STORE_KERNEL_POINTER(M,5), \
-          }
-
-        STORE_KERNEL_ROW(1),
-        STORE_KERNEL_ROW(2),
-        STORE_KERNEL_ROW(3),
-        STORE_KERNEL_ROW(4),
-        STORE_KERNEL_ROW(5),
-        STORE_KERNEL_ROW(6),
-        STORE_KERNEL_ROW(7),
-    };
-
-  if (mb != 0 && nb != 0)
-    (*table[mb-1][nb-1]) (a, ma, na, b, c, ldc);
-}
-
 // 2d convolution with a matrix kernel.
 template <class T, class R>
 static void 
 convolve_2d (const T *a, octave_idx_type ma, octave_idx_type na,
              const R *b, octave_idx_type mb, octave_idx_type nb,
-             T *c)
-{
-  octave_idx_type ldc = ma + mb - 1;
-  if (mb <= MAX_KERNEL_SIZE_M && nb <= MAX_KERNEL_SIZE_N)
-    {
-      // Call kernel directly on b.
-      convolve_2d_axpy (a, ma, na, b, mb, nb, c, ldc);
-    }
-  else
-    {
-      // Split b to blocks.
-      OCTAVE_LOCAL_BUFFER (R, b1, MAX_KERNEL_SIZE_M*MAX_KERNEL_SIZE_N);
-      for (octave_idx_type jb = 0; jb < nb; jb += MAX_KERNEL_SIZE_N)
-        {
-          octave_idx_type nb1 = std::min (nb - jb, MAX_KERNEL_SIZE_N);
-          for (octave_idx_type ib = 0; ib < mb; ib += MAX_KERNEL_SIZE_M)
-            {
-              octave_idx_type mb1 = std::min (mb - ib, MAX_KERNEL_SIZE_M);
+             T *c, bool inner);
 
-              // Copy block to buffer.
-              R *bf = b1;
-              for (octave_idx_type j = jb; j < jb+nb1; j++)
-                for (octave_idx_type i = ib; i < ib+mb1; i++)
-                  *bf++ = b[j*mb + i];
+// Forward instances to our Fortran implementations.
+#define FORWARD_IMPL(T,R,f,F) \
+extern "C" \
+F77_RET_T \
+F77_FUNC (f##conv2o, F##CONV2O) (const octave_idx_type&, const octave_idx_type&, \
+                                 const T*, \
+                                 const octave_idx_type&, const octave_idx_type&, \
+                                 const R*, T *); \
+\
+extern "C" \
+F77_RET_T \
+F77_FUNC (f##conv2i, F##CONV2I) (const octave_idx_type&, const octave_idx_type&, \
+                                 const T*, \
+                                 const octave_idx_type&, const octave_idx_type&, \
+                                 const R*, T *); \
+\
+template <> void \
+convolve_2d<T, R> (const T *a, octave_idx_type ma, octave_idx_type na, \
+                   const R *b, octave_idx_type mb, octave_idx_type nb, \
+                   T *c, bool inner) \
+{ \
+  if (inner) \
+    F77_XFCN (f##conv2i, F##CONV2I, (ma, na, a, mb, nb, b, c)); \
+  else \
+    F77_XFCN (f##conv2o, F##CONV2O, (ma, na, a, mb, nb, b, c)); \
+}
 
-              // Call kernel.
-              convolve_2d_axpy (a, ma, na, b1, mb1, nb1, c + ldc*jb + ib, ldc);
-            }
-        }
-    }
-
-}
+FORWARD_IMPL (double, double, d, D)
+FORWARD_IMPL (float, float, s, S)
+FORWARD_IMPL (Complex, Complex, z, Z)
+FORWARD_IMPL (FloatComplex, FloatComplex, c, C)
+FORWARD_IMPL (Complex, double, zd, ZD)
+FORWARD_IMPL (FloatComplex, float, cs, CS)
 
 template <class T, class R>
 void convolve_nd (const T *a, const dim_vector& ad, const dim_vector& acd,
                   const R *b, const dim_vector& bd, const dim_vector& bcd,
-                  T *c, const dim_vector& ccd, int nd)
+                  T *c, const dim_vector& ccd, int nd, bool inner)
 {
   if (nd == 2)
-    convolve_2d<T, R> (a, ad(0), ad(1), b, bd(0), bd(1), c);
+    convolve_2d<T, R> (a, ad(0), ad(1), b, bd(0), bd(1), c, inner);
   else
     {
       octave_idx_type ma = acd(nd-2), na = ad(nd-1), mb = bcd(nd-2), nb = bd(nd-1);
       octave_idx_type ldc = ccd(nd-2);
-      for (octave_idx_type jb = 0; jb < nb; jb++)
+      if (inner)
+        {
+          for (octave_idx_type ja = 0; ja < na - nb + 1; ja++)
+            for (octave_idx_type jb = 0; jb < nb; jb++)
+              convolve_nd<T, R> (a + ma*(ja + jb), ad, acd, b + mb*jb, bd, bcd,
+                                 c + ldc*ja, ccd, nd-1, inner);
+        }
+      else
         {
           for (octave_idx_type ja = 0; ja < na; ja++)
-            convolve_nd<T, R> (a + ma*ja, ad, acd, b + mb*jb, bd, bcd, 
-                               c + ldc*(ja+jb), ccd, nd-1);
+            for (octave_idx_type jb = 0; jb < nb; jb++)
+              convolve_nd<T, R> (a + ma*ja, ad, acd, b + mb*jb, bd, bcd,
+                                 c + ldc*(ja+jb), ccd, nd-1, inner);
         }
     }
 }
@@ -172,35 +116,27 @@
   dim_vector cdims = dim_vector::alloc (nd);
 
   for (int i = 0; i < nd; i++)
-    cdims(i) = std::max (adims(i) + bdims(i) - 1, 0);
+    {
+      if (ct == convn_valid)
+        cdims(i) = std::max (adims(i) - bdims(i) + 1, 0);
+      else
+        cdims(i) = std::max (adims(i) + bdims(i) - 1, 0);
+    }
 
   MArray<T> c (cdims, T());
 
   convolve_nd<T, R> (a.fortran_vec (), adims, adims.cumulative (),
                      b.fortran_vec (), bdims, bdims.cumulative (),
-                     c.fortran_vec (), cdims.cumulative (), nd);
+                     c.fortran_vec (), cdims.cumulative (), nd, ct == convn_valid);
 
-  // Pick the relevant part.
-  Array<idx_vector> sidx (nd, 1);
-
-  switch (ct)
+  if (ct == convn_same)
     {
-    case convn_valid:
-        {
-          for (int i = 0; i < nd; i++)
-            sidx(i) = idx_vector (bdims(i)-1, adims(i));
-          c = c.index (sidx);
-          break;
-        }
-    case convn_same:
-        {
-          for (int i = 0; i < nd; i++)
-            sidx(i) = idx_vector::make_range ((bdims(i)-1)/2, 1, adims(i));
-          c = c.index (sidx);
-          break;
-        }
-    default:
-      break;
+      // Pick the relevant part.
+      Array<idx_vector> sidx (nd, 1);
+
+      for (int i = 0; i < nd; i++)
+        sidx(i) = idx_vector::make_range ((bdims(i)-1)/2, 1, adims(i));
+      c = c.index (sidx);
     }
 
   return c;