# HG changeset patch # User Jaroslav Hajek # Date 1267607592 -3600 # Node ID 5af0b4bb384d14efd7e449fdb7caae553cfd2f82 # Parent eb9b2674501eefeea61f08c68c655c23fe385df4 rewrite convn optimizations based on xAXPY diff --git a/libcruft/ChangeLog b/libcruft/ChangeLog --- a/libcruft/ChangeLog +++ b/libcruft/ChangeLog @@ -1,3 +1,10 @@ +2010-03-03 Jaroslav Hajek + + * blas-xtra/cconv2.f, blas-xtra/csconv2.f, blas-xtra/dconv2.f, + blas-xtra/sconv2.f, blas-xtra/zconv2.f, blas-xtra/zdconv2.f: + New sources. + * blas-xtra/module.mk: Add them here. + 2010-03-02 John W. Eaton * misc/cquit.c (octave_restore_signal_mask): Assume we have diff --git a/libcruft/blas-xtra/cconv2.f b/libcruft/blas-xtra/cconv2.f new file mode 100644 --- /dev/null +++ b/libcruft/blas-xtra/cconv2.f @@ -0,0 +1,77 @@ +c Copyright (C) 2010 VZLU Prague, a.s., Czech Republic +c +c Author: Jaroslav Hajek +c +c This file is part of Octave. +c +c Octave is free software; you can redistribute it and/or modify +c it under the terms of the GNU General Public License as published by +c the Free Software Foundation; either version 3 of the License, or +c (at your option) any later version. +c +c This program is distributed in the hope that it will be useful, +c but WITHOUT ANY WARRANTY; without even the implied warranty of +c MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +c GNU General Public License for more details. +c +c You should have received a copy of the GNU General Public License +c along with this software; see the file COPYING. If not, see +c . +c + subroutine cconv2o(ma,na,a,mb,nb,b,c) +c purpose: a 2-dimensional outer additive convolution. +c equivalent to the following: +c for i = 1:ma +c for j = 1:na +c c(i:i+mb-1,j:j+mb-1) += a(i,j)*b +c endfor +c endfor +c arguments: +c ma,na (in) dimensions of a +c a (in) 1st matrix +c mb,nb (in) dimensions of b +c b (in) 2nd matrix +c c (inout) accumulator matrix, size (ma+mb-1, na+nb-1) +c + integer ma,na,mb,nb + complex a(ma,na),b(mb,nb) + complex c(ma+mb-1,na+nb-1) + integer i,j,k + external caxpy + do k = 1,na + do j = 1,nb + do i = 1,mb + call caxpy(ma,b(i,j),a(1,k),1,c(i,j+k-1),1) + end do + end do + end do + end subroutine + + subroutine cconv2i(ma,na,a,mb,nb,b,c) +c purpose: a 2-dimensional inner additive convolution. +c equivalent to the following: +c for i = 1:ma-mb+1 +c for j = 1:na-nb+1 +c c(i,j) = sum (sum (a(i:i+mb-1,j:j+nb-1) .* b)) +c endfor +c endfor +c arguments: +c ma,na (in) dimensions of a +c a (in) 1st matrix +c mb,nb (in) dimensions of b +c b (in) 2nd matrix +c c (inout) accumulator matrix, size (ma+mb-1, na+nb-1) +c + integer ma,na,mb,nb + complex a(ma,na),b(mb,nb) + complex c(ma-mb+1,na-nb+1) + integer i,j,k + external caxpy + do k = 1,na-nb+1 + do j = 1,nb + do i = 1,mb + call caxpy(ma-mb+1,b(i,j),a(i,k+j-1),1,c(1,k),1) + end do + end do + end do + end subroutine diff --git a/libcruft/blas-xtra/csconv2.f b/libcruft/blas-xtra/csconv2.f new file mode 100644 --- /dev/null +++ b/libcruft/blas-xtra/csconv2.f @@ -0,0 +1,83 @@ +c Copyright (C) 2010 VZLU Prague, a.s., Czech Republic +c +c Author: Jaroslav Hajek +c +c This file is part of Octave. +c +c Octave is free software; you can redistribute it and/or modify +c it under the terms of the GNU General Public License as published by +c the Free Software Foundation; either version 3 of the License, or +c (at your option) any later version. +c +c This program is distributed in the hope that it will be useful, +c but WITHOUT ANY WARRANTY; without even the implied warranty of +c MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +c GNU General Public License for more details. +c +c You should have received a copy of the GNU General Public License +c along with this software; see the file COPYING. If not, see +c . +c + subroutine csconv2o(ma,na,a,mb,nb,b,c) +c purpose: a 2-dimensional outer additive convolution. +c equivalent to the following: +c for i = 1:ma +c for j = 1:na +c c(i:i+mb-1,j:j+mb-1) += a(i,j)*b +c endfor +c endfor +c arguments: +c ma,na (in) dimensions of a +c a (in) 1st matrix +c mb,nb (in) dimensions of b +c b (in) 2nd matrix +c c (inout) accumulator matrix, size (ma+mb-1, na+nb-1) +c + integer ma,na,mb,nb + complex a(ma,na) + real b(mb,nb) + complex c(ma+mb-1,na+nb-1) + complex btmp + integer i,j,k + external caxpy + do k = 1,na + do j = 1,nb + do i = 1,mb + btmp = b(i,j) + call caxpy(ma,btmp,a(1,k),1,c(i,j+k-1),1) + end do + end do + end do + end subroutine + + subroutine csconv2i(ma,na,a,mb,nb,b,c) +c purpose: a 2-dimensional inner additive convolution. +c equivalent to the following: +c for i = 1:ma-mb+1 +c for j = 1:na-nb+1 +c c(i,j) = sum (sum (a(i:i+mb-1,j:j+nb-1) .* b)) +c endfor +c endfor +c arguments: +c ma,na (in) dimensions of a +c a (in) 1st matrix +c mb,nb (in) dimensions of b +c b (in) 2nd matrix +c c (inout) accumulator matrix, size (ma+mb-1, na+nb-1) +c + integer ma,na,mb,nb + complex a(ma,na) + real b(mb,nb) + complex c(ma-mb+1,na-nb+1) + complex btmp + integer i,j,k + external caxpy + do k = 1,na-nb+1 + do j = 1,nb + do i = 1,mb + btmp = b(i,j) + call caxpy(ma-mb+1,btmp,a(i,k+j-1),1,c(1,k),1) + end do + end do + end do + end subroutine diff --git a/libcruft/blas-xtra/dconv2.f b/libcruft/blas-xtra/dconv2.f new file mode 100644 --- /dev/null +++ b/libcruft/blas-xtra/dconv2.f @@ -0,0 +1,77 @@ +c Copyright (C) 2010 VZLU Prague, a.s., Czech Republic +c +c Author: Jaroslav Hajek +c +c This file is part of Octave. +c +c Octave is free software; you can redistribute it and/or modify +c it under the terms of the GNU General Public License as published by +c the Free Software Foundation; either version 3 of the License, or +c (at your option) any later version. +c +c This program is distributed in the hope that it will be useful, +c but WITHOUT ANY WARRANTY; without even the implied warranty of +c MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +c GNU General Public License for more details. +c +c You should have received a copy of the GNU General Public License +c along with this software; see the file COPYING. If not, see +c . +c + subroutine dconv2o(ma,na,a,mb,nb,b,c) +c purpose: a 2-dimensional outer additive convolution. +c equivalent to the following: +c for i = 1:ma +c for j = 1:na +c c(i:i+mb-1,j:j+mb-1) += a(i,j)*b +c endfor +c endfor +c arguments: +c ma,na (in) dimensions of a +c a (in) 1st matrix +c mb,nb (in) dimensions of b +c b (in) 2nd matrix +c c (inout) accumulator matrix, size (ma+mb-1, na+nb-1) +c + integer ma,na,mb,nb + double precision a(ma,na),b(mb,nb) + double precision c(ma+mb-1,na+nb-1) + integer i,j,k + external daxpy + do k = 1,na + do j = 1,nb + do i = 1,mb + call daxpy(ma,b(i,j),a(1,k),1,c(i,j+k-1),1) + end do + end do + end do + end subroutine + + subroutine dconv2i(ma,na,a,mb,nb,b,c) +c purpose: a 2-dimensional inner additive convolution. +c equivalent to the following: +c for i = 1:ma-mb+1 +c for j = 1:na-nb+1 +c c(i,j) = sum (sum (a(i:i+mb-1,j:j+nb-1) .* b)) +c endfor +c endfor +c arguments: +c ma,na (in) dimensions of a +c a (in) 1st matrix +c mb,nb (in) dimensions of b +c b (in) 2nd matrix +c c (inout) accumulator matrix, size (ma+mb-1, na+nb-1) +c + integer ma,na,mb,nb + double precision a(ma,na),b(mb,nb) + double precision c(ma-mb+1,na-nb+1) + integer i,j,k + external daxpy + do k = 1,na-nb+1 + do j = 1,nb + do i = 1,mb + call daxpy(ma-mb+1,b(i,j),a(i,k+j-1),1,c(1,k),1) + end do + end do + end do + end subroutine diff --git a/libcruft/blas-xtra/module.mk b/libcruft/blas-xtra/module.mk --- a/libcruft/blas-xtra/module.mk +++ b/libcruft/blas-xtra/module.mk @@ -19,4 +19,10 @@ blas-xtra/xscnrm2.f \ blas-xtra/xcdotc.f \ blas-xtra/xcdotu.f \ - blas-xtra/xerbla.f + blas-xtra/xerbla.f \ + blas-xtra/cconv2.f \ + blas-xtra/csconv2.f \ + blas-xtra/dconv2.f \ + blas-xtra/sconv2.f \ + blas-xtra/zconv2.f \ + blas-xtra/zdconv2.f diff --git a/libcruft/blas-xtra/sconv2.f b/libcruft/blas-xtra/sconv2.f new file mode 100644 --- /dev/null +++ b/libcruft/blas-xtra/sconv2.f @@ -0,0 +1,77 @@ +c Copyright (C) 2010 VZLU Prague, a.s., Czech Republic +c +c Author: Jaroslav Hajek +c +c This file is part of Octave. +c +c Octave is free software; you can redistribute it and/or modify +c it under the terms of the GNU General Public License as published by +c the Free Software Foundation; either version 3 of the License, or +c (at your option) any later version. +c +c This program is distributed in the hope that it will be useful, +c but WITHOUT ANY WARRANTY; without even the implied warranty of +c MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +c GNU General Public License for more details. +c +c You should have received a copy of the GNU General Public License +c along with this software; see the file COPYING. If not, see +c . +c + subroutine sconv2o(ma,na,a,mb,nb,b,c) +c purpose: a 2-dimensional outer additive convolution. +c equivalent to the following: +c for i = 1:ma +c for j = 1:na +c c(i:i+mb-1,j:j+mb-1) += a(i,j)*b +c endfor +c endfor +c arguments: +c ma,na (in) dimensions of a +c a (in) 1st matrix +c mb,nb (in) dimensions of b +c b (in) 2nd matrix +c c (inout) accumulator matrix, size (ma+mb-1, na+nb-1) +c + integer ma,na,mb,nb + real a(ma,na),b(mb,nb) + real c(ma+mb-1,na+nb-1) + integer i,j,k + external saxpy + do k = 1,na + do j = 1,nb + do i = 1,mb + call saxpy(ma,b(i,j),a(1,k),1,c(i,j+k-1),1) + end do + end do + end do + end subroutine + + subroutine sconv2i(ma,na,a,mb,nb,b,c) +c purpose: a 2-dimensional inner additive convolution. +c equivalent to the following: +c for i = 1:ma-mb+1 +c for j = 1:na-nb+1 +c c(i,j) = sum (sum (a(i:i+mb-1,j:j+nb-1) .* b)) +c endfor +c endfor +c arguments: +c ma,na (in) dimensions of a +c a (in) 1st matrix +c mb,nb (in) dimensions of b +c b (in) 2nd matrix +c c (inout) accumulator matrix, size (ma+mb-1, na+nb-1) +c + integer ma,na,mb,nb + real a(ma,na),b(mb,nb) + real c(ma-mb+1,na-nb+1) + integer i,j,k + external saxpy + do k = 1,na-nb+1 + do j = 1,nb + do i = 1,mb + call saxpy(ma-mb+1,b(i,j),a(i,k+j-1),1,c(1,k),1) + end do + end do + end do + end subroutine diff --git a/libcruft/blas-xtra/zconv2.f b/libcruft/blas-xtra/zconv2.f new file mode 100644 --- /dev/null +++ b/libcruft/blas-xtra/zconv2.f @@ -0,0 +1,77 @@ +c Copyright (C) 2010 VZLU Prague, a.s., Czech Republic +c +c Author: Jaroslav Hajek +c +c This file is part of Octave. +c +c Octave is free software; you can redistribute it and/or modify +c it under the terms of the GNU General Public License as published by +c the Free Software Foundation; either version 3 of the License, or +c (at your option) any later version. +c +c This program is distributed in the hope that it will be useful, +c but WITHOUT ANY WARRANTY; without even the implied warranty of +c MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +c GNU General Public License for more details. +c +c You should have received a copy of the GNU General Public License +c along with this software; see the file COPYING. If not, see +c . +c + subroutine zconv2o(ma,na,a,mb,nb,b,c) +c purpose: a 2-dimensional outer additive convolution. +c equivalent to the following: +c for i = 1:ma +c for j = 1:na +c c(i:i+mb-1,j:j+mb-1) += a(i,j)*b +c endfor +c endfor +c arguments: +c ma,na (in) dimensions of a +c a (in) 1st matrix +c mb,nb (in) dimensions of b +c b (in) 2nd matrix +c c (inout) accumulator matrix, size (ma+mb-1, na+nb-1) +c + integer ma,na,mb,nb + double complex a(ma,na),b(mb,nb) + double complex c(ma+mb-1,na+nb-1) + integer i,j,k + external zaxpy + do k = 1,na + do j = 1,nb + do i = 1,mb + call zaxpy(ma,b(i,j),a(1,k),1,c(i,j+k-1),1) + end do + end do + end do + end subroutine + + subroutine zconv2i(ma,na,a,mb,nb,b,c) +c purpose: a 2-dimensional inner additive convolution. +c equivalent to the following: +c for i = 1:ma-mb+1 +c for j = 1:na-nb+1 +c c(i,j) = sum (sum (a(i:i+mb-1,j:j+nb-1) .* b)) +c endfor +c endfor +c arguments: +c ma,na (in) dimensions of a +c a (in) 1st matrix +c mb,nb (in) dimensions of b +c b (in) 2nd matrix +c c (inout) accumulator matrix, size (ma+mb-1, na+nb-1) +c + integer ma,na,mb,nb + double complex a(ma,na),b(mb,nb) + double complex c(ma-mb+1,na-nb+1) + integer i,j,k + external zaxpy + do k = 1,na-nb+1 + do j = 1,nb + do i = 1,mb + call zaxpy(ma-mb+1,b(i,j),a(i,k+j-1),1,c(1,k),1) + end do + end do + end do + end subroutine diff --git a/libcruft/blas-xtra/zdconv2.f b/libcruft/blas-xtra/zdconv2.f new file mode 100644 --- /dev/null +++ b/libcruft/blas-xtra/zdconv2.f @@ -0,0 +1,83 @@ +c Copyright (C) 2010 VZLU Prague, a.s., Czech Republic +c +c Author: Jaroslav Hajek +c +c This file is part of Octave. +c +c Octave is free software; you can redistribute it and/or modify +c it under the terms of the GNU General Public License as published by +c the Free Software Foundation; either version 3 of the License, or +c (at your option) any later version. +c +c This program is distributed in the hope that it will be useful, +c but WITHOUT ANY WARRANTY; without even the implied warranty of +c MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +c GNU General Public License for more details. +c +c You should have received a copy of the GNU General Public License +c along with this software; see the file COPYING. If not, see +c . +c + subroutine zdconv2o(ma,na,a,mb,nb,b,c) +c purpose: a 2-dimensional outer additive convolution. +c equivalent to the following: +c for i = 1:ma +c for j = 1:na +c c(i:i+mb-1,j:j+mb-1) += a(i,j)*b +c endfor +c endfor +c arguments: +c ma,na (in) dimensions of a +c a (in) 1st matrix +c mb,nb (in) dimensions of b +c b (in) 2nd matrix +c c (inout) accumulator matrix, size (ma+mb-1, na+nb-1) +c + integer ma,na,mb,nb + double complex a(ma,na) + double precision b(mb,nb) + double complex c(ma+mb-1,na+nb-1) + double complex btmp + integer i,j,k + external zaxpy + do k = 1,na + do j = 1,nb + do i = 1,mb + btmp = b(i,j) + call zaxpy(ma,btmp,a(1,k),1,c(i,j+k-1),1) + end do + end do + end do + end subroutine + + subroutine zdconv2i(ma,na,a,mb,nb,b,c) +c purpose: a 2-dimensional inner additive convolution. +c equivalent to the following: +c for i = 1:ma-mb+1 +c for j = 1:na-nb+1 +c c(i,j) = sum (sum (a(i:i+mb-1,j:j+nb-1) .* b)) +c endfor +c endfor +c arguments: +c ma,na (in) dimensions of a +c a (in) 1st matrix +c mb,nb (in) dimensions of b +c b (in) 2nd matrix +c c (inout) accumulator matrix, size (ma+mb-1, na+nb-1) +c + integer ma,na,mb,nb + double complex a(ma,na) + double precision b(mb,nb) + double complex c(ma-mb+1,na-nb+1) + double complex btmp + integer i,j,k + external zaxpy + do k = 1,na-nb+1 + do j = 1,nb + do i = 1,mb + btmp = b(i,j) + call zaxpy(ma-mb+1,btmp,a(i,k+j-1),1,c(1,k),1) + end do + end do + end do + end subroutine diff --git a/liboctave/ChangeLog b/liboctave/ChangeLog --- a/liboctave/ChangeLog +++ b/liboctave/ChangeLog @@ -1,3 +1,10 @@ +2010-03-03 Jaroslav Hajek + + * oct-convn.cc (convolve_2d_axpy_kernel, convolve_2d_axpy): Remove. + (convolve_2d): Forward to Fortran implementations, add inner flag. + (convolve_nd): Handle inner-convolution case. + (convolve): Ditto. + 2010-03-02 Jaroslav Hajek * oct-convn.h, oct-convn.cc: New sources. diff --git a/liboctave/oct-convn.cc b/liboctave/oct-convn.cc --- a/liboctave/oct-convn.cc +++ b/liboctave/oct-convn.cc @@ -26,133 +26,77 @@ #include #include + +#include "f77-fcn.h" + #include "oct-convn.h" #include "oct-locbuf.h" - -// FIXME: Only the axpy-form accumulation is used. This is natural for outer -// convolution as it requires no boundary treatment. -// This typically requires one more memory store per operation, but as the -// memory access pattern is equivalent (switching ro and rw parts), I wouldn't -// expect a significant difference. cf. for instance sum(), where row-wise sum -// (axpy-based) is no slower than column-wise (dot-based). -// It would be nice, however, if the inner convolution was computed directly by -// dot-based accumulation. - -// FIXME: Specifying the kernel as outer product should probably get special -// treatment. - -// All kernels smaller than 7x5 get specialized code. -#define MAX_KERNEL_SIZE_M 7 -#define MAX_KERNEL_SIZE_N 5 - -template -static void -convolve_2d_kernel_axpy (const T *a, octave_idx_type ma, octave_idx_type na, - const R *b, T *c, octave_idx_type ldc) -{ - for (octave_idx_type ja = 0; ja < na; ja++) - for (octave_idx_type ia = 0; ia < ma; ia++) - { - T aij = a[ma*ja + ia]; - // The following double loop is a candidate for complete unrolling. - for (int jb = 0; jb < nb; jb++) - for (int ib = 0; ib < mb; ib++) - c[(ja+jb)*ldc + (ia+ib)] += aij * b[mb*jb+ib]; - } -} - -// Kernel dispatcher. -template -static void -convolve_2d_axpy (const T *a, octave_idx_type ma, octave_idx_type na, - const R *b, octave_idx_type mb, octave_idx_type nb, - T *c, octave_idx_type ldc) -{ - // Table of kernels. - static void (*table[MAX_KERNEL_SIZE_M][MAX_KERNEL_SIZE_N]) - (const T *, octave_idx_type, octave_idx_type, const R *, T *, octave_idx_type) - = { - // This must be repeated MAX_KERNEL_SIZE-times. I do not see a way to - // automate this. -#define STORE_KERNEL_POINTER(M,N) \ - convolve_2d_kernel_axpy -#define STORE_KERNEL_ROW(M) \ - { \ - STORE_KERNEL_POINTER(M,1), \ - STORE_KERNEL_POINTER(M,2), \ - STORE_KERNEL_POINTER(M,3), \ - STORE_KERNEL_POINTER(M,4), \ - STORE_KERNEL_POINTER(M,5), \ - } - - STORE_KERNEL_ROW(1), - STORE_KERNEL_ROW(2), - STORE_KERNEL_ROW(3), - STORE_KERNEL_ROW(4), - STORE_KERNEL_ROW(5), - STORE_KERNEL_ROW(6), - STORE_KERNEL_ROW(7), - }; - - if (mb != 0 && nb != 0) - (*table[mb-1][nb-1]) (a, ma, na, b, c, ldc); -} - // 2d convolution with a matrix kernel. template static void convolve_2d (const T *a, octave_idx_type ma, octave_idx_type na, const R *b, octave_idx_type mb, octave_idx_type nb, - T *c) -{ - octave_idx_type ldc = ma + mb - 1; - if (mb <= MAX_KERNEL_SIZE_M && nb <= MAX_KERNEL_SIZE_N) - { - // Call kernel directly on b. - convolve_2d_axpy (a, ma, na, b, mb, nb, c, ldc); - } - else - { - // Split b to blocks. - OCTAVE_LOCAL_BUFFER (R, b1, MAX_KERNEL_SIZE_M*MAX_KERNEL_SIZE_N); - for (octave_idx_type jb = 0; jb < nb; jb += MAX_KERNEL_SIZE_N) - { - octave_idx_type nb1 = std::min (nb - jb, MAX_KERNEL_SIZE_N); - for (octave_idx_type ib = 0; ib < mb; ib += MAX_KERNEL_SIZE_M) - { - octave_idx_type mb1 = std::min (mb - ib, MAX_KERNEL_SIZE_M); + T *c, bool inner); - // Copy block to buffer. - R *bf = b1; - for (octave_idx_type j = jb; j < jb+nb1; j++) - for (octave_idx_type i = ib; i < ib+mb1; i++) - *bf++ = b[j*mb + i]; +// Forward instances to our Fortran implementations. +#define FORWARD_IMPL(T,R,f,F) \ +extern "C" \ +F77_RET_T \ +F77_FUNC (f##conv2o, F##CONV2O) (const octave_idx_type&, const octave_idx_type&, \ + const T*, \ + const octave_idx_type&, const octave_idx_type&, \ + const R*, T *); \ +\ +extern "C" \ +F77_RET_T \ +F77_FUNC (f##conv2i, F##CONV2I) (const octave_idx_type&, const octave_idx_type&, \ + const T*, \ + const octave_idx_type&, const octave_idx_type&, \ + const R*, T *); \ +\ +template <> void \ +convolve_2d (const T *a, octave_idx_type ma, octave_idx_type na, \ + const R *b, octave_idx_type mb, octave_idx_type nb, \ + T *c, bool inner) \ +{ \ + if (inner) \ + F77_XFCN (f##conv2i, F##CONV2I, (ma, na, a, mb, nb, b, c)); \ + else \ + F77_XFCN (f##conv2o, F##CONV2O, (ma, na, a, mb, nb, b, c)); \ +} - // Call kernel. - convolve_2d_axpy (a, ma, na, b1, mb1, nb1, c + ldc*jb + ib, ldc); - } - } - } - -} +FORWARD_IMPL (double, double, d, D) +FORWARD_IMPL (float, float, s, S) +FORWARD_IMPL (Complex, Complex, z, Z) +FORWARD_IMPL (FloatComplex, FloatComplex, c, C) +FORWARD_IMPL (Complex, double, zd, ZD) +FORWARD_IMPL (FloatComplex, float, cs, CS) template void convolve_nd (const T *a, const dim_vector& ad, const dim_vector& acd, const R *b, const dim_vector& bd, const dim_vector& bcd, - T *c, const dim_vector& ccd, int nd) + T *c, const dim_vector& ccd, int nd, bool inner) { if (nd == 2) - convolve_2d (a, ad(0), ad(1), b, bd(0), bd(1), c); + convolve_2d (a, ad(0), ad(1), b, bd(0), bd(1), c, inner); else { octave_idx_type ma = acd(nd-2), na = ad(nd-1), mb = bcd(nd-2), nb = bd(nd-1); octave_idx_type ldc = ccd(nd-2); - for (octave_idx_type jb = 0; jb < nb; jb++) + if (inner) + { + for (octave_idx_type ja = 0; ja < na - nb + 1; ja++) + for (octave_idx_type jb = 0; jb < nb; jb++) + convolve_nd (a + ma*(ja + jb), ad, acd, b + mb*jb, bd, bcd, + c + ldc*ja, ccd, nd-1, inner); + } + else { for (octave_idx_type ja = 0; ja < na; ja++) - convolve_nd (a + ma*ja, ad, acd, b + mb*jb, bd, bcd, - c + ldc*(ja+jb), ccd, nd-1); + for (octave_idx_type jb = 0; jb < nb; jb++) + convolve_nd (a + ma*ja, ad, acd, b + mb*jb, bd, bcd, + c + ldc*(ja+jb), ccd, nd-1, inner); } } } @@ -172,35 +116,27 @@ dim_vector cdims = dim_vector::alloc (nd); for (int i = 0; i < nd; i++) - cdims(i) = std::max (adims(i) + bdims(i) - 1, 0); + { + if (ct == convn_valid) + cdims(i) = std::max (adims(i) - bdims(i) + 1, 0); + else + cdims(i) = std::max (adims(i) + bdims(i) - 1, 0); + } MArray c (cdims, T()); convolve_nd (a.fortran_vec (), adims, adims.cumulative (), b.fortran_vec (), bdims, bdims.cumulative (), - c.fortran_vec (), cdims.cumulative (), nd); + c.fortran_vec (), cdims.cumulative (), nd, ct == convn_valid); - // Pick the relevant part. - Array sidx (nd, 1); - - switch (ct) + if (ct == convn_same) { - case convn_valid: - { - for (int i = 0; i < nd; i++) - sidx(i) = idx_vector (bdims(i)-1, adims(i)); - c = c.index (sidx); - break; - } - case convn_same: - { - for (int i = 0; i < nd; i++) - sidx(i) = idx_vector::make_range ((bdims(i)-1)/2, 1, adims(i)); - c = c.index (sidx); - break; - } - default: - break; + // Pick the relevant part. + Array sidx (nd, 1); + + for (int i = 0; i < nd; i++) + sidx(i) = idx_vector::make_range ((bdims(i)-1)/2, 1, adims(i)); + c = c.index (sidx); } return c;