Mercurial > hg > octave-avbm
changeset 9876:21d81d06b221
cache-aligned loop for rowwise dot
author | Jaroslav Hajek <highegg@gmail.com> |
---|---|
date | Thu, 26 Nov 2009 21:06:21 +0100 |
parents | 47f36dd27203 |
children | cac3b4e5035b |
files | libcruft/ChangeLog libcruft/blas-xtra/cdotc3.f libcruft/blas-xtra/ddot3.f libcruft/blas-xtra/sdot3.f libcruft/blas-xtra/zdotc3.f src/ChangeLog src/DLD-FUNCTIONS/dot.cc |
diffstat | 7 files changed, 42 insertions(+), 42 deletions(-) [+] |
line wrap: on
line diff
--- a/libcruft/ChangeLog +++ b/libcruft/ChangeLog @@ -1,3 +1,10 @@ +2009-11-26 Jaroslav Hajek <highegg@gmail.com> + + * blas-xtra/sdot3.f: Use nested cache-aligned loop for general case. + * blas-xtra/ddot3.f: Ditto. + * blas-xtra/cdotc3.f: Ditto. + * blas-xtra/zdotc3.f: Ditto. + 2009-11-26 Jaroslav Hajek <highegg@gmail.com> * blas-xtra/sdot3.f: New source.
--- a/libcruft/blas-xtra/cdotc3.f +++ b/libcruft/blas-xtra/cdotc3.f @@ -29,8 +29,6 @@ complex a(m,k,n),b(m,k,n) complex c(m,n) - integer kk - parameter (kk = 64) complex cdotc external cdotc @@ -38,22 +36,21 @@ if (m <= 0 .or. n <= 0) return if (m == 1) then -c the column-major case +c the column-major case. do j = 1,n c(1,j) = cdotc(k,a(1,1,j),1,b(1,1,j),1) end do else -c here the product is row-wise, but we'd still like to use BLAS's dot -c for its usually better accuracy. let's do a compromise and split the -c middle dimension. +c We prefer performance here, because that's what we generally +c do by default in reduction functions. Besides, the accuracy +c of xDOT is questionable. Hence, do a cache-aligned nested loop. do j = 1,n - l = mod(k,kk) do i = 1,m - c(i,j) = ddot(l,a(i,1,j),m,b(i,1,j),m) + c(i,j) = 0e0 end do - do l = l+1,k,kk + do l = 1,k do i = 1,m - c(i,j) = c(i,j) + cdotc(kk,a(i,l,j),m,b(i,l,j),m) + c(i,j) = c(i,j) + conjg(a(i,l,j))*b(i,l,j) end do end do end do
--- a/libcruft/blas-xtra/ddot3.f +++ b/libcruft/blas-xtra/ddot3.f @@ -29,8 +29,6 @@ double precision a(m,k,n),b(m,k,n) double precision c(m,n) - integer kk - parameter (kk = 64) double precision ddot external ddot @@ -39,22 +37,21 @@ if (m <= 0 .or. n <= 0) return if (m == 1) then -c the column-major case +c the column-major case. do j = 1,n c(1,j) = ddot(k,a(1,1,j),1,b(1,1,j),1) end do else -c here the product is row-wise, but we'd still like to use BLAS's dot -c for its usually better accuracy. let's do a compromise and split the -c middle dimension. +c We prefer performance here, because that's what we generally +c do by default in reduction functions. Besides, the accuracy +c of xDOT is questionable. Hence, do a cache-aligned nested loop. do j = 1,n - l = mod(k,kk) do i = 1,m - c(i,j) = ddot(l,a(i,1,j),m,b(i,1,j),m) + c(i,j) = 0d0 end do - do l = l+1,k,kk + do l = 1,k do i = 1,m - c(i,j) = c(i,j) + ddot(kk,a(i,l,j),m,b(i,l,j),m) + c(i,j) = c(i,j) + a(i,l,j)*b(i,l,j) end do end do end do
--- a/libcruft/blas-xtra/sdot3.f +++ b/libcruft/blas-xtra/sdot3.f @@ -29,8 +29,6 @@ real a(m,k,n),b(m,k,n) real c(m,n) - integer kk - parameter (kk = 64) real sdot external sdot @@ -38,22 +36,21 @@ if (m <= 0 .or. n <= 0) return if (m == 1) then -c the column-major case +c the column-major case. do j = 1,n c(1,j) = sdot(k,a(1,1,j),1,b(1,1,j),1) end do else -c here the product is row-wise, but we'd still like to use BLAS's dot -c for its usually better accuracy. let's do a compromise and split the -c middle dimension. +c We prefer performance here, because that's what we generally +c do by default in reduction functions. Besides, the accuracy +c of xDOT is questionable. Hence, do a cache-aligned nested loop. do j = 1,n - l = mod(k,kk) do i = 1,m - c(i,j) = ddot(l,a(i,1,j),m,b(i,1,j),m) + c(i,j) = 0d0 end do - do l = l+1,k,kk + do l = 1,k do i = 1,m - c(i,j) = c(i,j) + sdot(kk,a(i,l,j),m,b(i,l,j),m) + c(i,j) = c(i,j) + a(i,l,j)*b(i,l,j) end do end do end do
--- a/libcruft/blas-xtra/zdotc3.f +++ b/libcruft/blas-xtra/zdotc3.f @@ -29,8 +29,6 @@ double complex a(m,k,n),b(m,k,n) double complex c(m,n) - integer kk - parameter (kk = 64) double complex zdotc external zdotc @@ -38,22 +36,21 @@ if (m <= 0 .or. n <= 0) return if (m == 1) then -c the column-major case +c the column-major case. do j = 1,n c(1,j) = zdotc(k,a(1,1,j),1,b(1,1,j),1) end do else -c here the product is row-wise, but we'd still like to use BLAS's dot -c for its usually better accuracy. let's do a compromise and split the -c middle dimension. +c We prefer performance here, because that's what we generally +c do by default in reduction functions. Besides, the accuracy +c of xDOT is questionable. Hence, do a cache-aligned nested loop. do j = 1,n - l = mod(k,kk) do i = 1,m - c(i,j) = ddot(l,a(i,1,j),m,b(i,1,j),m) + c(i,j) = 0d0 end do - do l = l+1,k,kk + do l = 1,k do i = 1,m - c(i,j) = c(i,j) + zdotc(kk,a(i,l,j),m,b(i,l,j),m) + c(i,j) = c(i,j) + conjg(a(i,l,j))*b(i,l,j) end do end do end do
--- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,7 @@ +2009-11-26 Jaroslav Hajek <highegg@gmail.com> + + * DLD-FUNCTIONS/dot.cc (Fdot): Update docs. + 2009-11-26 Jaroslav Hajek <highegg@gmail.com> * DLD-FUNCTIONS/dot.cc: New source.
--- a/src/DLD-FUNCTIONS/dot.cc +++ b/src/DLD-FUNCTIONS/dot.cc @@ -111,8 +111,9 @@ given, calculate the dot products along this dimension.\n\ \n\ This is equivalent to doing @code{sum (conj (@var{X}) .* @var{Y}, @var{dim})},\n\ -but avoids forming a temporary array and uses the BLAS xDOT functions,\n\ -usually resulting in increased accuracy of the computation.\n\ +but avoids forming a temporary array and is faster.\n\ +When @var{X} and @var{Y} are column vectors, the result is equivalent to\n\ +@code{ @var{X}'*@var{Y} }.\n\ @end deftypefn") { octave_value retval;