# HG changeset patch # User Jaroslav Hajek # Date 1259265981 -3600 # Node ID 21d81d06b221f6fa307db82f6e7c130e075b5db7 # Parent 47f36dd27203d8d67bb0da48ca48c25621c4c149 cache-aligned loop for rowwise dot diff --git a/libcruft/ChangeLog b/libcruft/ChangeLog --- a/libcruft/ChangeLog +++ b/libcruft/ChangeLog @@ -1,3 +1,10 @@ +2009-11-26 Jaroslav Hajek + + * blas-xtra/sdot3.f: Use nested cache-aligned loop for general case. + * blas-xtra/ddot3.f: Ditto. + * blas-xtra/cdotc3.f: Ditto. + * blas-xtra/zdotc3.f: Ditto. + 2009-11-26 Jaroslav Hajek * blas-xtra/sdot3.f: New source. diff --git a/libcruft/blas-xtra/cdotc3.f b/libcruft/blas-xtra/cdotc3.f --- a/libcruft/blas-xtra/cdotc3.f +++ b/libcruft/blas-xtra/cdotc3.f @@ -29,8 +29,6 @@ complex a(m,k,n),b(m,k,n) complex c(m,n) - integer kk - parameter (kk = 64) complex cdotc external cdotc @@ -38,22 +36,21 @@ if (m <= 0 .or. n <= 0) return if (m == 1) then -c the column-major case +c the column-major case. do j = 1,n c(1,j) = cdotc(k,a(1,1,j),1,b(1,1,j),1) end do else -c here the product is row-wise, but we'd still like to use BLAS's dot -c for its usually better accuracy. let's do a compromise and split the -c middle dimension. +c We prefer performance here, because that's what we generally +c do by default in reduction functions. Besides, the accuracy +c of xDOT is questionable. Hence, do a cache-aligned nested loop. do j = 1,n - l = mod(k,kk) do i = 1,m - c(i,j) = ddot(l,a(i,1,j),m,b(i,1,j),m) + c(i,j) = 0e0 end do - do l = l+1,k,kk + do l = 1,k do i = 1,m - c(i,j) = c(i,j) + cdotc(kk,a(i,l,j),m,b(i,l,j),m) + c(i,j) = c(i,j) + conjg(a(i,l,j))*b(i,l,j) end do end do end do diff --git a/libcruft/blas-xtra/ddot3.f b/libcruft/blas-xtra/ddot3.f --- a/libcruft/blas-xtra/ddot3.f +++ b/libcruft/blas-xtra/ddot3.f @@ -29,8 +29,6 @@ double precision a(m,k,n),b(m,k,n) double precision c(m,n) - integer kk - parameter (kk = 64) double precision ddot external ddot @@ -39,22 +37,21 @@ if (m <= 0 .or. n <= 0) return if (m == 1) then -c the column-major case +c the column-major case. do j = 1,n c(1,j) = ddot(k,a(1,1,j),1,b(1,1,j),1) end do else -c here the product is row-wise, but we'd still like to use BLAS's dot -c for its usually better accuracy. let's do a compromise and split the -c middle dimension. +c We prefer performance here, because that's what we generally +c do by default in reduction functions. Besides, the accuracy +c of xDOT is questionable. Hence, do a cache-aligned nested loop. do j = 1,n - l = mod(k,kk) do i = 1,m - c(i,j) = ddot(l,a(i,1,j),m,b(i,1,j),m) + c(i,j) = 0d0 end do - do l = l+1,k,kk + do l = 1,k do i = 1,m - c(i,j) = c(i,j) + ddot(kk,a(i,l,j),m,b(i,l,j),m) + c(i,j) = c(i,j) + a(i,l,j)*b(i,l,j) end do end do end do diff --git a/libcruft/blas-xtra/sdot3.f b/libcruft/blas-xtra/sdot3.f --- a/libcruft/blas-xtra/sdot3.f +++ b/libcruft/blas-xtra/sdot3.f @@ -29,8 +29,6 @@ real a(m,k,n),b(m,k,n) real c(m,n) - integer kk - parameter (kk = 64) real sdot external sdot @@ -38,22 +36,21 @@ if (m <= 0 .or. n <= 0) return if (m == 1) then -c the column-major case +c the column-major case. do j = 1,n c(1,j) = sdot(k,a(1,1,j),1,b(1,1,j),1) end do else -c here the product is row-wise, but we'd still like to use BLAS's dot -c for its usually better accuracy. let's do a compromise and split the -c middle dimension. +c We prefer performance here, because that's what we generally +c do by default in reduction functions. Besides, the accuracy +c of xDOT is questionable. Hence, do a cache-aligned nested loop. do j = 1,n - l = mod(k,kk) do i = 1,m - c(i,j) = ddot(l,a(i,1,j),m,b(i,1,j),m) + c(i,j) = 0d0 end do - do l = l+1,k,kk + do l = 1,k do i = 1,m - c(i,j) = c(i,j) + sdot(kk,a(i,l,j),m,b(i,l,j),m) + c(i,j) = c(i,j) + a(i,l,j)*b(i,l,j) end do end do end do diff --git a/libcruft/blas-xtra/zdotc3.f b/libcruft/blas-xtra/zdotc3.f --- a/libcruft/blas-xtra/zdotc3.f +++ b/libcruft/blas-xtra/zdotc3.f @@ -29,8 +29,6 @@ double complex a(m,k,n),b(m,k,n) double complex c(m,n) - integer kk - parameter (kk = 64) double complex zdotc external zdotc @@ -38,22 +36,21 @@ if (m <= 0 .or. n <= 0) return if (m == 1) then -c the column-major case +c the column-major case. do j = 1,n c(1,j) = zdotc(k,a(1,1,j),1,b(1,1,j),1) end do else -c here the product is row-wise, but we'd still like to use BLAS's dot -c for its usually better accuracy. let's do a compromise and split the -c middle dimension. +c We prefer performance here, because that's what we generally +c do by default in reduction functions. Besides, the accuracy +c of xDOT is questionable. Hence, do a cache-aligned nested loop. do j = 1,n - l = mod(k,kk) do i = 1,m - c(i,j) = ddot(l,a(i,1,j),m,b(i,1,j),m) + c(i,j) = 0d0 end do - do l = l+1,k,kk + do l = 1,k do i = 1,m - c(i,j) = c(i,j) + zdotc(kk,a(i,l,j),m,b(i,l,j),m) + c(i,j) = c(i,j) + conjg(a(i,l,j))*b(i,l,j) end do end do end do diff --git a/src/ChangeLog b/src/ChangeLog --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,7 @@ +2009-11-26 Jaroslav Hajek + + * DLD-FUNCTIONS/dot.cc (Fdot): Update docs. + 2009-11-26 Jaroslav Hajek * DLD-FUNCTIONS/dot.cc: New source. diff --git a/src/DLD-FUNCTIONS/dot.cc b/src/DLD-FUNCTIONS/dot.cc --- a/src/DLD-FUNCTIONS/dot.cc +++ b/src/DLD-FUNCTIONS/dot.cc @@ -111,8 +111,9 @@ given, calculate the dot products along this dimension.\n\ \n\ This is equivalent to doing @code{sum (conj (@var{X}) .* @var{Y}, @var{dim})},\n\ -but avoids forming a temporary array and uses the BLAS xDOT functions,\n\ -usually resulting in increased accuracy of the computation.\n\ +but avoids forming a temporary array and is faster.\n\ +When @var{X} and @var{Y} are column vectors, the result is equivalent to\n\ +@code{ @var{X}'*@var{Y} }.\n\ @end deftypefn") { octave_value retval;