# HG changeset patch
# User Jaroslav Hajek <highegg@gmail.com>
# Date 1259265981 -3600
# Node ID 21d81d06b221f6fa307db82f6e7c130e075b5db7
# Parent  47f36dd27203d8d67bb0da48ca48c25621c4c149
cache-aligned loop for rowwise dot

diff --git a/libcruft/ChangeLog b/libcruft/ChangeLog
--- a/libcruft/ChangeLog
+++ b/libcruft/ChangeLog
@@ -1,3 +1,10 @@
+2009-11-26  Jaroslav Hajek  <highegg@gmail.com>
+
+	* blas-xtra/sdot3.f: Use nested cache-aligned loop for general case.
+	* blas-xtra/ddot3.f: Ditto.
+	* blas-xtra/cdotc3.f: Ditto.
+	* blas-xtra/zdotc3.f: Ditto.
+
 2009-11-26  Jaroslav Hajek  <highegg@gmail.com>
 
 	* blas-xtra/sdot3.f: New source.
diff --git a/libcruft/blas-xtra/cdotc3.f b/libcruft/blas-xtra/cdotc3.f
--- a/libcruft/blas-xtra/cdotc3.f
+++ b/libcruft/blas-xtra/cdotc3.f
@@ -29,8 +29,6 @@
       complex a(m,k,n),b(m,k,n)
       complex c(m,n)
 
-      integer kk
-      parameter (kk = 64)
       complex cdotc
       external cdotc
 
@@ -38,22 +36,21 @@
       if (m <= 0 .or. n <= 0) return
 
       if (m == 1) then
-c the column-major case
+c the column-major case.
         do j = 1,n
           c(1,j) = cdotc(k,a(1,1,j),1,b(1,1,j),1)
         end do
       else
-c here the product is row-wise, but we'd still like to use BLAS's dot
-c for its usually better accuracy. let's do a compromise and split the
-c middle dimension.
+c We prefer performance here, because that's what we generally
+c do by default in reduction functions. Besides, the accuracy
+c of xDOT is questionable. Hence, do a cache-aligned nested loop.
         do j = 1,n
-          l = mod(k,kk)
           do i = 1,m
-            c(i,j) = ddot(l,a(i,1,j),m,b(i,1,j),m)
+            c(i,j) = 0e0
           end do
-          do l = l+1,k,kk
+          do l = 1,k
             do i = 1,m
-              c(i,j) = c(i,j) + cdotc(kk,a(i,l,j),m,b(i,l,j),m)
+              c(i,j) = c(i,j) + conjg(a(i,l,j))*b(i,l,j)
             end do
           end do
         end do
diff --git a/libcruft/blas-xtra/ddot3.f b/libcruft/blas-xtra/ddot3.f
--- a/libcruft/blas-xtra/ddot3.f
+++ b/libcruft/blas-xtra/ddot3.f
@@ -29,8 +29,6 @@
       double precision a(m,k,n),b(m,k,n)
       double precision c(m,n)
 
-      integer kk
-      parameter (kk = 64)
       double precision ddot
       external ddot
 
@@ -39,22 +37,21 @@
       if (m <= 0 .or. n <= 0) return
 
       if (m == 1) then
-c the column-major case
+c the column-major case.
         do j = 1,n
           c(1,j) = ddot(k,a(1,1,j),1,b(1,1,j),1)
         end do
       else
-c here the product is row-wise, but we'd still like to use BLAS's dot
-c for its usually better accuracy. let's do a compromise and split the
-c middle dimension.
+c We prefer performance here, because that's what we generally
+c do by default in reduction functions. Besides, the accuracy
+c of xDOT is questionable. Hence, do a cache-aligned nested loop.
         do j = 1,n
-          l = mod(k,kk)
           do i = 1,m
-            c(i,j) = ddot(l,a(i,1,j),m,b(i,1,j),m)
+            c(i,j) = 0d0
           end do
-          do l = l+1,k,kk
+          do l = 1,k
             do i = 1,m
-              c(i,j) = c(i,j) + ddot(kk,a(i,l,j),m,b(i,l,j),m)
+              c(i,j) = c(i,j) + a(i,l,j)*b(i,l,j)
             end do
           end do
         end do
diff --git a/libcruft/blas-xtra/sdot3.f b/libcruft/blas-xtra/sdot3.f
--- a/libcruft/blas-xtra/sdot3.f
+++ b/libcruft/blas-xtra/sdot3.f
@@ -29,8 +29,6 @@
       real a(m,k,n),b(m,k,n)
       real c(m,n)
 
-      integer kk
-      parameter (kk = 64)
       real sdot
       external sdot
 
@@ -38,22 +36,21 @@
       if (m <= 0 .or. n <= 0) return
 
       if (m == 1) then
-c the column-major case
+c the column-major case.
         do j = 1,n
           c(1,j) = sdot(k,a(1,1,j),1,b(1,1,j),1)
         end do
       else
-c here the product is row-wise, but we'd still like to use BLAS's dot
-c for its usually better accuracy. let's do a compromise and split the
-c middle dimension.
+c We prefer performance here, because that's what we generally
+c do by default in reduction functions. Besides, the accuracy
+c of xDOT is questionable. Hence, do a cache-aligned nested loop.
         do j = 1,n
-          l = mod(k,kk)
           do i = 1,m
-            c(i,j) = ddot(l,a(i,1,j),m,b(i,1,j),m)
+            c(i,j) = 0d0
           end do
-          do l = l+1,k,kk
+          do l = 1,k
             do i = 1,m
-              c(i,j) = c(i,j) + sdot(kk,a(i,l,j),m,b(i,l,j),m)
+              c(i,j) = c(i,j) + a(i,l,j)*b(i,l,j)
             end do
           end do
         end do
diff --git a/libcruft/blas-xtra/zdotc3.f b/libcruft/blas-xtra/zdotc3.f
--- a/libcruft/blas-xtra/zdotc3.f
+++ b/libcruft/blas-xtra/zdotc3.f
@@ -29,8 +29,6 @@
       double complex a(m,k,n),b(m,k,n)
       double complex c(m,n)
 
-      integer kk
-      parameter (kk = 64)
       double complex zdotc
       external zdotc
 
@@ -38,22 +36,21 @@
       if (m <= 0 .or. n <= 0) return
 
       if (m == 1) then
-c the column-major case
+c the column-major case.
         do j = 1,n
           c(1,j) = zdotc(k,a(1,1,j),1,b(1,1,j),1)
         end do
       else
-c here the product is row-wise, but we'd still like to use BLAS's dot
-c for its usually better accuracy. let's do a compromise and split the
-c middle dimension.
+c We prefer performance here, because that's what we generally
+c do by default in reduction functions. Besides, the accuracy
+c of xDOT is questionable. Hence, do a cache-aligned nested loop.
         do j = 1,n
-          l = mod(k,kk)
           do i = 1,m
-            c(i,j) = ddot(l,a(i,1,j),m,b(i,1,j),m)
+            c(i,j) = 0d0
           end do
-          do l = l+1,k,kk
+          do l = 1,k
             do i = 1,m
-              c(i,j) = c(i,j) + zdotc(kk,a(i,l,j),m,b(i,l,j),m)
+              c(i,j) = c(i,j) + conjg(a(i,l,j))*b(i,l,j)
             end do
           end do
         end do
diff --git a/src/ChangeLog b/src/ChangeLog
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,7 @@
+2009-11-26  Jaroslav Hajek  <highegg@gmail.com>
+
+	* DLD-FUNCTIONS/dot.cc (Fdot): Update docs.
+
 2009-11-26  Jaroslav Hajek  <highegg@gmail.com>
 
 	* DLD-FUNCTIONS/dot.cc: New source.
diff --git a/src/DLD-FUNCTIONS/dot.cc b/src/DLD-FUNCTIONS/dot.cc
--- a/src/DLD-FUNCTIONS/dot.cc
+++ b/src/DLD-FUNCTIONS/dot.cc
@@ -111,8 +111,9 @@
 given, calculate the dot products along this dimension.\n\
 \n\
 This is equivalent to doing @code{sum (conj (@var{X}) .* @var{Y}, @var{dim})},\n\
-but avoids forming a temporary array and uses the BLAS xDOT functions,\n\
-usually resulting in increased accuracy of the computation.\n\
+but avoids forming a temporary array and is faster.\n\
+When @var{X} and @var{Y} are column vectors, the result is equivalent to\n\
+@code{ @var{X}'*@var{Y} }.\n\
 @end deftypefn")
 {
   octave_value retval;