Mercurial > hg > octave-nkf
diff liboctave/Array.cc @ 7789:82be108cc558
First attempt at single precision tyeps
* * *
corrections to qrupdate single precision routines
* * *
prefer demotion to single over promotion to double
* * *
Add single precision support to log2 function
* * *
Trivial PROJECT file update
* * *
Cache optimized hermitian/transpose methods
* * *
Add tests for tranpose/hermitian and ChangeLog entry for new transpose code
author | David Bateman <dbateman@free.fr> |
---|---|
date | Sun, 27 Apr 2008 22:34:17 +0200 |
parents | ff918ee1a983 |
children | 762801c50b21 |
line wrap: on
line diff
--- a/liboctave/Array.cc +++ b/liboctave/Array.cc @@ -1203,7 +1203,48 @@ octave_idx_type nr = dim1 (); octave_idx_type nc = dim2 (); - if (nr > 1 && nc > 1) + if (nr >= 8 && nc >= 8) + { + Array<T> result (dim_vector (nc, nr)); + + // Blocked transpose to attempt to avoid cache misses. + + // Don't use OCTAVE_LOCAL_BUFFER here as it doesn't work with bool + // on some compilers. + T buf[64]; + + octave_idx_type ii = 0, jj; + for (jj = 0; jj < (nc - 8 + 1); jj += 8) + { + for (ii = 0; ii < (nr - 8 + 1); ii += 8) + { + // Copy to buffer + for (octave_idx_type j = jj, k = 0, idxj = jj * nr; + j < jj + 8; j++, idxj += nr) + for (octave_idx_type i = ii; i < ii + 8; i++) + buf[k++] = xelem (i + idxj); + + // Copy from buffer + for (octave_idx_type i = ii, idxi = ii * nc; i < ii + 8; + i++, idxi += nc) + for (octave_idx_type j = jj, k = i - ii; j < jj + 8; + j++, k+=8) + result.xelem (j + idxi) = buf[k]; + } + + if (ii < nr) + for (octave_idx_type j = jj; j < jj + 8; j++) + for (octave_idx_type i = ii; i < nr; i++) + result.xelem (j, i) = xelem (i, j); + } + + for (octave_idx_type j = jj; j < nc; j++) + for (octave_idx_type i = 0; i < nr; i++) + result.xelem (j, i) = xelem (i, j); + + return result; + } + else if (nr > 1 && nc > 1) { Array<T> result (dim_vector (nc, nr)); @@ -1221,6 +1262,103 @@ } template <class T> +Array<T> +Array<T>::hermitian (T (*fcn) (const T&)) const +{ + assert (ndims () == 2); + + octave_idx_type nr = dim1 (); + octave_idx_type nc = dim2 (); + + if (nr >= 8 && nc >= 8) + { + Array<T> result (dim_vector (nc, nr)); + + // Blocked transpose to attempt to avoid cache misses. + + // Don't use OCTAVE_LOCAL_BUFFER here as it doesn't work with bool + // on some compilers. + T buf[64]; + + octave_idx_type ii = 0, jj; + for (jj = 0; jj < (nc - 8 + 1); jj += 8) + { + for (ii = 0; ii < (nr - 8 + 1); ii += 8) + { + // Copy to buffer + for (octave_idx_type j = jj, k = 0, idxj = jj * nr; + j < jj + 8; j++, idxj += nr) + for (octave_idx_type i = ii; i < ii + 8; i++) + buf[k++] = xelem (i + idxj); + + // Copy from buffer + for (octave_idx_type i = ii, idxi = ii * nc; i < ii + 8; + i++, idxi += nc) + for (octave_idx_type j = jj, k = i - ii; j < jj + 8; + j++, k+=8) + result.xelem (j + idxi) = fcn (buf[k]); + } + + if (ii < nr) + for (octave_idx_type j = jj; j < jj + 8; j++) + for (octave_idx_type i = ii; i < nr; i++) + result.xelem (j, i) = fcn (xelem (i, j)); + } + + for (octave_idx_type j = jj; j < nc; j++) + for (octave_idx_type i = 0; i < nr; i++) + result.xelem (j, i) = fcn (xelem (i, j)); + + return result; + } + else + { + Array<T> result (dim_vector (nc, nr)); + + for (octave_idx_type j = 0; j < nc; j++) + for (octave_idx_type i = 0; i < nr; i++) + result.xelem (j, i) = fcn (xelem (i, j)); + + return result; + } +} + +/* + +%% Tranpose tests for matrices of the tile size and plus or minus a row +%% and with four tiles. + +%!shared m7, mt7, m8, mt8, m9, mt9 +%! m7 = reshape (1 : 7*8, 8, 7); +%! mt7 = [1:7; 1:7, 1:7, 1:7, 1:7; 1:7, 1:7, 1:7]; +%! m8 = reshape (1 : 8*8, 8, 8); +%! mt8 = [1:8; 1:8, 1:8, 1:8, 1:8; 1:8, 1:8, 1:8]; +%! m9 = reshape (1 : 9*8, 8, 9); +%! mt9 = [1:9; 1:9, 1:9, 1:9, 1:9; 1:9, 1:9, 1:9]; + +%!assert (m7', mt7) +%!assert ((1i*m7).', 1i * mt7) +%!assert ((1i*m7)', conj (1i * mt7)) +%!assert (m8', mt8) +%!assert ((1i*m8).', 1i * mt8) +%!assert ((1i*m8)', conj (1i * mt8)) +%!assert (m9', mt9) +%!assert ((1i*m9).', 1i * mt9) +%!assert ((1i*m9)', conj (1i * mt9)) + +%!assert ([m7, m7; m8, m8]', [mt7, mt8; mt7, mt8]) +%!assert ((1i*[m7, m7; m8, m8]).', 1i * [mt7, mt8; mt7, mt8]) +%!assert ((1i*[m7, m7; m8, m8])', conj (1i * [mt7, mt8; mt7, mt8])) +%!assert ([m8, m8; m8, m8]', [mt8, mt8; mt8, mt8]) +%!assert ((1i*[m8, m8; m8, m8]).', 1i * [mt8, mt8; mt8, mt8]) +%!assert ((1i*[m8, m8; m8, m8])', conj (1i * [mt8, mt8; mt8, mt8])) +%!assert ([m9, m9; m8, m8]', [mt9, mt8; mt9, mt8]) +%!assert ((1i*[m9, m9; m8, m8]).', 1i * [mt9, mt8; mt9, mt8]) +%!assert ((1i*[m9, m9; m8, m8])', conj (1i * [mt9, mt8; mt9, mt8])) + +*/ + +template <class T> T * Array<T>::fortran_vec (void) {