diff liboctave/oct-sort.cc @ 8700:314be237cd5b

sorting optimizations
author Jaroslav Hajek <highegg@gmail.com>
date Mon, 09 Feb 2009 13:05:35 +0100
parents e2b4c19c455c
children e9cb742df9eb
line wrap: on
line diff
--- a/liboctave/oct-sort.cc
+++ b/liboctave/oct-sort.cc
@@ -1,6 +1,6 @@
 /*
 Copyright (C) 2003, 2004, 2005, 2006, 2007 David Bateman
-Copyright (C) 2008 Jaroslav Hajek
+Copyright (C) 2008, 2009 Jaroslav Hajek
 
 This file is part of Octave.
 
@@ -33,6 +33,10 @@
   memmove by std::copy is possible if the destination starts before the source.
   If not, std::copy_backward needs to be used.
   
+* templatize comparison operator in most methods, provide possible dispatch
+
+* duplicate methods to avoid by-the-way indexed sorting
+
 
 The Python license is
 
@@ -98,12 +102,8 @@
 #include "quit.h"
 #include "oct-sort.h"
 
-#ifndef IFLT
-#define IFLT(a,b)  if (compare ? compare ((a), (b)) : ((a) < (b)))
-#endif
-
 template <class T>
-octave_sort<T>::octave_sort (void) : compare (0)
+octave_sort<T>::octave_sort (void) : compare (ascending_compare)
 { 
   merge_init ();
   merge_getmem (1024);
@@ -116,38 +116,20 @@
   merge_getmem (1024);
 }
   
-/* Reverse a slice of a list in place, from lo up to (exclusive) hi. */
 template <class T>
+template <class Comp>
 void
-octave_sort<T>::reverse_slice (T *lo, T *hi)
+octave_sort<T>::binarysort (T *data, octave_idx_type nel, 
+                            octave_idx_type start, Comp comp)
 {
-  --hi;
-  while (lo < hi) 
-    {
-      T t = *lo;
-      *lo = *hi;
-      *hi = t;
-      ++lo;
-      --hi;
-    }
-}
-
-template <class T>
-void
-octave_sort<T>::binarysort (T *lo, T *hi, T *start)
-{
-  T *l, *p, *r;
-  T pivot;
-
-  if (lo == start)
+  if (start == 0)
     ++start;
 
-  for (; start < hi; ++start) 
+  for (; start < nel; ++start) 
     {
       /* set l to where *start belongs */
-      l = lo;
-      r = start;
-      pivot = *r;
+      octave_idx_type l = 0, r = start;
+      T pivot = data[start];
       /* Invariants:
        * pivot >= all in [lo, l).
        * pivot  < all in [r, start).
@@ -155,8 +137,8 @@
        */
       do 
 	{
-	  p = l + ((r - l) >> 1);
-	  IFLT (pivot, *p)
+	  octave_idx_type p = l + ((r - l) >> 1);
+	  if (comp (pivot, data[p]))
 	    r = p;
 	  else
 	    l = p+1;
@@ -169,9 +151,58 @@
 	 Slide over to make room.
 	 Caution: using memmove is much slower under MSVC 5;
 	 we're not usually moving many slots. */
-      for (p = start; p > l; --p)
-	*p = *(p-1);
-      *l = pivot;
+      // NOTE: using swap and going upwards appears to be faster.
+      for (octave_idx_type p = l; p < start; p++)
+        std::swap (pivot, data[p]);
+      data[start] = pivot;
+    }
+
+  return;
+}
+
+template <class T>
+template <class Comp>
+void
+octave_sort<T>::binarysort (T *data, octave_idx_type *idx, octave_idx_type nel, 
+                            octave_idx_type start, Comp comp)
+{
+  if (start == 0)
+    ++start;
+
+  for (; start < nel; ++start) 
+    {
+      /* set l to where *start belongs */
+      octave_idx_type l = 0, r = start;
+      T pivot = data[start];
+      /* Invariants:
+       * pivot >= all in [lo, l).
+       * pivot  < all in [r, start).
+       * The second is vacuously true at the start.
+       */
+      do 
+	{
+	  octave_idx_type p = l + ((r - l) >> 1);
+	  if (comp (pivot, data[p]))
+	    r = p;
+	  else
+	    l = p+1;
+	} 
+      while (l < r);
+      /* The invariants still hold, so pivot >= all in [lo, l) and
+	 pivot < all in [l, start), so pivot belongs at l.  Note
+	 that if there are elements equal to pivot, l points to the
+	 first slot after them -- that's why this sort is stable.
+	 Slide over to make room.
+	 Caution: using memmove is much slower under MSVC 5;
+	 we're not usually moving many slots. */
+      // NOTE: using swap and going upwards appears to be faster.
+      for (octave_idx_type p = l; p < start; p++)
+        std::swap (pivot, data[p]);
+      data[start] = pivot;
+      octave_idx_type ipivot = idx[start];
+      for (octave_idx_type p = l; p < start; p++)
+        std::swap (ipivot, idx[p]);
+      idx[start] = ipivot;
     }
 
   return;
@@ -196,10 +227,12 @@
 Returns -1 in case of error.
 */
 template <class T>
+template <class Comp>
 octave_idx_type
-octave_sort<T>::count_run (T *lo, T *hi, bool& descending)
+octave_sort<T>::count_run (T *lo, octave_idx_type nel, bool& descending, Comp comp)
 {
   octave_idx_type n;
+  T *hi = lo + nel;
 
   descending = false;
   ++lo;
@@ -208,12 +241,12 @@
 
   n = 2;
 
-  IFLT (*lo, *(lo-1))
+  if (comp (*lo, *(lo-1)))
     {
       descending = true;
       for (lo = lo+1; lo < hi; ++lo, ++n) 
 	{
-	  IFLT (*lo, *(lo-1))
+	  if (comp (*lo, *(lo-1)))
 	    ;
 	  else
 	    break;
@@ -223,7 +256,7 @@
     {
       for (lo = lo+1; lo < hi; ++lo, ++n) 
 	{
-	  IFLT (*lo, *(lo-1))
+	  if (comp (*lo, *(lo-1)))
 	    break;
 	}
     }
@@ -253,8 +286,10 @@
 Returns -1 on error.  See listsort.txt for info on the method.
 */
 template <class T>
+template <class Comp>
 octave_idx_type
-octave_sort<T>::gallop_left (T key, T *a, octave_idx_type n, octave_idx_type hint)
+octave_sort<T>::gallop_left (T key, T *a, octave_idx_type n, octave_idx_type hint,
+                             Comp comp)
 {
   octave_idx_type ofs;
   octave_idx_type lastofs;
@@ -263,7 +298,7 @@
   a += hint;
   lastofs = 0;
   ofs = 1;
-  IFLT (*a, key)
+  if (comp (*a, key))
     {
       /* a[hint] < key -- gallop right, until
        * a[hint + lastofs] < key <= a[hint + ofs]
@@ -271,7 +306,7 @@
       const octave_idx_type maxofs = n - hint;	/* &a[n-1] is highest */
       while (ofs < maxofs) 
 	{
-	  IFLT (a[ofs], key)
+	  if (comp (a[ofs], key))
 	    {
 	      lastofs = ofs;
 	      ofs = (ofs << 1) + 1;
@@ -295,7 +330,7 @@
       const octave_idx_type maxofs = hint + 1;	/* &a[0] is lowest */
       while (ofs < maxofs) 
 	{
-	  IFLT (*(a-ofs), key)
+	  if (comp (*(a-ofs), key))
 	    break;
 	  /* key <= a[hint - ofs] */
 	  lastofs = ofs;
@@ -321,7 +356,7 @@
     {
       octave_idx_type m = lastofs + ((ofs - lastofs) >> 1);
 
-      IFLT (a[m], key)
+      if (comp (a[m], key))
 	lastofs = m+1;	/* a[m] < key */
       else
 	ofs = m;	/* key <= a[m] */
@@ -345,8 +380,10 @@
 written as one routine with yet another "left or right?" flag.
 */
 template <class T>
+template <class Comp>
 octave_idx_type
-octave_sort<T>::gallop_right (T key, T *a, octave_idx_type n, octave_idx_type hint)
+octave_sort<T>::gallop_right (T key, T *a, octave_idx_type n, octave_idx_type hint,
+                              Comp comp)
 {
   octave_idx_type ofs;
   octave_idx_type lastofs;
@@ -355,7 +392,7 @@
   a += hint;
   lastofs = 0;
   ofs = 1;
-  IFLT (key, *a)
+  if (comp (key, *a))
     {
       /* key < a[hint] -- gallop left, until
        * a[hint - ofs] <= key < a[hint - lastofs]
@@ -363,7 +400,7 @@
       const octave_idx_type maxofs = hint + 1;	/* &a[0] is lowest */
       while (ofs < maxofs) 
 	{
-	  IFLT (key, *(a-ofs))
+	  if (comp (key, *(a-ofs)))
 	    {
 	      lastofs = ofs;
 	      ofs = (ofs << 1) + 1;
@@ -388,7 +425,7 @@
       const octave_idx_type maxofs = n - hint;	/* &a[n-1] is highest */
       while (ofs < maxofs) 
 	{
-	  IFLT (key, a[ofs])
+	  if (comp (key, a[ofs]))
 	    break;
 	  /* a[hint + ofs] <= key */
 	  lastofs = ofs;
@@ -413,7 +450,7 @@
     {
       octave_idx_type m = lastofs + ((ofs - lastofs) >> 1);
 
-      IFLT (key, a[m])
+      if (comp (key, a[m]))
 	ofs = m;	/* key < a[m] */
       else
 	lastofs = m+1;	/* a[m] <= key */
@@ -428,6 +465,7 @@
 octave_sort<T>::merge_init (void)
 {
   ms.a = 0;
+  ms.ia = 0;
   ms.alloced = 0;
   ms.n = 0;
   ms.min_gallop = MIN_GALLOP;
@@ -442,6 +480,7 @@
 octave_sort<T>::merge_freemem (void)
 {
   delete [] ms.a;
+  delete [] ms.ia;
   ms.alloced = 0;
   ms.a = 0;
 }
@@ -509,7 +548,29 @@
   return -1;
 }
 
-#define MERGE_GETMEM(NEED) ((NEED) <= ms.alloced ? 0 : merge_getmem (NEED))
+template <class T>
+int
+octave_sort<T>::merge_getmemi (octave_idx_type need)
+{
+  if (need <= ms.alloced && ms.a && ms.ia)
+    return 0;
+
+  need = roundupsize (need); 
+  /* Don't realloc!  That can cost cycles to copy the old data, but
+   * we don't care what's in the block.
+   */
+  merge_freemem ();
+  ms.a = new T[need];
+  ms.ia = new octave_idx_type[need];
+  if (ms.a && ms.ia)
+    {
+      ms.alloced = need;
+      return 0;
+    }
+  merge_freemem ();	/* reset to sane state */
+
+  return -1;
+}
 
 /* Merge the na elements starting at pa with the nb elements starting at pb
  * in a stable way, in-place.  na and nb must be > 0, and pa + na == pb.
@@ -518,15 +579,18 @@
  * Return 0 if successful, -1 if error.
  */
 template <class T>
+template <class Comp>
 int
-octave_sort<T>::merge_lo (T *pa, octave_idx_type na, T *pb, octave_idx_type nb)
+octave_sort<T>::merge_lo (T *pa, octave_idx_type na, 
+                          T *pb, octave_idx_type nb,
+                          Comp comp)
 {
   octave_idx_type k;
   T *dest;
   int result = -1;	/* guilty until proved innocent */
   octave_idx_type min_gallop = ms.min_gallop;
 
-  if (MERGE_GETMEM (na) < 0)
+  if (merge_getmem (na) < 0)
     return -1;
   std::copy (pa, pa + na, ms.a);
   dest = pa;
@@ -550,7 +614,10 @@
       for (;;)
 	{
 
-	  IFLT (*pb, *pa)
+          // FIXME: these loops are candidates for further optimizations.
+          // Rather than testing everything in each cycle, it may be more
+          // efficient to do it in hunks. 
+	  if (comp (*pb, *pa))
 	    {
 	      *dest++ = *pb++;
 	      ++bcount;
@@ -584,14 +651,13 @@
 	{
 	  min_gallop -= min_gallop > 1;
 	  ms.min_gallop = min_gallop;
-	  k = gallop_right (*pb, pa, na, 0);
+	  k = gallop_right (*pb, pa, na, 0, comp);
 	  acount = k;
 	  if (k)
 	    {
 	      if (k < 0)
 		goto Fail;
-              std::copy (pa, pa + k, dest);
-	      dest += k;
+              dest = std::copy (pa, pa + k, dest);
 	      pa += k;
 	      na -= k;
 	      if (na == 1)
@@ -608,14 +674,13 @@
 	  if (nb == 0)
 	    goto Succeed;
 
-	  k = gallop_left (*pa, pb, nb, 0);
+	  k = gallop_left (*pa, pb, nb, 0, comp);
 	  bcount = k;
 	  if (k)
 	    {
 	      if (k < 0)
 		goto Fail;
-              std::copy (pb, pb + k, dest);
-	      dest += k;
+              dest = std::copy (pb, pb + k, dest);
 	      pb += k;
 	      nb -= k;
 	      if (nb == 0)
@@ -648,6 +713,147 @@
   return 0;
 }
 
+template <class T>
+template <class Comp>
+int
+octave_sort<T>::merge_lo (T *pa, octave_idx_type *ipa, octave_idx_type na, 
+                          T *pb, octave_idx_type *ipb, octave_idx_type nb,
+                          Comp comp)
+{
+  octave_idx_type k;
+  T *dest;
+  octave_idx_type *idest;
+  int result = -1;	/* guilty until proved innocent */
+  octave_idx_type min_gallop = ms.min_gallop;
+
+  if (merge_getmemi (na) < 0)
+    return -1;
+  std::copy (pa, pa + na, ms.a);
+  std::copy (ipa, ipa + na, ms.ia);
+  dest = pa; idest = ipa;
+  pa = ms.a; ipa = ms.ia;
+
+  *dest++ = *pb++; *idest++ = *ipb++;
+  --nb;
+  if (nb == 0)
+    goto Succeed;
+  if (na == 1)
+    goto CopyB;
+
+  for (;;)
+    {
+      octave_idx_type acount = 0;	/* # of times A won in a row */
+      octave_idx_type bcount = 0;	/* # of times B won in a row */
+
+      /* Do the straightforward thing until (if ever) one run
+       * appears to win consistently.
+       */
+      for (;;)
+	{
+
+	  if (comp (*pb, *pa))
+	    {
+	      *dest++ = *pb++; *idest++ = *ipb++;
+	      ++bcount;
+	      acount = 0;
+	      --nb;
+	      if (nb == 0)
+		goto Succeed;
+	      if (bcount >= min_gallop)
+		break;
+	    }
+	  else
+	    {
+	      *dest++ = *pa++; *idest++ = *ipa++;
+	      ++acount;
+	      bcount = 0;
+	      --na;
+	      if (na == 1)
+		goto CopyB;
+	      if (acount >= min_gallop)
+		break;
+	    }
+	}
+
+      /* One run is winning so consistently that galloping may
+       * be a huge win.  So try that, and continue galloping until
+       * (if ever) neither run appears to be winning consistently
+       * anymore.
+       */
+      ++min_gallop;
+      do
+	{
+	  min_gallop -= min_gallop > 1;
+	  ms.min_gallop = min_gallop;
+	  k = gallop_right (*pb, pa, na, 0, comp);
+	  acount = k;
+	  if (k)
+	    {
+	      if (k < 0)
+		goto Fail;
+              dest = std::copy (pa, pa + k, dest);
+              idest = std::copy (ipa, ipa + k, idest);
+	      pa += k; ipa += k;
+	      na -= k;
+	      if (na == 1)
+		goto CopyB;
+	      /* na==0 is impossible now if the comparison
+	       * function is consistent, but we can't assume
+	       * that it is.
+	       */
+	      if (na == 0)
+		goto Succeed;
+	    }
+	  *dest++ = *pb++; *idest++ = *ipb++;
+	  --nb;
+	  if (nb == 0)
+	    goto Succeed;
+
+	  k = gallop_left (*pa, pb, nb, 0, comp);
+	  bcount = k;
+	  if (k)
+	    {
+	      if (k < 0)
+		goto Fail;
+              dest = std::copy (pb, pb + k, dest);
+              idest = std::copy (ipb, ipb + k, idest);
+	      pb += k; ipb += k;
+	      nb -= k;
+	      if (nb == 0)
+		goto Succeed;
+	    }
+	  *dest++ = *pa++; *idest++ = *ipa++;
+	  --na;
+	  if (na == 1)
+	    goto CopyB;
+	}
+      while (acount >= MIN_GALLOP || bcount >= MIN_GALLOP);
+
+      ++min_gallop;	/* penalize it for leaving galloping mode */
+      ms.min_gallop = min_gallop;
+    }
+
+ Succeed:
+  result = 0;
+
+ Fail:
+  if (na)
+    {
+      std::copy (pa, pa + na, dest);
+      std::copy (ipa, ipa + na, idest);
+    }
+  return result;
+
+ CopyB:
+  /* The last element of pa belongs at the end of the merge. */
+  std::copy (pb, pb + nb, dest);
+  std::copy (ipb, ipb + nb, idest);
+  dest[nb] = *pa;
+  idest[nb] = *ipa;
+
+  return 0;
+}
+
 /* Merge the na elements starting at pa with the nb elements starting at pb
  * in a stable way, in-place.  na and nb must be > 0, and pa + na == pb.
  * Must also have that *pb < *pa, that pa[na-1] belongs at the end of the
@@ -655,17 +861,19 @@
  * Return 0 if successful, -1 if error.
  */
 template <class T>
+template <class Comp>
 int
-octave_sort<T>::merge_hi (T *pa, octave_idx_type na, T *pb, octave_idx_type nb)
+octave_sort<T>::merge_hi (T *pa, octave_idx_type na, 
+                          T *pb, octave_idx_type nb,
+                          Comp comp)
 {
   octave_idx_type k;
   T *dest;
   int result = -1;	/* guilty until proved innocent */
-  T *basea;
-  T *baseb;
+  T *basea, *baseb;
   octave_idx_type min_gallop = ms.min_gallop;
 
-  if (MERGE_GETMEM (nb) < 0)
+  if (merge_getmem (nb) < 0)
     return -1;
   dest = pb + nb - 1;
   std::copy (pb, pb + nb, ms.a);
@@ -691,7 +899,7 @@
        */
       for (;;) 
 	{
-	  IFLT (*pb, *pa)
+	  if (comp (*pb, *pa))
 	    {
 	      *dest-- = *pa--;
 	      ++acount;
@@ -725,16 +933,15 @@
 	{
 	  min_gallop -= min_gallop > 1;
 	  ms.min_gallop = min_gallop;
-	  k = gallop_right (*pb, basea, na, na-1);
+	  k = gallop_right (*pb, basea, na, na-1, comp);
 	  if (k < 0)
 	    goto Fail;
 	  k = na - k;
 	  acount = k;
 	  if (k) 
 	    {
-	      dest -= k;
+              dest = std::copy_backward (pa+1 - k, pa+1, dest+1) - 1;
 	      pa -= k;
-              std::copy_backward (pa+1, pa+1 + k, dest+1 + k);
 	      na -= k;
 	      if (na == 0)
 		goto Succeed;
@@ -744,7 +951,7 @@
 	  if (nb == 1)
 	    goto CopyA;
 
-	  k = gallop_left (*pa, baseb, nb, nb-1);
+	  k = gallop_left (*pa, baseb, nb, nb-1, comp);
 	  if (k < 0)
 	    goto Fail;
 	  k = nb - k;
@@ -783,28 +990,176 @@
 
 CopyA:
   /* The first element of pb belongs at the front of the merge. */
-  dest -= na;
+  dest = std::copy_backward (pa+1 - na, pa+1, dest+1) - 1;
   pa -= na;
-  std::copy_backward (pa+1, pa+1 + na, dest+1 + na);
   *dest = *pb;
 
   return 0;
 }
 
+template <class T>
+template <class Comp>
+int
+octave_sort<T>::merge_hi (T *pa, octave_idx_type *ipa, octave_idx_type na, 
+                          T *pb, octave_idx_type *ipb, octave_idx_type nb,
+                          Comp comp)
+{
+  octave_idx_type k;
+  T *dest;
+  octave_idx_type *idest;
+  int result = -1;	/* guilty until proved innocent */
+  T *basea, *baseb;
+  octave_idx_type *ibasea, *ibaseb;
+  octave_idx_type min_gallop = ms.min_gallop;
+
+  if (merge_getmemi (nb) < 0)
+    return -1;
+  dest = pb + nb - 1;
+  idest = ipb + nb - 1;
+  std::copy (pb, pb + nb, ms.a);
+  std::copy (ipb, ipb + nb, ms.ia);
+  basea = pa; ibasea = ipa;
+  baseb = ms.a; ibaseb = ms.ia;
+  pb = ms.a + nb - 1; ipb = ms.ia + nb - 1;
+  pa += na - 1; ipa += na - 1;
+
+  *dest-- = *pa--; *idest-- = *ipa--;
+  --na;
+  if (na == 0)
+    goto Succeed;
+  if (nb == 1)
+    goto CopyA;
+
+  for (;;) 
+    {
+      octave_idx_type acount = 0;	/* # of times A won in a row */
+      octave_idx_type bcount = 0;	/* # of times B won in a row */
+
+      /* Do the straightforward thing until (if ever) one run
+       * appears to win consistently.
+       */
+      for (;;) 
+	{
+	  if (comp (*pb, *pa))
+	    {
+	      *dest-- = *pa--; *idest-- = *ipa--;
+	      ++acount;
+	      bcount = 0;
+	      --na;
+	      if (na == 0)
+		goto Succeed;
+	      if (acount >= min_gallop)
+		break;
+	    }
+	  else 
+	    {
+	      *dest-- = *pb--; *idest-- = *ipb--;
+	      ++bcount;
+	      acount = 0;
+	      --nb;
+	      if (nb == 1)
+		goto CopyA;
+	      if (bcount >= min_gallop)
+		break;
+	    }
+	}
+
+      /* One run is winning so consistently that galloping may
+       * be a huge win.  So try that, and continue galloping until
+       * (if ever) neither run appears to be winning consistently
+       * anymore.
+       */
+      ++min_gallop;
+      do 
+	{
+	  min_gallop -= min_gallop > 1;
+	  ms.min_gallop = min_gallop;
+	  k = gallop_right (*pb, basea, na, na-1, comp);
+	  if (k < 0)
+	    goto Fail;
+	  k = na - k;
+	  acount = k;
+	  if (k) 
+	    {
+              dest = std::copy_backward (pa+1 - k, pa+1, dest+1) - 1;
+              idest = std::copy_backward (ipa+1 - k, ipa+1, idest+1) - 1;
+	      pa -= k; ipa -= k;
+	      na -= k;
+	      if (na == 0)
+		goto Succeed;
+	    }
+	  *dest-- = *pb--; *idest-- = *ipb--;
+	  --nb;
+	  if (nb == 1)
+	    goto CopyA;
+
+	  k = gallop_left (*pa, baseb, nb, nb-1, comp);
+	  if (k < 0)
+	    goto Fail;
+	  k = nb - k;
+	  bcount = k;
+	  if (k) 
+	    {
+	      dest -= k; idest -= k;
+	      pb -= k; ipb -= k;
+              std::copy (pb+1, pb+1 + k, dest+1);
+              std::copy (ipb+1, ipb+1 + k, idest+1);
+	      nb -= k;
+	      if (nb == 1)
+		goto CopyA;
+	      /* nb==0 is impossible now if the comparison
+	       * function is consistent, but we can't assume
+	       * that it is.
+	       */
+	      if (nb == 0)
+		goto Succeed;
+	    }
+	  *dest-- = *pa--; *idest-- = *ipa--;
+	  --na;
+	  if (na == 0)
+	    goto Succeed;
+	} while (acount >= MIN_GALLOP || bcount >= MIN_GALLOP);
+      ++min_gallop;	/* penalize it for leaving galloping mode */
+      ms.min_gallop = min_gallop;
+    }
+
+Succeed:
+  result = 0;
+
+Fail:
+  if (nb)
+    {
+      std::copy (baseb, baseb + nb, dest-(nb-1));
+      std::copy (ibaseb, ibaseb + nb, idest-(nb-1));
+    }
+  return result;
+
+CopyA:
+  /* The first element of pb belongs at the front of the merge. */
+  dest = std::copy_backward (pa+1 - na, pa+1, dest+1) - 1;
+  idest = std::copy_backward (ipa+1 - na, ipa+1, idest+1) - 1;
+  pa -= na; ipa -= na;
+  *dest = *pb; *idest = *ipb;
+
+  return 0;
+}
+
 /* Merge the two runs at stack indices i and i+1.
  * Returns 0 on success, -1 on error.
  */
 template <class T>
+template <class Comp>
 int
-octave_sort<T>::merge_at (octave_idx_type i)
+octave_sort<T>::merge_at (octave_idx_type i, T *data,
+                          Comp comp)
 {
   T *pa, *pb;
   octave_idx_type na, nb;
   octave_idx_type k;
 
-  pa = ms.pending[i].base;
+  pa = data + ms.pending[i].base;
   na = ms.pending[i].len;
-  pb = ms.pending[i+1].base;
+  pb = data + ms.pending[i+1].base;
   nb = ms.pending[i+1].len;
 
   /* Record the length of the combined runs; if i is the 3rd-last
@@ -819,7 +1174,7 @@
   /* Where does b start in a?  Elements in a before that can be
    * ignored (already in place).
    */
-  k = gallop_right (*pb, pa, na, 0);
+  k = gallop_right (*pb, pa, na, 0, comp);
   if (k < 0)
     return -1;
   pa += k;
@@ -830,7 +1185,7 @@
   /* Where does a end in b?  Elements in b after that can be
    * ignored (already in place).
    */
-  nb = gallop_left (pa[na-1], pb, nb, nb-1);
+  nb = gallop_left (pa[na-1], pb, nb, nb-1, comp);
   if (nb <= 0)
     return nb;
 
@@ -838,9 +1193,63 @@
    * min(na, nb) elements.
    */
   if (na <= nb)
-    return merge_lo (pa, na, pb, nb);
+    return merge_lo (pa, na, pb, nb, comp);
   else
-    return merge_hi (pa, na, pb, nb);
+    return merge_hi (pa, na, pb, nb, comp);
+}
+
+template <class T>
+template <class Comp>
+int
+octave_sort<T>::merge_at (octave_idx_type i, T *data, octave_idx_type *idx,
+                          Comp comp)
+{
+  T *pa, *pb;
+  octave_idx_type *ipa, *ipb;
+  octave_idx_type na, nb;
+  octave_idx_type k;
+
+  pa = data + ms.pending[i].base;
+  ipa = idx + ms.pending[i].base;
+  na = ms.pending[i].len;
+  pb = data + ms.pending[i+1].base;
+  ipb = idx + ms.pending[i+1].base;
+  nb = ms.pending[i+1].len;
+
+  /* Record the length of the combined runs; if i is the 3rd-last
+   * run now, also slide over the last run (which isn't involved
+   * in this merge).  The current run i+1 goes away in any case.
+   */
+  ms.pending[i].len = na + nb;
+  if (i == ms.n - 3)
+    ms.pending[i+1] = ms.pending[i+2];
+  --ms.n;
+
+  /* Where does b start in a?  Elements in a before that can be
+   * ignored (already in place).
+   */
+  k = gallop_right (*pb, pa, na, 0, comp);
+  if (k < 0)
+    return -1;
+  pa += k; ipa += k;
+  na -= k;
+  if (na == 0)
+    return 0;
+
+  /* Where does a end in b?  Elements in b after that can be
+   * ignored (already in place).
+   */
+  nb = gallop_left (pa[na-1], pb, nb, nb-1, comp);
+  if (nb <= 0)
+    return nb;
+
+  /* Merge what remains of the runs, using a temp array with
+   * min(na, nb) elements.
+   */
+  if (na <= nb)
+    return merge_lo (pa, ipa, na, pb, ipb, nb, comp);
+  else
+    return merge_hi (pa, ipa, na, pb, ipb, nb, comp);
 }
 
 /* Examine the stack of runs waiting to be merged, merging adjacent runs
@@ -854,8 +1263,9 @@
  * Returns 0 on success, -1 on error.
  */
 template <class T>
+template <class Comp>
 int
-octave_sort<T>::merge_collapse (void)
+octave_sort<T>::merge_collapse (T *data, Comp comp)
 {
   struct s_slice *p = ms.pending;
 
@@ -866,12 +1276,41 @@
 	{
 	  if (p[n-1].len < p[n+1].len)
 	    --n;
-	  if (merge_at (n) < 0)
+	  if (merge_at (n, data, comp) < 0)
 	    return -1;
 	}
       else if (p[n].len <= p[n+1].len) 
 	{
-	  if (merge_at (n) < 0)
+	  if (merge_at (n, data, comp) < 0)
+	    return -1;
+	}
+      else
+	break;
+    }
+
+  return 0;
+}
+
+template <class T>
+template <class Comp>
+int
+octave_sort<T>::merge_collapse (T *data, octave_idx_type *idx, Comp comp)
+{
+  struct s_slice *p = ms.pending;
+
+  while (ms.n > 1) 
+    {
+      octave_idx_type n = ms.n - 2;
+      if (n > 0 && p[n-1].len <= p[n].len + p[n+1].len) 
+	{
+	  if (p[n-1].len < p[n+1].len)
+	    --n;
+	  if (merge_at (n, data, idx, comp) < 0)
+	    return -1;
+	}
+      else if (p[n].len <= p[n+1].len) 
+	{
+	  if (merge_at (n, data, idx, comp) < 0)
 	    return -1;
 	}
       else
@@ -887,8 +1326,9 @@
  * Returns 0 on success, -1 on error.
  */
 template <class T>
+template <class Comp>
 int
-octave_sort<T>::merge_force_collapse (void)
+octave_sort<T>::merge_force_collapse (T *data, Comp comp)
 {
   struct s_slice *p = ms.pending;
 
@@ -897,7 +1337,26 @@
       octave_idx_type n = ms.n - 2;
       if (n > 0 && p[n-1].len < p[n+1].len)
 	--n;
-      if (merge_at (n) < 0)
+      if (merge_at (n, data, comp) < 0)
+	return -1;
+    }
+
+  return 0;
+}
+
+template <class T>
+template <class Comp>
+int
+octave_sort<T>::merge_force_collapse (T *data, octave_idx_type *idx, Comp comp)
+{
+  struct s_slice *p = ms.pending;
+
+  while (ms.n > 1) 
+    {
+      octave_idx_type n = ms.n - 2;
+      if (n > 0 && p[n-1].len < p[n+1].len)
+	--n;
+      if (merge_at (n, data, idx, comp) < 0)
 	return -1;
     }
 
@@ -930,18 +1389,75 @@
 }
 
 template <class T>
+template <class Comp>
 void
-octave_sort<T>::sort (T *v, octave_idx_type elements)
+octave_sort<T>::sort (T *data, octave_idx_type nel, Comp comp)
 {
   /* Re-initialize the Mergestate as this might be the second time called */
   ms.n = 0;
   ms.min_gallop = MIN_GALLOP;
 
-  if (elements > 1)
+  if (nel > 1)
     {
-      octave_idx_type nremaining = elements; 
-      T *lo = v;
-      T *hi = v + elements;
+      octave_idx_type nremaining = nel; 
+      octave_idx_type lo = 0;
+
+      /* March over the array once, left to right, finding natural runs,
+       * and extending short natural runs to minrun elements.
+       */
+      octave_idx_type minrun = merge_compute_minrun (nremaining);
+      do 
+	{
+	  bool descending;
+	  octave_idx_type n;
+
+	  /* Identify next run. */
+	  n = count_run (data + lo, nremaining, descending, comp);
+	  if (n < 0)
+	    goto fail;
+	  if (descending)
+            std::reverse (data + lo, data + lo + n);
+	  /* If short, extend to min(minrun, nremaining). */
+	  if (n < minrun) 
+	    {
+	      const octave_idx_type force = nremaining <= minrun ? nremaining : minrun;
+	      binarysort (data + lo, force, n, comp);
+	      n = force;
+	    }
+	  /* Push run onto pending-runs stack, and maybe merge. */
+	  assert (ms.n < MAX_MERGE_PENDING);
+	  ms.pending[ms.n].base = lo;
+	  ms.pending[ms.n].len = n;
+	  ++ms.n;
+	  if (merge_collapse (data, comp) < 0)
+	    goto fail;
+	  /* Advance to find next run. */
+	  lo += n;
+	  nremaining -= n;
+	}
+      while (nremaining);
+
+      merge_force_collapse (data, comp);
+    }
+
+fail:
+  return;
+}
+
+template <class T>
+template <class Comp>
+void
+octave_sort<T>::sort (T *data, octave_idx_type *idx, octave_idx_type nel, 
+                      Comp comp)
+{
+  /* Re-initialize the Mergestate as this might be the second time called */
+  ms.n = 0;
+  ms.min_gallop = MIN_GALLOP;
+
+  if (nel > 1)
+    {
+      octave_idx_type nremaining = nel; 
+      octave_idx_type lo = 0;
 
       /* March over the array once, left to right, finding natural runs,
        * and extending short natural runs to minrun elements.
@@ -953,16 +1469,19 @@
 	  octave_idx_type n;
 
 	  /* Identify next run. */
-	  n = count_run (lo, hi, descending);
+	  n = count_run (data + lo, nremaining, descending, comp);
 	  if (n < 0)
 	    goto fail;
 	  if (descending)
-	    reverse_slice (lo, lo + n);
+            {
+              std::reverse (data + lo, data + lo + n);
+              std::reverse (idx + lo, idx + lo + n);
+            }
 	  /* If short, extend to min(minrun, nremaining). */
 	  if (n < minrun) 
 	    {
 	      const octave_idx_type force = nremaining <= minrun ? nremaining : minrun;
-	      binarysort (lo, lo + force, lo + n);
+	      binarysort (data + lo, idx + lo, force, n, comp);
 	      n = force;
 	    }
 	  /* Push run onto pending-runs stack, and maybe merge. */
@@ -970,7 +1489,7 @@
 	  ms.pending[ms.n].base = lo;
 	  ms.pending[ms.n].len = n;
 	  ++ms.n;
-	  if (merge_collapse () < 0)
+	  if (merge_collapse (data, idx, comp) < 0)
 	    goto fail;
 	  /* Advance to find next run. */
 	  lo += n;
@@ -978,13 +1497,63 @@
 	}
       while (nremaining);
 
-      merge_force_collapse ();
+      merge_force_collapse (data, idx, comp);
     }
 
 fail:
   return;
 }
 
+template <class T>
+void
+octave_sort<T>::sort (T *data, octave_idx_type nel)
+{
+#ifdef INLINE_ASCENDING_SORT
+  if (compare == ascending_compare)
+    sort (data, nel, std::less<T> ());
+  else
+#endif
+#ifdef INLINE_DESCENDING_SORT    
+    if (compare == descending_compare)
+    sort (data, nel, std::greater<T> ());
+  else
+#endif
+    if (compare)
+      sort (data, nel, compare);
+}
+
+template <class T>
+void
+octave_sort<T>::sort (T *data, octave_idx_type *idx, octave_idx_type nel)
+{
+#ifdef INLINE_ASCENDING_SORT
+  if (compare == ascending_compare)
+    sort (data, idx, nel, std::less<T> ());
+  else
+#endif
+#ifdef INLINE_DESCENDING_SORT    
+    if (compare == descending_compare)
+    sort (data, idx, nel, std::greater<T> ());
+  else
+#endif
+    if (compare)
+      sort (data, idx, nel, compare);
+}
+
+template <class T>
+bool 
+octave_sort<T>::ascending_compare (T x, T y)
+{
+  return x < y;
+}
+
+template <class T>
+bool 
+octave_sort<T>::descending_compare (T x, T y)
+{
+  return x > y;
+}
+
 /*
 ;;; Local Variables: ***
 ;;; mode: C++ ***