changeset 2370:d8ded9c0e20c

(enum re_opcode_t): New opcode on_failure_jump_nastyloop. (print_partial_compiled_pattern, re_compile_fastmap): Handle new opcode. (regex_compile): Use on_failure_jump_nastyloop for non-greedy loops. (re_match_2_internal): Add code for on_failure_jump_nastyloop when executing it as well as when popping it off the stack to find infinite loops in non-greedy repetition operators.
author Stefan Monnier <monnier@iro.umontreal.ca>
date Sun, 26 Mar 2000 23:05:51 +0000
parents e04db7109fa0
children 3274fb530385
files regex.c
diffstat 1 files changed, 50 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/regex.c
+++ b/regex.c
@@ -20,7 +20,6 @@
    USA.	 */
 
 /* TODO:
-   - detect nasty infinite loops like "\\(\\)+?ab" when matching "ac".
    - use analyze_first to optimize non-empty loops
    - optimize succeed_n and jump_n away when possible
    - clean up multibyte issues
@@ -532,6 +531,12 @@
 	   indefinitely).  */
   on_failure_jump_loop,
 
+	/* Just like `on_failure_jump_loop', except that it checks for
+	   a different kind of loop (the kind that shows up with non-greedy
+	   operators).  This operation has to be immediately preceded
+	   by a `no_op'.  */
+  on_failure_jump_nastyloop,
+
         /* A smart `on_failure_jump' used for greedy * and + operators.
 	   It analyses the loop before which it is put and if the
 	   loop does not require backtracking, it changes itself to
@@ -942,6 +947,11 @@
 	  printf ("/on_failure_keep_string_jump to %d", p + mcnt - start);
 	  break;
 
+	case on_failure_jump_nastyloop:
+	  extract_number_and_incr (&mcnt, &p);
+	  printf ("/on_failure_jump_nastyloop to %d", p + mcnt - start);
+	  break;
+
 	case on_failure_jump_loop:
 	  extract_number_and_incr (&mcnt, &p);
 	  printf ("/on_failure_jump_loop to %d", p + mcnt - start);
@@ -2179,19 +2189,19 @@
 	    else		/* not greedy */
 	      { /* I wish the greedy and non-greedy cases could be merged. */
 
+		GET_BUFFER_SPACE (7); /* We might use less.  */
 		if (many_times_ok)
 		  {
 		    /* The non-greedy multiple match looks like a repeat..until:
 		       we only need a conditional jump at the end of the loop */
-		    GET_BUFFER_SPACE (3);
-		    STORE_JUMP (on_failure_jump, b, laststart);
+		    BUF_PUSH (no_op);
+		    STORE_JUMP (on_failure_jump_nastyloop, b, laststart);
 		    b += 3;
 		    if (zero_times_ok)
 		      {
 			/* The repeat...until naturally matches one or more.
 			   To also match zero times, we need to first jump to
 			   the end of the loop (its conditional jump). */
-			GET_BUFFER_SPACE (3);
 			INSERT_JUMP (jump, laststart, b);
 			b += 3;
 		      }
@@ -2199,7 +2209,6 @@
 		else
 		  {
 		    /* non-greedy a?? */
-		    GET_BUFFER_SPACE (6);
 		    INSERT_JUMP (jump, laststart, b + 3);
 		    b += 3;
 		    INSERT_JUMP (on_failure_jump, laststart, laststart + 6);
@@ -3514,6 +3523,7 @@
 	    case on_failure_jump:
 	    case on_failure_keep_string_jump:
 	    case on_failure_jump_loop:
+	    case on_failure_jump_nastyloop:
 	    case on_failure_jump_smart:
 	      p++;
 	      break;
@@ -3526,6 +3536,7 @@
 
 	case on_failure_jump:
 	case on_failure_keep_string_jump:
+	case on_failure_jump_nastyloop:
 	case on_failure_jump_loop:
 	case on_failure_jump_smart:
 	handle_on_failure_jump:
@@ -3708,7 +3719,7 @@
   int anchored_start = 0;
 
   /* Nonzero if we have to concern multibyte character.	 */
-  int multibyte = bufp->multibyte;
+  const boolean multibyte = bufp->multibyte;
 
   /* Check for out-of-range STARTPOS.  */
   if (startpos < 0 || startpos > total_size)
@@ -5067,6 +5078,30 @@
 	  PUSH_FAILURE_POINT (p - 3, NULL);
 	  break;
 
+	  /* A nasty loop is introduced by the non-greedy *? and +?.
+	     With such loops, the stack only ever contains one failure point
+	     at a time, so that a plain on_failure_jump_loop kind of
+	     cycle detection cannot work.  Worse yet, such a detection
+	     can not only fail to detect a cycle, but it can also wrongly
+	     detect a cycle (between different instantiations of the same
+	     loop.
+	     So the method used for those nasty loops is a little different:
+	     We use a special cycle-detection-stack-frame which is pushed
+	     when the on_failure_jump_nastyloop failure-point is *popped*.
+	     This special frame thus marks the beginning of one iteration
+	     through the loop and we can hence easily check right here
+	     whether something matched between the beginning and the end of
+	     the loop.  */
+	case on_failure_jump_nastyloop:
+	  EXTRACT_NUMBER_AND_INCR (mcnt, p);
+	  DEBUG_PRINT3 ("EXECUTING on_failure_jump_nastyloop %d (to %p):\n",
+			mcnt, p + mcnt);
+
+	  assert ((re_opcode_t)p[-4] == no_op);
+	  CHECK_INFINITE_LOOP (p - 4, d);
+	  PUSH_FAILURE_POINT (p - 3, d);
+	  break;
+
 
 	  /* Simple loop detecting on_failure_jump:  just check on the
 	     failure stack if the same spot was already hit earlier.  */
@@ -5428,6 +5463,11 @@
 	      assert (str == NULL);
 	      goto continue_failure_jump;
 
+	    case on_failure_jump_nastyloop:
+	      assert ((re_opcode_t)pat[-2] == no_op);
+	      PUSH_FAILURE_POINT (pat - 2, str);
+	      /* Fallthrough */
+
 	    case on_failure_jump_loop:
 	    case on_failure_jump:
 	    case succeed_n:
@@ -5437,6 +5477,10 @@
 	      p = pat + mcnt;
 	      break;
 
+	    case no_op:
+	      /* A special frame used for nastyloops. */
+	      goto fail;
+
 	    default:
 	      abort();
 	    }