changeset 16705:54b750a813cb

regex: diagnose too-large repeat counts in EREs. Previously, the code did not diagnose the too-large repeat count in EREs like 'b{1000000000}'; instead, it silently treated the ERE as if it were 'b\{1000000000}', which is unexpected. * lib/regcomp.c (parse_dup_op): Fail with REG_ESIZE if a repeat count is too large. REG_ESIZE is used nowhere else, and the diagnostic is a reasonable one for this problem. Another option would be to create a new REG_OVERFLOW error for repeat counts that are too large. (fetch_number): Return RE_DUP_MAX + 1, not REG_ERROR, if the repeat count is too large, so that the caller can distinguish the two cases. * lib/regex.h (_REG_ESIZE): Document that this is now a generic "Too large" return code, and that repeat counts are one example of this.
author Paul Eggert <eggert@cs.ucla.edu>
date Fri, 16 Mar 2012 14:17:55 -0700
parents 4f6f5d9d5b77
children bd179cf2c9ea
files ChangeLog lib/regcomp.c lib/regex.h
diffstat 3 files changed, 27 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,18 @@
+2012-03-16  Paul Eggert  <eggert@cs.ucla.edu>
+
+	regex: diagnose too-large repeat counts in EREs
+	Previously, the code did not diagnose the too-large repeat count
+	in EREs like 'b{1000000000}'; instead, it silently treated the ERE
+	as if it were 'b\{1000000000}', which is unexpected.
+	* lib/regcomp.c (parse_dup_op): Fail with REG_ESIZE if a repeat count
+	is too large.  REG_ESIZE is used nowhere else, and the diagnostic
+	is a reasonable one for this problem.  Another option would be to
+	create a new REG_OVERFLOW error for repeat counts that are too large.
+	(fetch_number): Return RE_DUP_MAX + 1, not REG_ERROR, if the repeat
+	count is too large, so that the caller can distinguish the two cases.
+	* lib/regex.h (_REG_ESIZE): Document that this is now a generic
+	"Too large" return code, and that repeat counts are one example of this.
+
 2012-03-16  Paul Eggert  <eggert@cs.ucla.edu>
 
 	doc: some glibc x32 integer width issues
--- a/lib/regcomp.c
+++ b/lib/regcomp.c
@@ -2571,6 +2571,12 @@
 	  *err = REG_BADBR;
 	  return NULL;
 	}
+
+      if (BE (RE_DUP_MAX < (end == REG_MISSING ? start : end), 0))
+	{
+	  *err = REG_ESIZE;
+	  return NULL;
+	}
     }
   else
     {
@@ -3751,6 +3757,7 @@
 /* This is intended for the expressions like "a{1,3}".
    Fetch a number from 'input', and return the number.
    Return REG_MISSING if the number field is empty like "{,1}".
+   Return RE_DUP_MAX + 1 if the number field is too large.
    Return REG_ERROR if an error occurred.  */
 
 static Idx
@@ -3769,8 +3776,9 @@
       num = ((token->type != CHARACTER || c < '0' || '9' < c
 	      || num == REG_ERROR)
 	     ? REG_ERROR
-	     : ((num == REG_MISSING) ? c - '0' : num * 10 + c - '0'));
-      num = (num > RE_DUP_MAX) ? REG_ERROR : num;
+	     : num == REG_MISSING
+	     ? c - '0'
+	     : MIN (RE_DUP_MAX + 1, num * 10 + c - '0'));
     }
   return num;
 }
--- a/lib/regex.h
+++ b/lib/regex.h
@@ -304,7 +304,7 @@
 /* RE_DUP_MAX is 2**15 - 1 because an earlier implementation stored
    the counter as a 2-byte signed integer.  This is no longer true, so
    RE_DUP_MAX could be increased to (INT_MAX / 10 - 1), or to
-   ((SIZE_MAX - 2) / 10 - 1) if _REGEX_LARGE_OFFSETS is defined.
+   ((SIZE_MAX - 9) / 10) if _REGEX_LARGE_OFFSETS is defined.
    However, there would be a huge performance problem if someone
    actually used a pattern like a\{214748363\}, so RE_DUP_MAX retains
    its historical value.  */
@@ -375,7 +375,7 @@
 
   /* Error codes we've added.  */
   _REG_EEND,		/* Premature end.  */
-  _REG_ESIZE,		/* Compiled pattern bigger than 2^16 bytes.  */
+  _REG_ESIZE,		/* Too large (e.g., repeat count too large).  */
   _REG_ERPAREN		/* Unmatched ) or \); not returned from regcomp.  */
 } reg_errcode_t;