changeset 7344:72030a247f35

* lib/regex_internal.c (re_string_reconstruct): Handle offset < pstr->valid_raw_len && pstr->offsets_needed case. Ensure no bytes read before raw_mbs array. Pass a saved copy of pstr->valid_len - 1 rather than pstr->valid_raw_len - 1 to re_string_context_at.
* m4/regex.m4 (gl_REGEX): Check for locale.h, since the test now requires it.
(gl_PREREQ_REGEX): Don't check for locale.h any more, since gl_REGEX now does it for us.
(gl_REGEX): Add test taken from http://sourceware.org/ml/libc-hacker/2006-09/msg00008.html.
author Paul Eggert <eggert@cs.ucla.edu>
date Thu, 21 Sep 2006 05:25:11 +0000
parents d0f586b66e09
children 549fa03da97f
files lib/ChangeLog lib/regex_internal.c m4/ChangeLog m4/regex.m4
diffstat 4 files changed, 140 insertions(+), 23 deletions(-) [+]
line wrap: on
line diff
--- a/lib/ChangeLog
+++ b/lib/ChangeLog
@@ -1,3 +1,15 @@
+2006-09-20  Paul Eggert  <eggert@cs.ucla.edu>
+
+	Import this patch from libc:
+
+	2006-09-06  Jakub Jelinek  <jakub@redhat.com>
+
+	* regex_internal.c (re_string_reconstruct): Handle
+	offset < pstr->valid_raw_len && pstr->offsets_needed case.
+	Ensure no bytes read before raw_mbs array.  Pass a saved copy of
+	pstr->valid_len - 1 rather than pstr->valid_raw_len - 1 to
+	re_string_context_at.
+
 2006-09-20  Bruno Haible  <bruno@clisp.org>
 
 	* mkdtemp.c: Import from libc.
--- a/lib/regex_internal.c
+++ b/lib/regex_internal.c
@@ -597,34 +597,98 @@
 
   if (BE (offset != 0, 1))
     {
-      /* Are the characters which are already checked remain?  */
-      if (BE (offset < pstr->valid_raw_len, 1)
-#ifdef RE_ENABLE_I18N
-	  /* Handling this would enlarge the code too much.
-	     Accept a slowdown in that case.  */
-	  && pstr->offsets_needed == 0
-#endif
-	 )
+      /* Should the already checked characters be kept?  */
+      if (BE (offset < pstr->valid_raw_len, 1))
 	{
 	  /* Yes, move them to the front of the buffer.  */
-	  pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
 #ifdef RE_ENABLE_I18N
-	  if (pstr->mb_cur_max > 1)
-	    memmove (pstr->wcs, pstr->wcs + offset,
-		     (pstr->valid_len - offset) * sizeof (wint_t));
+	  if (BE (pstr->offsets_needed, 0))
+	    {
+	      Idx low = 0, high = pstr->valid_len, mid;
+	      do
+		{
+		  mid = (high + low) / 2;
+		  if (pstr->offsets[mid] > offset)
+		    high = mid;
+		  else if (pstr->offsets[mid] < offset)
+		    low = mid + 1;
+		  else
+		    break;
+		}
+	      while (low < high);
+	      if (pstr->offsets[mid] < offset)
+		++mid;
+	      pstr->tip_context = re_string_context_at (pstr, mid - 1,
+							eflags);
+	      /* This can be quite complicated, so handle specially
+		 only the common and easy case where the character with
+		 different length representation of lower and upper
+		 case is present at or after offset.  */
+	      if (pstr->valid_len > offset
+		  && mid == offset && pstr->offsets[mid] == offset)
+		{
+		  memmove (pstr->wcs, pstr->wcs + offset,
+			   (pstr->valid_len - offset) * sizeof (wint_t));
+		  memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
+		  pstr->valid_len -= offset;
+		  pstr->valid_raw_len -= offset;
+		  for (low = 0; low < pstr->valid_len; low++)
+		    pstr->offsets[low] = pstr->offsets[low + offset] - offset;
+		}
+	      else
+		{
+		  /* Otherwise, just find out how long the partial multibyte
+		     character at offset is and fill it with WEOF/255.  */
+		  pstr->len = pstr->raw_len - idx + offset;
+		  pstr->stop = pstr->raw_stop - idx + offset;
+		  pstr->offsets_needed = 0;
+		  while (mid > 0 && pstr->offsets[mid - 1] == offset)
+		    --mid;
+		  while (mid < pstr->valid_len)
+		    if (pstr->wcs[mid] != WEOF)
+		      break;
+		    else
+		      ++mid;
+		  if (mid == pstr->valid_len)
+		    pstr->valid_len = 0;
+		  else
+		    {
+		      pstr->valid_len = pstr->offsets[mid] - offset;
+		      if (pstr->valid_len)
+			{
+			  for (low = 0; low < pstr->valid_len; ++low)
+			    pstr->wcs[low] = WEOF;
+			  memset (pstr->mbs, 255, pstr->valid_len);
+			}
+		    }
+		  pstr->valid_raw_len = pstr->valid_len;
+		}
+	    }
+	  else
+#endif
+	    {
+	      pstr->tip_context = re_string_context_at (pstr, offset - 1,
+							eflags);
+#ifdef RE_ENABLE_I18N
+	      if (pstr->mb_cur_max > 1)
+		memmove (pstr->wcs, pstr->wcs + offset,
+			 (pstr->valid_len - offset) * sizeof (wint_t));
 #endif /* RE_ENABLE_I18N */
-	  if (BE (pstr->mbs_allocated, 0))
-	    memmove (pstr->mbs, pstr->mbs + offset,
-		     pstr->valid_len - offset);
-	  pstr->valid_len -= offset;
-	  pstr->valid_raw_len -= offset;
+	      if (BE (pstr->mbs_allocated, 0))
+		memmove (pstr->mbs, pstr->mbs + offset,
+			 pstr->valid_len - offset);
+	      pstr->valid_len -= offset;
+	      pstr->valid_raw_len -= offset;
 #if DEBUG
-	  assert (pstr->valid_len > 0);
+	      assert (pstr->valid_len > 0);
 #endif
+	    }
 	}
       else
 	{
 	  /* No, skip all characters until IDX.  */
+	  Idx prev_valid_len = pstr->valid_len;
+
 #ifdef RE_ENABLE_I18N
 	  if (BE (pstr->offsets_needed, 0))
 	    {
@@ -648,6 +712,8 @@
 		     byte other than 0x80 - 0xbf.  */
 		  raw = pstr->raw_mbs + pstr->raw_mbs_idx;
 		  end = raw + (offset - pstr->mb_cur_max);
+		  if (end < pstr->raw_mbs)
+		    end = pstr->raw_mbs;
 		  p = raw + offset - 1;
 #ifdef _LIBC
 		  /* We know the wchar_t encoding is UCS4, so for the simple
@@ -655,7 +721,7 @@
 		  if (isascii (*p) && BE (pstr->trans == NULL, 1))
 		    {
 		      memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
-		      pstr->valid_len = 0;
+		      /* pstr->valid_len = 0; */
 		      wc = (wchar_t) *p;
 		    }
 		  else
@@ -698,7 +764,7 @@
 		pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
 	      if (wc == WEOF)
 		pstr->tip_context
-		  = re_string_context_at (pstr, pstr->valid_raw_len - 1, eflags);
+		  = re_string_context_at (pstr, prev_valid_len - 1, eflags);
 	      else
 		pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
 				      && IS_WIDE_WORD_CHAR (wc))
@@ -711,7 +777,7 @@
 		  for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
 		    pstr->wcs[wcs_idx] = WEOF;
 		  if (pstr->mbs_allocated)
-		    memset (pstr->mbs, -1, pstr->valid_len);
+		    memset (pstr->mbs, 255, pstr->valid_len);
 		}
 	      pstr->valid_raw_len = pstr->valid_len;
 	    }
--- a/m4/ChangeLog
+++ b/m4/ChangeLog
@@ -1,5 +1,12 @@
 2006-09-20  Paul Eggert  <eggert@cs.ucla.edu>
 
+	* regex.m4 (gl_REGEX): Check for locale.h, since the test
+	now requires it.
+	(gl_PREREQ_REGEX): Don't check for locale.h any more, since
+	gl_REGEX now does it for us.
+	(gl_REGEX): Add test taken from
+	http://sourceware.org/ml/libc-hacker/2006-09/msg00008.html.
+
 	* mkstemp.m4 (gl_FUNC_MKSTEMP): Require AC_SYS_LARGEFILE.
 	Check that large offsets work.  Modernize Autoconf usages.
 	Prefer "yes" to mean a good thing rather than a bad.
--- a/m4/regex.m4
+++ b/m4/regex.m4
@@ -1,4 +1,4 @@
-#serial 39
+#serial 40
 
 # Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, 2005,
 # 2006 Free Software Foundation, Inc.
@@ -14,6 +14,8 @@
 
 AC_DEFUN([gl_REGEX],
 [
+  AC_CHECK_HEADERS_ONCE([locale.h])
+
   AC_ARG_WITH([included-regex],
     [AC_HELP_STRING([--without-included-regex],
 		    [don't compile regex; this is the default on
@@ -34,6 +36,9 @@
       [AC_RUN_IFELSE(
 	[AC_LANG_PROGRAM(
 	  [AC_INCLUDES_DEFAULT
+	   #if HAVE_LOCALE_H
+	    #include <locale.h>
+	   #endif
 	   #include <limits.h>
 	   #include <regex.h>
 	   ],
@@ -42,6 +47,33 @@
 	    int i;
 	    const char *s;
 	    struct re_registers regs;
+
+	    #if HAVE_LOCALE_H
+	      /* http://sourceware.org/ml/libc-hacker/2006-09/msg00008.html
+		 This test needs valgrind to catch the bug on Debian
+		 GNU/Linux 3.1 x86, but it might catch the bug better
+		 on other platforms and it shouldn't hurt to try the
+		 test here.  */
+	      if (setlocale (LC_ALL, "en_US.UTF-8"))
+		{
+		  static char const pat[] = "insert into";
+		  static char const data[] =
+		    "\xFF\0\x12\xA2\xAA\xC4\xB1,K\x12\xC4\xB1*\xACK";
+		  re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE
+				 | RE_ICASE);
+		  memset (&regex, 0, sizeof regex);
+		  s = re_compile_pattern (pat, sizeof pat - 1, &regex);
+		  if (s)
+		    return 1;
+		  if (re_search (&regex, data, sizeof data - 1,
+				 0, sizeof data - 1, &regs)
+		      != -1)
+		    return 1;
+		  if (! setlocale (LC_ALL, "C"))
+		    return 1;
+		}
+	    #endif
+
 	    re_set_syntax (RE_SYNTAX_POSIX_EGREP);
 	    memset (&regex, 0, sizeof (regex));
 	    for (i = 0; i <= UCHAR_MAX; i++)
@@ -161,7 +193,7 @@
   AC_REQUIRE([AC_GNU_SOURCE])
   AC_REQUIRE([AC_C_RESTRICT])
   AC_REQUIRE([AM_LANGINFO_CODESET])
-  AC_CHECK_HEADERS_ONCE([locale.h wchar.h wctype.h])
+  AC_CHECK_HEADERS_ONCE([wchar.h wctype.h])
   AC_CHECK_FUNCS_ONCE([mbrtowc mempcpy wcrtomb wcscoll])
   AC_CHECK_DECLS([isblank], [], [], [#include <ctype.h>])
 ])