changeset 6057:a878a8d58823

Make strstr() work in multibyte locales.
author Bruno Haible <bruno@clisp.org>
date Wed, 17 Aug 2005 14:05:33 +0000
parents 4838606fdc03
children a7440145d6a9
files ChangeLog lib/ChangeLog lib/strstr.c lib/strstr.h m4/ChangeLog m4/strstr.m4 modules/strstr
diffstat 7 files changed, 130 insertions(+), 112 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2005-08-17  Bruno Haible  <bruno@clisp.org>
+
+	* modules/strstr (Files): Add m4/mbrtowc.m4.
+	(Depends-on): Add mbuiter.
+
 2005-08-17  Bruno Haible  <bruno@clisp.org>
 
 	* modules/strcase (Depends-on): Add mbuiter. Remove strnlen1, mbchar.
--- a/lib/ChangeLog
+++ b/lib/ChangeLog
@@ -1,3 +1,8 @@
+2005-08-17  Bruno Haible  <bruno@clisp.org>
+
+	* strstr.h: Ignore HAVE_STRSTR, always declare the gnulib function.
+	* strstr.c: Completely rewritten, with multibyte locale support.
+
 2005-08-17  Bruno Haible  <bruno@clisp.org>
 
 	* strcasecmp.c: Use mbuiter.h.
--- a/lib/strstr.c
+++ b/lib/strstr.c
@@ -1,119 +1,126 @@
-/* Copyright (C) 1994, 1999, 2002-2003 Free Software Foundation, Inc.
-This file is part of the GNU C Library.
+/* Searching in a string.
+   Copyright (C) 2005 Free Software Foundation, Inc.
+   Written by Bruno Haible <bruno@clisp.org>, 2005.
 
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2, or (at your option)
-any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-GNU General Public License for more details.
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
 
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
 
-/*
- * My personal strstr() implementation that beats most other algorithms.
- * Until someone tells me otherwise, I assume that this is the
- * fastest implementation of strstr() in C.
- * I deliberately chose not to comment it.  You should have at least
- * as much fun trying to understand it, as I had to write it :-).
- *
- * Stephen R. van den Berg, berg@pool.informatik.rwth-aachen.de	*/
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
 
 #if HAVE_CONFIG_H
 # include <config.h>
 #endif
 
-#include <string.h>
-
-typedef unsigned chartype;
+/* Specification.  */
+#include "strstr.h"
 
-#undef strstr
+#if HAVE_MBRTOWC
+# include "mbuiter.h"
+#endif
 
+/* Find the first occurrence of NEEDLE in HAYSTACK.  */
 char *
-strstr (const char *phaystack, const char *pneedle)
+strstr (const char *haystack, const char *needle)
 {
-  register const unsigned char *haystack, *needle;
-  register chartype b, c;
-
-  haystack = (const unsigned char *) phaystack;
-  needle = (const unsigned char *) pneedle;
-
-  b = *needle;
-  if (b != '\0')
+  /* Be careful not to look at the entire extent of haystack or needle
+     until needed.  This is useful because of these two cases:
+       - haystack may be very long, and a match of needle found early,
+       - needle may be very long, and not even a short initial segment of
+         needle may be found in haystack.  */
+#if HAVE_MBRTOWC
+  if (MB_CUR_MAX > 1)
     {
-      haystack--;				/* possible ANSI violation */
-      do
-	{
-	  c = *++haystack;
-	  if (c == '\0')
-	    goto ret0;
-	}
-      while (c != b);
+      mbui_iterator_t iter_needle;
 
-      c = *++needle;
-      if (c == '\0')
-	goto foundneedle;
-      ++needle;
-      goto jin;
+      mbui_init (iter_needle, needle);
+      if (mbui_avail (iter_needle))
+	{
+	  mbui_iterator_t iter_haystack;
 
-      for (;;)
-        {
-          register chartype a;
-	  register const unsigned char *rhaystack, *rneedle;
-
-	  do
+	  mbui_init (iter_haystack, haystack);
+	  for (;; mbui_advance (iter_haystack))
 	    {
-	      a = *++haystack;
-	      if (a == '\0')
-		goto ret0;
-	      if (a == b)
-		break;
-	      a = *++haystack;
-	      if (a == '\0')
-		goto ret0;
-shloop:;    }
-          while (a != b);
+	      if (!mbui_avail (iter_haystack))
+		/* No match.  */
+		return NULL;
 
-jin:	  a = *++haystack;
-	  if (a == '\0')
-	    goto ret0;
+	      if (mb_equal (mbui_cur (iter_haystack), mbui_cur (iter_needle)))
+		/* The first character matches.  */
+		{
+		  mbui_iterator_t rhaystack;
+		  mbui_iterator_t rneedle;
 
-	  if (a != c)
-	    goto shloop;
+		  memcpy (&rhaystack, &iter_haystack, sizeof (mbui_iterator_t));
+		  mbui_advance (rhaystack);
 
-	  rhaystack = haystack-- + 1;
-	  rneedle = needle;
-	  a = *rneedle;
+		  mbui_init (rneedle, needle);
+		  if (!mbui_avail (rneedle))
+		    abort ();
+		  mbui_advance (rneedle);
 
-	  if (*rhaystack == a)
-	    do
-	      {
-		if (a == '\0')
-		  goto foundneedle;
-		++rhaystack;
-		a = *++needle;
-		if (*rhaystack != a)
-		  break;
-		if (a == '\0')
-		  goto foundneedle;
-		++rhaystack;
-		a = *++needle;
-	      }
-	    while (*rhaystack == a);
+		  for (;; mbui_advance (rhaystack), mbui_advance (rneedle))
+		    {
+		      if (!mbui_avail (rneedle))
+			/* Found a match; it starts at iter_haystack, not haystack.  */
+			return (char *) mbui_cur_ptr (iter_haystack);
+		      if (!mbui_avail (rhaystack))
+			/* No match: haystack ended before needle did.  */
+			return NULL;
+		      if (!mb_equal (mbui_cur (rhaystack), mbui_cur (rneedle)))
+			/* Nothing in this round.  */
+			break;
+		    }
+		}
+	    }
+	}
+      else
+	return (char *) haystack;
+    }
+  else
+#endif
+    {
+      if (*needle != '\0')
+	{
+	  /* Speed up the following searches of needle by caching its first
+	     character.  */
+	  char b = *needle++;
 
-	  needle = rneedle;		   /* took the register-poor approach */
+	  for (;; haystack++)
+	    {
+	      if (*haystack == '\0')
+		/* No match.  */
+		return NULL;
+	      if (*haystack == b)
+		/* The first character matches.  */
+		{
+		  const char *rhaystack = haystack + 1;
+		  const char *rneedle = needle;
 
-	  if (a == '\0')
-	    break;
-        }
+		  for (;; rhaystack++, rneedle++)
+		    {
+		      if (*rneedle == '\0')
+			/* Found a match.  */
+			return (char *) haystack;
+		      if (*rhaystack == '\0')
+			/* No match.  */
+			return NULL;
+		      if (*rhaystack != *rneedle)
+			/* Nothing in this round.  */
+			break;
+		    }
+		}
+	    }
+	}
+      else
+	return (char *) haystack;
     }
-foundneedle:
-  return (char*) haystack;
-ret0:
-  return 0;
 }
--- a/lib/strstr.h
+++ b/lib/strstr.h
@@ -1,5 +1,5 @@
 /* Searching in a string.
-   Copyright (C) 2001-2003 Free Software Foundation, Inc.
+   Copyright (C) 2001-2003, 2005 Free Software Foundation, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -15,13 +15,6 @@
    along with this program; if not, write to the Free Software Foundation,
    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
 
-#if HAVE_STRSTR
-
-/* Get strstr() declaration.  */
-#include <string.h>
-
-#else
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -32,5 +25,3 @@
 #ifdef __cplusplus
 }
 #endif
-
-#endif
--- a/m4/ChangeLog
+++ b/m4/ChangeLog
@@ -1,3 +1,8 @@
+2005-08-17  Bruno Haible  <bruno@clisp.org>
+
+	* strstr.m4 (gl_FUNC_STRSTR): Use the replacement function always.
+	(gl_PREREQ_STRSTR): Use gl_FUNC_MBRTOWC.
+
 2005-08-16  Paul Eggert  <eggert@cs.ucla.edu>
 
 	* getopt.m4 (gl_GETOPT_CHECK_HEADERS): Do not override the results
--- a/m4/strstr.m4
+++ b/m4/strstr.m4
@@ -1,16 +1,19 @@
-# strstr.m4 serial 2
-dnl Copyright (C) 2002-2003 Free Software Foundation, Inc.
+# strstr.m4 serial 3
+dnl Copyright (C) 2002-2003, 2005 Free Software Foundation, Inc.
 dnl This file is free software; the Free Software Foundation
 dnl gives unlimited permission to copy and/or distribute it,
 dnl with or without modifications, as long as this notice is preserved.
 
 AC_DEFUN([gl_FUNC_STRSTR],
 [
-  AC_REPLACE_FUNCS(strstr)
-  if test $ac_cv_func_strstr = no; then
-    gl_PREREQ_STRSTR
-  fi
+  dnl No known system has a strstr() function that works correctly in
+  dnl multibyte locales. Therefore we use our version always.
+  AC_LIBOBJ(strstr)
+  AC_DEFINE(strstr, rpl_strstr, [Define to rpl_strstr always.])
+  gl_PREREQ_STRSTR
 ])
 
 # Prerequisites of lib/strstr.c.
-AC_DEFUN([gl_PREREQ_STRSTR], [:])
+AC_DEFUN([gl_PREREQ_STRSTR], [
+  gl_FUNC_MBRTOWC
+])
--- a/modules/strstr
+++ b/modules/strstr
@@ -5,8 +5,10 @@
 lib/strstr.h
 lib/strstr.c
 m4/strstr.m4
+m4/mbrtowc.m4
 
 Depends-on:
+mbuiter
 
 configure.ac:
 gl_FUNC_STRSTR