changeset 7261:f75588924b8c

New module 'striconv'.
author Bruno Haible <bruno@clisp.org>
date Wed, 06 Sep 2006 12:21:39 +0000
parents 59f9146ac193
children 7d52a0015061
files lib/striconv.c lib/striconv.h modules/striconv
diffstat 3 files changed, 536 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
new file mode 100644
--- /dev/null
+++ b/lib/striconv.c
@@ -0,0 +1,430 @@
+/* Charset conversion.
+   Copyright (C) 2001-2006 Free Software Foundation, Inc.
+   Written by Bruno Haible and Simon Josefsson.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+/* Specification.  */
+#include "striconv.h"
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if HAVE_ICONV
+# include <iconv.h>
+/* Get MB_LEN_MAX, CHAR_BIT.  */
+# include <limits.h>
+#endif
+
+#include "strdup.h"
+#include "c-strcase.h"
+
+#ifndef SIZE_MAX
+# define SIZE_MAX ((size_t) -1)
+#endif
+
+
+#if HAVE_ICONV
+
+/* Convert the SRCLEN bytes starting at SRC through the conversion
+   descriptor CD.  The input is converted twice: a first pass into a
+   scratch buffer only measures the output length, then *RESULTP is
+   (re)allocated to exactly that length and a second pass fills it.
+   *RESULTP must be NULL or a malloced block on entry.
+   Return 0 on success, with the block in *RESULTP and its length in
+   *LENGTHP; if the output is empty, *LENGTHP is set to 0 and *RESULTP is
+   left untouched.  Return -1 with errno set on failure; *RESULTP may
+   already have been reallocated by then and is never freed here.
+   An incomplete multibyte sequence at the end of the input (EINVAL from
+   iconv) is not treated as an error: conversion stops there and the bytes
+   converted before it are returned.  */
+int
+mem_cd_iconv (const char *src, size_t srclen, iconv_t cd,
+	      char **resultp, size_t *lengthp)
+{
+# define tmpbufsize 4096
+  size_t length;
+  char *result;
+
+  /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
+# if defined _LIBICONV_VERSION \
+    || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
+  /* Set to the initial state.  */
+  iconv (cd, NULL, NULL, NULL, NULL);
+# endif
+
+  /* Determine the length we need.  */
+  {
+    size_t count = 0;
+    char tmpbuf[tmpbufsize];
+    const char *inptr = src;
+    size_t insize = srclen;
+
+    while (insize > 0)
+      {
+	/* Each round converts into the start of the scratch buffer again;
+	   only the number of bytes produced is of interest here.  */
+	char *outptr = tmpbuf;
+	size_t outsize = tmpbufsize;
+	size_t res = iconv (cd,
+			    (ICONV_CONST char **) &inptr, &insize,
+			    &outptr, &outsize);
+
+	if (res == (size_t)(-1))
+	  {
+	    if (errno == E2BIG)
+	      ;
+	    else if (errno == EINVAL)
+	      {
+		/* Incomplete input at the end.  The bytes converted before
+		   it are still part of the result, so count them before
+		   leaving; otherwise the second pass would produce more
+		   output than was measured and run out of room.  */
+		count += outptr - tmpbuf;
+		break;
+	      }
+	    else
+	      return -1;
+	  }
+# if !defined _LIBICONV_VERSION && (defined sgi || defined __sgi)
+	/* Irix iconv() inserts a NUL byte if it cannot convert.  */
+	else if (res > 0)
+	  {
+	    errno = EILSEQ;
+	    return -1;
+	  }
+# endif
+	count += outptr - tmpbuf;
+      }
+    /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  */
+# if defined _LIBICONV_VERSION \
+    || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
+    {
+      /* Flush the conversion state and count what that emits.  */
+      char *outptr = tmpbuf;
+      size_t outsize = tmpbufsize;
+      size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
+
+      if (res == (size_t)(-1))
+	return -1;
+      count += outptr - tmpbuf;
+    }
+# endif
+    length = count;
+  }
+
+  /* Nothing to store: report an empty result without touching *RESULTP.  */
+  if (length == 0)
+    {
+      *lengthp = 0;
+      return 0;
+    }
+  result = (*resultp != NULL ? realloc (*resultp, length) : malloc (length));
+  if (result == NULL)
+    {
+      errno = ENOMEM;
+      return -1;
+    }
+  *resultp = result;
+  *lengthp = length;
+
+  /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
+# if defined _LIBICONV_VERSION \
+    || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
+  /* Return to the initial state.  */
+  iconv (cd, NULL, NULL, NULL, NULL);
+# endif
+
+  /* Do the conversion for real.  */
+  {
+    const char *inptr = src;
+    size_t insize = srclen;
+    char *outptr = result;
+    size_t outsize = length;
+
+    while (insize > 0)
+      {
+	size_t res = iconv (cd,
+			    (ICONV_CONST char **) &inptr, &insize,
+			    &outptr, &outsize);
+
+	if (res == (size_t)(-1))
+	  {
+	    if (errno == EINVAL)
+	      break;
+	    else
+	      return -1;
+	  }
+# if !defined _LIBICONV_VERSION && (defined sgi || defined __sgi)
+	/* Irix iconv() inserts a NUL byte if it cannot convert.  */
+	else if (res > 0)
+	  {
+	    errno = EILSEQ;
+	    return -1;
+	  }
+# endif
+      }
+    /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  */
+# if defined _LIBICONV_VERSION \
+    || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
+    {
+      size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
+
+      if (res == (size_t)(-1))
+	return -1;
+    }
+# endif
+    /* Both passes must have produced exactly the same amount of output.  */
+    if (outsize != 0)
+      abort ();
+  }
+
+  return 0;
+# undef tmpbufsize
+}
+
+char *
+str_cd_iconv (const char *src, iconv_t cd)
+{
+  /* Convert the NUL-terminated string SRC through the conversion descriptor
+     CD and return the result as a malloced NUL-terminated string, or NULL
+     with errno set on failure.
+     For most encodings, a trailing NUL byte in the input will be converted
+     to a trailing NUL byte in the output.  But not for UTF-7.  So that this
+     function is usable for UTF-7, we have to exclude the NUL byte from the
+     conversion and add it by hand afterwards.
+     Two implementations follow.  If PROBABLY_SLOWER is true, the work is
+     delegated to mem_cd_iconv and the NUL byte is appended afterwards.
+     Otherwise the string is converted directly into a buffer that is
+     doubled whenever iconv reports E2BIG.  */
+# if PROBABLY_SLOWER
+
+  char *result = NULL;
+  size_t length;
+  int retval = mem_cd_iconv (src, strlen (src), cd, &result, &length);
+  char *final_result;
+
+  if (retval < 0)
+    {
+      if (result != NULL)
+	{
+	  int saved_errno = errno;
+	  free (result);
+	  errno = saved_errno;
+	}
+      return NULL;
+    }
+
+  /* Add the terminating NUL byte.  RESULT may still be NULL here, because
+     mem_cd_iconv does not allocate anything when the output is empty; in
+     that case a fresh one-byte block is allocated.  */
+  final_result =
+    (result != NULL ? realloc (result, length + 1) : malloc (length + 1));
+  if (final_result == NULL)
+    {
+      if (result != NULL)
+	free (result);
+      errno = ENOMEM;
+      return NULL;
+    }
+  final_result[length] = '\0';
+
+  return final_result;
+
+# else
+
+  char *result;
+  size_t result_size;
+  size_t length;
+  const char *inptr = src;
+  size_t inbytes_remaining = strlen (src);
+
+  /* Make a guess for the worst-case output size, in order to avoid a
+     realloc.  It's OK if the guess is wrong as long as it is not zero and
+     doesn't lead to an integer overflow.  The guess is the input length
+     times MB_LEN_MAX, unless the input is so long that the product might
+     overflow, in which case the plain input length is used.  */
+  result_size = inbytes_remaining;
+  {
+    size_t approx_sqrt_SIZE_MAX = SIZE_MAX >> (sizeof (size_t) * CHAR_BIT / 2);
+    if (result_size <= approx_sqrt_SIZE_MAX / MB_LEN_MAX)
+      result_size *= MB_LEN_MAX;
+  }
+  result_size += 1; /* for the terminating NUL */
+
+  result = (char *) malloc (result_size);
+  if (result == NULL)
+    {
+      errno = ENOMEM;
+      return NULL;
+    }
+
+  /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
+# if defined _LIBICONV_VERSION \
+    || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
+  /* Set to the initial state.  */
+  iconv (cd, NULL, NULL, NULL, NULL);
+# endif
+
+  /* Do the conversion.  One byte of the buffer is always held back for the
+     terminating NUL.  On E2BIG the buffer is doubled and iconv is called
+     again; EINVAL (incomplete input at the end) ends the conversion
+     without being reported as an error; any other error fails.  */
+  {
+    char *outptr = result;
+    size_t outbytes_remaining = result_size - 1;
+
+    for (;;)
+      {
+	/* Here inptr + inbytes_remaining = src + strlen (src),
+		outptr + outbytes_remaining = result + result_size - 1.  */
+	size_t res = iconv (cd,
+			    (ICONV_CONST char **) &inptr, &inbytes_remaining,
+			    &outptr, &outbytes_remaining);
+
+	if (res == (size_t)(-1))
+	  {
+	    if (errno == EINVAL)
+	      break;
+	    else if (errno == E2BIG)
+	      {
+		size_t used = outptr - result;
+		size_t newsize = result_size * 2;
+		char *newresult;
+
+		if (!(newsize > result_size)) /* doubling wrapped around */
+		  {
+		    errno = ENOMEM;
+		    goto failed;
+		  }
+		newresult = (char *) realloc (result, newsize);
+		if (newresult == NULL)
+		  {
+		    errno = ENOMEM;
+		    goto failed;
+		  }
+		result = newresult;
+		result_size = newsize;
+		outptr = result + used;
+		outbytes_remaining = result_size - 1 - used;
+	      }
+	    else
+	      goto failed;
+	  }
+# if !defined _LIBICONV_VERSION && (defined sgi || defined __sgi)
+	/* Irix iconv() inserts a NUL byte if it cannot convert.  */
+	else if (res > 0)
+	  {
+	    errno = EILSEQ;
+	    goto failed;
+	  }
+# endif
+	else
+	  break;
+      }
+    /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  On all other platforms,
+       call iconv with a NULL input so that it can write whatever it still
+       has to emit at the end of the output; the buffer is doubled on E2BIG
+       exactly as in the loop above.  */
+# if defined _LIBICONV_VERSION \
+    || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
+    for (;;)
+      {
+	/* Here outptr + outbytes_remaining = result + result_size - 1.  */
+	size_t res = iconv (cd, NULL, NULL, &outptr, &outbytes_remaining);
+
+	if (res == (size_t)(-1))
+	  {
+	    if (errno == E2BIG)
+	      {
+		size_t used = outptr - result;
+		size_t newsize = result_size * 2;
+		char *newresult;
+
+		if (!(newsize > result_size)) /* doubling wrapped around */
+		  {
+		    errno = ENOMEM;
+		    goto failed;
+		  }
+		newresult = (char *) realloc (result, newsize);
+		if (newresult == NULL)
+		  {
+		    errno = ENOMEM;
+		    goto failed;
+		  }
+		result = newresult;
+		result_size = newsize;
+		outptr = result + used;
+		outbytes_remaining = result_size - 1 - used;
+	      }
+	    else
+	      goto failed;
+	  }
+	else
+	  break;
+      }
+# endif
+
+    /* Add the terminating NUL byte.  There is always room for it, because
+       one byte was held back from iconv throughout.  */
+    *outptr++ = '\0';
+
+    length = outptr - result;
+  }
+
+  /* Give away unused memory.  LENGTH includes the terminating NUL.  If the
+     shrinking realloc fails, the larger original block is returned.  */
+  if (length < result_size)
+    {
+      char *smaller_result = (char *) realloc (result, length);
+
+      if (smaller_result != NULL)
+	result = smaller_result;
+    }
+
+  return result;
+
+ failed:
+  {
+    int saved_errno = errno; /* free may clobber errno */
+    free (result);
+    errno = saved_errno;
+    return NULL;
+  }
+
+# endif
+}
+
+#endif
+
+/* Convert the NUL-terminated string SRC from FROM_CODESET to TO_CODESET.
+   Return a malloced NUL-terminated string on success.  On failure return
+   NULL with errno set: ENOMEM if memory is exhausted, ENOSYS if this build
+   has no iconv, otherwise the error reported by iconv_open, by the
+   conversion itself, or by iconv_close.  */
+char *
+str_iconv (const char *src, const char *from_codeset, const char *to_codeset)
+{
+  if (c_strcasecmp (from_codeset, to_codeset) == 0)
+    {
+      /* Same codeset on both sides: a plain copy suffices.  strdup is not
+	 guaranteed to set errno on every platform, so set it explicitly,
+	 like the other allocation failures in this file do.  */
+      char *result = strdup (src);
+
+      if (result == NULL)
+	errno = ENOMEM;
+      return result;
+    }
+  else
+    {
+#if HAVE_ICONV
+      iconv_t cd;
+      char *result;
+
+      /* Avoid glibc-2.1 bug with EUC-KR.  */
+# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
+      if (c_strcasecmp (from_codeset, "EUC-KR") == 0
+	  || c_strcasecmp (to_codeset, "EUC-KR") == 0)
+	{
+	  errno = EINVAL;
+	  return NULL;
+	}
+# endif
+      cd = iconv_open (to_codeset, from_codeset);
+      if (cd == (iconv_t) -1)
+	return NULL;
+
+      result = str_cd_iconv (src, cd);
+
+      if (result == NULL)
+	{
+	  /* Close cd, but preserve the errno from str_cd_iconv.  */
+	  int saved_errno = errno;
+	  iconv_close (cd);
+	  errno = saved_errno;
+	}
+      else
+	{
+	  if (iconv_close (cd) < 0)
+	    {
+	      /* Return NULL, but free the allocated memory, and while doing
+		 that, preserve the errno from iconv_close.  */
+	      int saved_errno = errno;
+	      free (result);
+	      errno = saved_errno;
+	      return NULL;
+	    }
+	}
+      return result;
+#else
+      /* This is a different error code than if iconv_open existed but didn't
+	 support from_codeset and to_codeset, so that the caller can emit
+	 an error message such as
+	   "iconv() is not supported. Installing GNU libiconv and
+	    then reinstalling this package would fix this."  */
+      errno = ENOSYS;
+      return NULL;
+#endif
+    }
+}
new file mode 100644
--- /dev/null
+++ b/lib/striconv.h
@@ -0,0 +1,75 @@
+/* Charset conversion.
+   Copyright (C) 2001-2004, 2006 Free Software Foundation, Inc.
+   Written by Bruno Haible and Simon Josefsson.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software Foundation,
+   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+#ifndef _STRICONV_H
+#define _STRICONV_H
+
+#include <stddef.h>
+#if HAVE_ICONV
+#include <iconv.h>
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#if HAVE_ICONV
+
+/* Convert an entire string from one encoding to another, using iconv.
+   The original string is at [SRC,...,SRC+SRCLEN-1].
+   The conversion descriptor is passed as CD.
+   *RESULTP should initially contain NULL or a malloced memory block.
+   May change the size of the allocated memory block in *RESULTP, storing
+   its new address in *RESULTP and its new length in *LENGTHP.
+   Return value: 0 if successful, otherwise -1 and errno set.
+   If successful, the resulting string is stored in *RESULTP and its length
+   in *LENGTHP.  If the result is empty, *LENGTHP is set to 0 and *RESULTP
+   is left unchanged.  The result is not NUL-terminated.
+   On failure, *RESULTP may already have been reallocated; this function
+   never frees it.  */
+extern int mem_cd_iconv (const char *src, size_t srclen, iconv_t cd,
+			 char **resultp, size_t *lengthp);
+
+/* Convert an entire string from one encoding to another, using iconv.
+   The original string is the NUL-terminated string starting at SRC.
+   The conversion descriptor is passed as CD.  Both the "from" and the "to"
+   encoding must use a single NUL byte at the end of the string (i.e. not
+   UCS-2, UCS-4, UTF-16, UTF-32).
+   CD is reset to its initial state before the conversion, except on
+   platforms where doing so is known to trigger an iconv bug.
+   Allocate a malloced memory block for the result; the caller owns it.
+   Return value: the freshly allocated resulting NUL-terminated string if
+   successful, otherwise NULL and errno set.  */
+extern char * str_cd_iconv (const char *src, iconv_t cd);
+
+#endif
+
+/* Convert an entire string from one encoding to another, using iconv.
+   The original string is the NUL-terminated string starting at SRC.
+   Both the "from" and the "to" encoding must use a single NUL byte at the
+   end of the string (i.e. not UCS-2, UCS-4, UTF-16, UTF-32).
+   If FROM_CODESET and TO_CODESET compare equal under c_strcasecmp, the
+   result is simply a copy of SRC and iconv is not used.
+   Allocate a malloced memory block for the result; the caller owns it.
+   Return value: the freshly allocated resulting NUL-terminated string if
+   successful, otherwise NULL and errno set.  When the codesets differ and
+   iconv is not available, errno is ENOSYS.  */
+extern char * str_iconv (const char *src,
+			 const char *from_codeset, const char *to_codeset);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* _STRICONV_H */
new file mode 100644
--- /dev/null
+++ b/modules/striconv
@@ -0,0 +1,31 @@
+Description:
+Character set conversion of strings made easy, using iconv.
+
+Files:
+lib/striconv.h
+lib/striconv.c
+
+Depends-on:
+iconv
+strdup
+c-strcase
+
+configure.ac:
+
+Makefile.am:
+lib_SOURCES += striconv.h striconv.c
+if GL_COND_LIBTOOL
+lib_LDFLAGS += $(LTLIBICONV)
+else
+lib_LDFLAGS += $(LIBICONV)
+endif
+
+Include:
+"striconv.h"
+
+License:
+LGPL
+
+Maintainer:
+Bruno Haible, Simon Josefsson
+