changeset 10050:53b5cb33e138

Use u8_conv_from_encoding instead of using special code for the conversion.
author Bruno Haible <bruno@clisp.org>
date Sat, 10 May 2008 15:23:28 +0200
parents c67210e038d9
children fd29ccb01214
files ChangeLog lib/unilbrk/ulc-common.c lib/unilbrk/ulc-common.h lib/unilbrk/ulc-possible-linebreaks.c lib/unilbrk/ulc-width-linebreaks.c modules/unilbrk/ulc-common modules/unilbrk/ulc-possible-linebreaks modules/unilbrk/ulc-width-linebreaks
diffstat 8 files changed, 150 insertions(+), 319 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,22 @@
+2008-05-10  Bruno Haible  <bruno@clisp.org>
+
+	* lib/unilbrk/ulc-common.c: Don't include <stdlib.h>.
+	(iconv_string_length, iconv_string_keeping_offsets): Remove functions.
+	* lib/unilbrk/ulc-common.h (iconv_string_length,
+	iconv_string_keeping_offsets): Remove declarations.
+	* lib/unilbrk/ulc-possible-linebreaks.c: Include <string.h>, uniconv.h.
+	Don't include <iconv.h>, streq.h, xsize.h.
+	(ulc_possible_linebreaks): Use u8_conv_from_encoding for doing the
+	conversion.
+	* lib/unilbrk/ulc-width-linebreaks.c: Include uniconv.h. Don't include
+	<iconv.h>, streq.h, xsize.h.
+	(ulc_width_linebreaks): Use u8_conv_from_encoding for doing the
+	conversion.
+	* modules/unilbrk/ulc-common (Depends-on): Remove iconv.
+	* modules/unilbrk/ulc-possible-linebreaks (Depends-on): Add
+	uniconv/u8-conv-from-enc. Remove iconv_open, streq, xsize.
+	* modules/unilbrk/ulc-width-linebreaks (Depends-on): Likewise.
+
 2008-05-10  Bruno Haible  <bruno@clisp.org>
 
 	* modules/unilbrk/ulc-width-linebreaks-tests: New file.
--- a/lib/unilbrk/ulc-common.c
+++ b/lib/unilbrk/ulc-common.c
@@ -20,8 +20,6 @@
 /* Specification.  */
 #include "unilbrk/ulc-common.h"
 
-#include <stdlib.h>
-
 #include "c-ctype.h"
 #include "streq.h"
 
@@ -33,122 +31,6 @@
   return 0;
 }
 
-#if HAVE_ICONV
-
-# include <errno.h>
-
-size_t
-iconv_string_length (iconv_t cd, const char *s, size_t n)
-{
-# define TMPBUFSIZE 4096
-  size_t count = 0;
-  char tmpbuf[TMPBUFSIZE];
-  const char *inptr = s;
-  size_t insize = n;
-
-  while (insize > 0)
-    {
-      char *outptr = tmpbuf;
-      size_t outsize = TMPBUFSIZE;
-      size_t res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
-      if (res == (size_t)(-1) && errno != E2BIG
-# if !defined _LIBICONV_VERSION && !defined __GLIBC__
-	  /* Irix iconv() inserts a NUL byte if it cannot convert.
-	     NetBSD iconv() inserts a question mark if it cannot convert.
-	     Only GNU libiconv and GNU libc are known to prefer to fail rather
-	     than doing a lossy conversion.  */
-	  || res > 0
-# endif
-	 )
-	return (size_t)(-1);
-      count += outptr - tmpbuf;
-    }
-  /* Avoid glibc-2.1 bug and Solaris 7 through 9 bug.  */
-# if defined _LIBICONV_VERSION \
-     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
-  {
-    char *outptr = tmpbuf;
-    size_t outsize = TMPBUFSIZE;
-    size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);
-    if (res == (size_t)(-1))
-      return (size_t)(-1);
-    count += outptr - tmpbuf;
-  }
-  /* Return to the initial state.  */
-  iconv (cd, NULL, NULL, NULL, NULL);
-# endif
-  return count;
-# undef TMPBUFSIZE
-}
-
-void
-iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n,
-			      size_t *offtable, char *t, size_t m)
-{
-  size_t i;
-  const char *s_end;
-  const char *inptr;
-  char *outptr;
-  size_t outsize;
-  /* Avoid glibc-2.1 bug.  */
-# if !defined _LIBICONV_VERSION && (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1)
-  const size_t extra = 1;
-# else
-  const size_t extra = 0;
-# endif
-
-  for (i = 0; i < n; i++)
-    offtable[i] = (size_t)(-1);
-
-  s_end = s + n;
-  inptr = s;
-  outptr = t;
-  outsize = m + extra;
-  while (inptr < s_end)
-    {
-      const char *saved_inptr;
-      size_t insize;
-      size_t res;
-
-      offtable[inptr - s] = outptr - t;
-
-      saved_inptr = inptr;
-      res = (size_t)(-1);
-      for (insize = 1; inptr + insize <= s_end; insize++)
-	{
-	  res = iconv (cd, (ICONV_CONST char **) &inptr, &insize, &outptr, &outsize);
-	  if (!(res == (size_t)(-1) && errno == EINVAL))
-	    break;
-	  /* We expect that no input bytes have been consumed so far.  */
-	  if (inptr != saved_inptr)
-	    abort ();
-	}
-      /* After we verified the convertibility and computed the translation's
-	 size m, there shouldn't be any conversion error here. */
-      if (res == (size_t)(-1)
-# if !defined _LIBICONV_VERSION && !defined __GLIBC__
-	  /* Irix iconv() inserts a NUL byte if it cannot convert.
-	     NetBSD iconv() inserts a question mark if it cannot convert.
-	     Only GNU libiconv and GNU libc are known to prefer to fail rather
-	     than doing a lossy conversion.  */
-	  || res > 0
-# endif
-	 )
-	abort ();
-    }
-  /* Avoid glibc-2.1 bug and Solaris 7 bug.  */
-# if defined _LIBICONV_VERSION \
-     || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
-  if (iconv (cd, NULL, NULL, &outptr, &outsize) == (size_t)(-1))
-    abort ();
-# endif
-  /* We should have produced exactly m output bytes.  */
-  if (outsize != extra)
-    abort ();
-}
-
-#endif /* HAVE_ICONV */
-
 #if C_CTYPE_ASCII
 
 /* Tests whether a string is entirely ASCII.  Returns 1 if yes.
--- a/lib/unilbrk/ulc-common.h
+++ b/lib/unilbrk/ulc-common.h
@@ -23,22 +23,6 @@
 #define is_utf8_encoding unilbrk_is_utf8_encoding
 extern int is_utf8_encoding (const char *encoding);
 
-#if HAVE_ICONV
-
-# include <iconv.h>
-
-/* Luckily, the encoding's name is platform independent.  */
-# define UTF8_NAME "UTF-8"
-
-/* Return the length of a string after conversion through an iconv_t.  */
-# define iconv_string_length unilbrk_iconv_string_length
-extern size_t iconv_string_length (iconv_t cd, const char *s, size_t n);
-
-# define iconv_string_keeping_offsets unilbrk_iconv_string_keeping_offsets
-extern void iconv_string_keeping_offsets (iconv_t cd, const char *s, size_t n, size_t *offtable, char *t, size_t m);
-
-#endif /* HAVE_ICONV */
-
 #if C_CTYPE_ASCII
 
 # define is_all_ascii unilbrk_is_all_ascii
--- a/lib/unilbrk/ulc-possible-linebreaks.c
+++ b/lib/unilbrk/ulc-possible-linebreaks.c
@@ -21,13 +21,10 @@
 #include "unilbrk.h"
 
 #include <stdlib.h>
-#if HAVE_ICONV
-# include <iconv.h>
-#endif
+#include <string.h>
 
 #include "c-ctype.h"
-#include "streq.h"
-#include "xsize.h"
+#include "uniconv.h"
 #include "unilbrk/ulc-common.h"
 
 /* Line breaking of a string in an arbitrary encoding.
@@ -47,92 +44,73 @@
 ulc_possible_linebreaks (const char *s, size_t n, const char *encoding,
 			 char *p)
 {
-  if (n == 0)
-    return;
-  if (is_utf8_encoding (encoding))
-    u8_possible_linebreaks ((const uint8_t *) s, n, encoding, p);
-  else
+  if (n > 0)
     {
-#if HAVE_ICONV
-      iconv_t to_utf8;
-      /* Avoid glibc-2.1 bug with EUC-KR.  */
-# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
-      if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
-	to_utf8 = (iconv_t)(-1);
+      if (is_utf8_encoding (encoding))
+	u8_possible_linebreaks ((const uint8_t *) s, n, encoding, p);
       else
-# endif
-      /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
-	 GB18030.  */
-# if defined __sun && !defined _LIBICONV_VERSION
-      if (   STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
-	  || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
-	  || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
-	  || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
-	  || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
-	  || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
-	to_utf8 = (iconv_t)(-1);
-      else
-# endif
-      to_utf8 = iconv_open (UTF8_NAME, encoding);
-      if (to_utf8 != (iconv_t)(-1))
 	{
-	  /* Determine the length of the resulting UTF-8 string.  */
-	  size_t m = iconv_string_length (to_utf8, s, n);
-	  if (m != (size_t)(-1))
+	  /* Convert the string to UTF-8 and build a translation table
+	     from offsets into s to offsets into the translated string.  */
+	  size_t *offsets = (size_t *) malloc (n * sizeof (size_t));
+
+	  if (offsets != NULL)
 	    {
-	      /* Convert the string to UTF-8 and build a translation table
-		 from offsets into s to offsets into the translated string.  */
-	      size_t memory_size = xsum3 (xtimes (n, sizeof (size_t)), m, m);
-	      char *memory =
-		(size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
-	      if (memory != NULL)
+	      uint8_t *t = NULL;
+	      size_t m;
+	      if (u8_conv_from_encoding (encoding, iconveh_question_mark,
+					 s, n, offsets, &t, &m)
+		  == 0)
 		{
-		  size_t *offtable = (size_t *) memory;
-		  char *t = (char *) (offtable + n);
-		  char *q = (char *) (t + m);
-		  size_t i;
+		  char *q = (char *) malloc (m);
+
+		  if (q != NULL)
+		    {
+		      size_t i;
 
-		  iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
+		      /* Determine the possible line breaks of the UTF-8
+			 string.  */
+		      u8_possible_linebreaks (t, m, encoding, q);
 
-		  /* Determine the possible line breaks of the UTF-8 string.  */
-		  u8_possible_linebreaks ((const uint8_t *) t, m, encoding, q);
+		      /* Translate the result back to the original string.  */
+		      memset (p, UC_BREAK_PROHIBITED, n);
+		      for (i = 0; i < n; i++)
+			if (offsets[i] != (size_t)(-1))
+			  p[i] = q[offsets[i]];
 
-		  /* Translate the result back to the original string.  */
-		  memset (p, UC_BREAK_PROHIBITED, n);
-		  for (i = 0; i < n; i++)
-		    if (offtable[i] != (size_t)(-1))
-		      p[i] = q[offtable[i]];
+		      free (q);
+		      free (t);
+		      free (offsets);
+		      return;
+		    }
+		  free (t);
+		}
+	      free (offsets);
+	    }
 
-		  free (memory);
-		  iconv_close (to_utf8);
-		  return;
-		}
+	  /* Impossible to convert.  */
+#if C_CTYPE_ASCII
+	  if (is_all_ascii (s, n))
+	    {
+	      /* ASCII is a subset of UTF-8.  */
+	      u8_possible_linebreaks ((const uint8_t *) s, n, encoding, p);
+	      return;
 	    }
-	  iconv_close (to_utf8);
+#endif
+	  /* We have a non-ASCII string and cannot convert it.
+	     Don't produce line breaks except those already present in the
+	     input string.  All we assume here is that the encoding is
+	     minimally ASCII compatible.  */
+	  {
+	    const char *s_end = s + n;
+	    while (s < s_end)
+	      {
+		*p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
+		s++;
+		p++;
+	      }
+	  }
 	}
-#endif
-      /* Impossible to convert.  */
-#if C_CTYPE_ASCII
-      if (is_all_ascii (s, n))
-	{
-	  /* ASCII is a subset of UTF-8.  */
-	  u8_possible_linebreaks ((const uint8_t *) s, n, encoding, p);
-	  return;
-	}
-#endif
-      /* We have a non-ASCII string and cannot convert it.
-	 Don't produce line breaks except those already present in the
-	 input string.  All we assume here is that the encoding is
-	 minimally ASCII compatible.  */
-      {
-	const char *s_end = s + n;
-	while (s < s_end)
-	  {
-	    *p = (*s == '\n' ? UC_BREAK_MANDATORY : UC_BREAK_PROHIBITED);
-	    s++;
-	    p++;
-	  }
-      }
     }
 }
 
--- a/lib/unilbrk/ulc-width-linebreaks.c
+++ b/lib/unilbrk/ulc-width-linebreaks.c
@@ -22,13 +22,9 @@
 
 #include <stdlib.h>
 #include <string.h>
-#if HAVE_ICONV
-# include <iconv.h>
-#endif
 
 #include "c-ctype.h"
-#include "streq.h"
-#include "xsize.h"
+#include "uniconv.h"
 #include "unilbrk/ulc-common.h"
 
 /* Line breaking of a string in an arbitrary encoding.
@@ -50,113 +46,90 @@
 		      const char *o, const char *encoding,
 		      char *p)
 {
-  if (n == 0)
-    return start_column;
-  if (is_utf8_encoding (encoding))
-    return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
-  else
+  if (n > 0)
     {
-#if HAVE_ICONV
-      iconv_t to_utf8;
-      /* Avoid glibc-2.1 bug with EUC-KR.  */
-# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
-      if (STREQ (encoding, "EUC-KR", 'E', 'U', 'C', '-', 'K', 'R', 0, 0, 0))
-	to_utf8 = (iconv_t)(-1);
-      else
-# endif
-      /* Avoid Solaris 9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS, GBK,
-	 GB18030.  */
-# if defined __sun && !defined _LIBICONV_VERSION
-      if (   STREQ (encoding, "GB2312", 'G', 'B', '2', '3', '1', '2', 0, 0, 0)
-	  || STREQ (encoding, "EUC-TW", 'E', 'U', 'C', '-', 'T', 'W', 0, 0, 0)
-	  || STREQ (encoding, "BIG5", 'B', 'I', 'G', '5', 0, 0, 0, 0, 0)
-	  || STREQ (encoding, "BIG5-HKSCS", 'B', 'I', 'G', '5', '-', 'H', 'K', 'S', 'C')
-	  || STREQ (encoding, "GBK", 'G', 'B', 'K', 0, 0, 0, 0, 0, 0)
-	  || STREQ (encoding, "GB18030", 'G', 'B', '1', '8', '0', '3', '0', 0, 0))
-	to_utf8 = (iconv_t)(-1);
+      if (is_utf8_encoding (encoding))
+	return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
       else
-# endif
-      to_utf8 = iconv_open (UTF8_NAME, encoding);
-      if (to_utf8 != (iconv_t)(-1))
 	{
-	  /* Determine the length of the resulting UTF-8 string.  */
-	  size_t m = iconv_string_length (to_utf8, s, n);
-	  if (m != (size_t)(-1))
+	  /* Convert the string to UTF-8 and build a translation table
+	     from offsets into s to offsets into the translated string.  */
+	  size_t *offsets = (size_t *) malloc (n * sizeof (size_t));
+
+	  if (offsets != NULL)
 	    {
-	      /* Convert the string to UTF-8 and build a translation table
-		 from offsets into s to offsets into the translated string.  */
-	      size_t memory_size =
-		xsum4 (xtimes (n, sizeof (size_t)), m, m,
-		       (o != NULL ? m : 0));
-	      char *memory =
-		(char *)
-		(size_in_bounds_p (memory_size) ? malloc (memory_size) : NULL);
-	      if (memory != NULL)
+	      uint8_t *t = NULL;
+	      size_t m;
+	      if (u8_conv_from_encoding (encoding, iconveh_question_mark,
+					 s, n, offsets, &t, &m)
+		  == 0)
 		{
-		  size_t *offtable = (size_t *) memory;
-		  char *t = (char *) (offtable + n);
-		  char *q = (char *) (t + m);
-		  char *o8 = (o != NULL ? (char *) (q + m) : NULL);
-		  int res_column;
-		  size_t i;
+		  char *memory = (char *) malloc (m + (o != NULL ? m : 0));
+
+		  if (memory != NULL)
+		    {
+		      char *q = (char *) memory;
+		      char *o8 = (o != NULL ? (char *) (q + m) : NULL);
+		      int res_column;
+		      size_t i;
 
-		  iconv_string_keeping_offsets (to_utf8, s, n, offtable, t, m);
+		      /* Translate the overrides to the UTF-8 string.  */
+		      if (o != NULL)
+			{
+			  memset (o8, UC_BREAK_UNDEFINED, m);
+			  for (i = 0; i < n; i++)
+			    if (offsets[i] != (size_t)(-1))
+			      o8[offsets[i]] = o[i];
+			}
+
+		      /* Determine the line breaks of the UTF-8 string.  */
+		      res_column =
+			u8_width_linebreaks (t, m, width, start_column, at_end_columns, o8, encoding, q);
 
-		  /* Translate the overrides to the UTF-8 string.  */
-		  if (o != NULL)
-		    {
-		      memset (o8, UC_BREAK_UNDEFINED, m);
+		      /* Translate the result back to the original string.  */
+		      memset (p, UC_BREAK_PROHIBITED, n);
 		      for (i = 0; i < n; i++)
-			if (offtable[i] != (size_t)(-1))
-			  o8[offtable[i]] = o[i];
-		    }
-
-		  /* Determine the line breaks of the UTF-8 string.  */
-		  res_column =
-		    u8_width_linebreaks ((const uint8_t *) t, m, width, start_column, at_end_columns, o8, encoding, q);
+			if (offsets[i] != (size_t)(-1))
+			  p[i] = q[offsets[i]];
 
-		  /* Translate the result back to the original string.  */
-		  memset (p, UC_BREAK_PROHIBITED, n);
-		  for (i = 0; i < n; i++)
-		    if (offtable[i] != (size_t)(-1))
-		      p[i] = q[offtable[i]];
-
-		  free (memory);
-		  iconv_close (to_utf8);
-		  return res_column;
+		      free (memory);
+		      free (t);
+		      free (offsets);
+		      return res_column;
+		    }
+		  free (t);
 		}
+	      free (offsets);
 	    }
-	  iconv_close (to_utf8);
-	}
+	  /* Impossible to convert.  */
+#if C_CTYPE_ASCII
+	  if (is_all_ascii (s, n))
+	    {
+	      /* ASCII is a subset of UTF-8.  */
+	      return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
+	    }
 #endif
-      /* Impossible to convert.  */
-#if C_CTYPE_ASCII
-      if (is_all_ascii (s, n))
-	{
-	  /* ASCII is a subset of UTF-8.  */
-	  return u8_width_linebreaks ((const uint8_t *) s, n, width, start_column, at_end_columns, o, encoding, p);
+	  /* We have a non-ASCII string and cannot convert it.
+	     Don't produce line breaks except those already present in the
+	     input string.  All we assume here is that the encoding is
+	     minimally ASCII compatible.  */
+	  {
+	    const char *s_end = s + n;
+	    while (s < s_end)
+	      {
+		*p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
+		      ? UC_BREAK_MANDATORY
+		      : UC_BREAK_PROHIBITED);
+		s++;
+		p++;
+		if (o != NULL)
+		  o++;
+	      }
+	    /* We cannot compute widths in this case.  */
+	  }
 	}
-#endif
-      /* We have a non-ASCII string and cannot convert it.
-	 Don't produce line breaks except those already present in the
-	 input string.  All we assume here is that the encoding is
-	 minimally ASCII compatible.  */
-      {
-	const char *s_end = s + n;
-	while (s < s_end)
-	  {
-	    *p = ((o != NULL && *o == UC_BREAK_MANDATORY) || *s == '\n'
-		  ? UC_BREAK_MANDATORY
-		  : UC_BREAK_PROHIBITED);
-	    s++;
-	    p++;
-	    if (o != NULL)
-	      o++;
-	  }
-	/* We cannot compute widths in this case.  */
-	return start_column;
-      }
     }
+  return start_column;
 }
 
 
--- a/modules/unilbrk/ulc-common
+++ b/modules/unilbrk/ulc-common
@@ -7,7 +7,6 @@
 
 Depends-on:
 c-ctype
-iconv
 streq
 
 configure.ac:
--- a/modules/unilbrk/ulc-possible-linebreaks
+++ b/modules/unilbrk/ulc-possible-linebreaks
@@ -8,10 +8,8 @@
 unilbrk/base
 unilbrk/u8-possible-linebreaks
 unilbrk/ulc-common
+uniconv/u8-conv-from-enc
 c-ctype
-iconv_open
-streq
-xsize
 
 configure.ac:
 
--- a/modules/unilbrk/ulc-width-linebreaks
+++ b/modules/unilbrk/ulc-width-linebreaks
@@ -8,10 +8,8 @@
 unilbrk/base
 unilbrk/u8-width-linebreaks
 unilbrk/ulc-common
+uniconv/u8-conv-from-enc
 c-ctype
-iconv_open
-streq
-xsize
 
 configure.ac: