changeset 2124:30abc8682bdf

Quote multibyte characters correctly. (ISGRAPH): Remove. (ISPRINT): New macro. (<wchar.h>): Include if HAVE_MBRTOWC && HAVE_WCHAR_H. (iswprint, mbrtowc, mbsinit, mbstate_t): New macros, defined if ! (HAVE_MBRTOWC && HAVE_WCHAR_H). (quotearg_buffer_restyled): New function, with most of the old quotearg_buffer's contents. Major rewrite to support multibyte characters. (quotearg_buffer): Now just calls quotearg_buffer_restyled.
author Jim Meyering <jim@meyering.net>
date Sat, 15 Jan 2000 11:57:11 +0000
parents 25058bed1548
children decad920f28f
files lib/quotearg.c
diffstat 1 files changed, 301 insertions(+), 164 deletions(-) [+]
line wrap: on
line diff
--- a/lib/quotearg.c
+++ b/lib/quotearg.c
@@ -1,5 +1,5 @@
 /* quotearg.c - quote arguments for output
-   Copyright (C) 1998, 1999 Free Software Foundation, Inc.
+   Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -17,8 +17,6 @@
 
 /* Written by Paul Eggert <eggert@twinsun.com> */
 
-/* FIXME: Multibyte characters are not supported yet.  */
-
 #if HAVE_CONFIG_H
 # include <config.h>
 #endif
@@ -33,11 +31,7 @@
 #else
 # define ISASCII(c) isascii (c)
 #endif
-#ifdef isgraph
-# define ISGRAPH(c) (ISASCII (c) && isgraph (c))
-#else
-# define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
-#endif
+#define ISPRINT(c) (ISASCII (c) && isprint (c))
 
 #if ENABLE_NLS
 # include <libintl.h>
@@ -64,6 +58,15 @@
 # include <string.h>
 #endif
 
+#if HAVE_MBRTOWC && HAVE_WCHAR_H
+# include <wchar.h> /* NOTE(review): ISO C declares iswprint in <wctype.h>; confirm <wchar.h> suffices on all targets.  */
+#else /* Unibyte fallback: the stubs make each byte look like a complete one-byte character that iswprint accepts.  */
+# define iswprint(wc) 1
+# define mbrtowc(pwc, s, n, ps) 1
+# define mbsinit(ps) 1
+# define mbstate_t int
+#endif
+
 #define INT_BITS (sizeof (int) * CHAR_BIT)
 
 struct quoting_options
@@ -71,7 +74,7 @@
   /* Basic quoting style.  */
   enum quoting_style style;
 
-  /* Quote the chararacters indicated by this bit vector even if the
+  /* Quote the characters indicated by this bit vector even if the
      quoting style would not normally require them to be quoted.  */
   int quote_these_too[((UCHAR_MAX + 1) / INT_BITS
 		       + ((UCHAR_MAX + 1) % INT_BITS != 0))];
@@ -89,7 +92,7 @@
   0
 };
 
-/* Correspondances to quoting style names.  */
+/* Correspondences to quoting style names.  */
 enum quoting_style const quoting_style_vals[] =
 {
   literal_quoting_style,
@@ -147,6 +150,292 @@
 }
 
 /* Place into buffer BUFFER (of size BUFFERSIZE) a quoted version of
+   argument ARG (of size ARGSIZE), using QUOTING_STYLE and the
+   non-quoting-style part of O to control quoting.
+   Terminate the output with a null character, and return the written
+   size of the output, not counting the terminating null.
+   If BUFFERSIZE is too small to store the output string, return the
+   value that would have been returned had BUFFERSIZE been large enough.
+   If ARGSIZE is -1, use the string length of the argument for ARGSIZE.
+
+   This function acts like quotearg_buffer (BUFFER, BUFFERSIZE, ARG,
+   ARGSIZE, O), except it uses QUOTING_STYLE instead of the quoting
+   style specified by O, and O may not be null.  */
+
+static size_t
+quotearg_buffer_restyled (char *buffer, size_t buffersize,
+			  char const *arg, size_t argsize,
+			  enum quoting_style quoting_style,
+			  struct quoting_options const *o)
+{
+  size_t i;
+  size_t len = 0;		/* Output length so far; may exceed BUFFERSIZE.  */
+  char const *quote_string = 0;	/* Closing quote to append, if any.  */
+  size_t quote_string_len = 0;
+  int backslash_escapes = 0;	/* Nonzero for the C, escape and locale styles.  */
+
+#define STORE(c) \
+    do \
+      { \
+	if (len < buffersize) \
+	  buffer[len] = (c); \
+	len++; \
+      } \
+    while (0)
+
+  switch (quoting_style)
+    {
+    case c_quoting_style:
+      STORE ('"');
+      backslash_escapes = 1;
+      quote_string = "\"";
+      quote_string_len = 1;
+      break;
+
+    case escape_quoting_style:
+      backslash_escapes = 1;
+      break;
+
+    case locale_quoting_style:
+      for (quote_string = _("`"); *quote_string; quote_string++)
+	STORE (*quote_string);
+      backslash_escapes = 1;
+      quote_string = _("'");
+      quote_string_len = strlen (quote_string);
+      break;
+
+    case shell_always_quoting_style:
+      STORE ('\'');
+      quote_string = "'";
+      quote_string_len = 1;
+      break;
+
+    default:
+      break;
+    }
+
+  for (i = 0;  ! (argsize == (size_t) -1 ? arg[i] == '\0' : i == argsize);  i++)
+    {
+      unsigned char c;
+      unsigned char esc;
+
+      /* strncmp, not memcmp: a NUL-terminated ARG may end before the quote.  */
+      if (backslash_escapes && quote_string_len
+	  && i + quote_string_len <= argsize
+	  && strncmp (arg + i, quote_string, quote_string_len) == 0)
+	STORE ('\\');
+
+      c = arg[i];
+      switch (c)
+	{
+	case '?':
+	  switch (quoting_style)
+	    {
+	    case shell_quoting_style:
+	      goto use_shell_always_quoting_style;
+
+	    case c_quoting_style:
+	      if (i + 2 < argsize && arg[i + 1] == '?')
+		switch (arg[i + 2])
+		  {
+		  case '!': case '\'':
+		  case '(': case ')': case '-': case '/':
+		  case '<': case '=': case '>':
+		    /* Escape the second '?' in what would otherwise be a
+		       trigraph; C becomes the trigraph's final byte.  */
+		    i += 2;
+		    c = arg[i];
+		    STORE ('?');
+		    STORE ('\\');
+		    STORE ('?');
+		    break;
+		  }
+	      break;
+
+	    default:
+	      break;
+	    }
+	  break;
+
+#if HAVE_C_BACKSLASH_A
+	case '\a': esc = 'a'; goto c_escape;
+#endif
+	case '\b': esc = 'b'; goto c_escape;
+	case '\f': esc = 'f'; goto c_escape;
+	case '\n': esc = 'n'; goto c_escape;
+	case '\r': esc = 'r'; goto c_escape;
+	case '\t': esc = 't'; goto c_escape;
+	case '\v': esc = 'v'; goto c_escape;
+	case '\\': esc = c; goto c_escape;
+
+	c_escape:
+	  if (backslash_escapes)
+	    {
+	      c = esc;
+	      goto store_escape;
+	    }
+	  if (quoting_style == shell_quoting_style)
+	    goto use_shell_always_quoting_style;
+	  break;
+
+	case '#': case '~':
+	  if (i != 0)
+	    break;
+	  /* Fall through.  */
+	case ' ':
+	case '!': /* special in bash */
+	case '"': case '$': case '&':
+	case '(': case ')': case '*': case ';':
+	case '<': case '>': case '[':
+	case '^': /* special in old /bin/sh, e.g. SunOS 4.1.4 */
+	case '`': case '|':
+	  /* A shell special character.  In theory, '$' and '`' could
+	     be the first bytes of multibyte characters, which means
+	     we should check them with mbrtowc, but in practice this
+	     doesn't happen so it's not worth worrying about.  */
+	  if (quoting_style == shell_quoting_style)
+	    goto use_shell_always_quoting_style;
+	  break;
+
+	case '\'':
+	  switch (quoting_style)
+	    {
+	    case shell_quoting_style:
+	      goto use_shell_always_quoting_style;
+
+	    case shell_always_quoting_style:
+	      STORE ('\'');
+	      STORE ('\\');
+	      STORE ('\'');
+	      break;
+
+	    default:
+	      break;
+	    }
+	  break;
+
+	case '%': case '+': case ',': case '-': case '.': case '/':
+	case '0': case '1': case '2': case '3': case '4': case '5':
+	case '6': case '7': case '8': case '9': case ':': case '=':
+	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
+	case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
+	case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
+	case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
+	case 'Y': case 'Z': case ']': case '_': case 'a': case 'b':
+	case 'c': case 'd': case 'e': case 'f': case 'g': case 'h':
+	case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
+	case 'o': case 'p': case 'q': case 'r': case 's': case 't':
+	case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
+	case '{': case '}':
+	  /* These characters don't cause problems, no matter what the
+	     quoting style is.  They cannot start multibyte sequences.  */
+	  break;
+
+	default:
+	  /* If we have a multibyte sequence, copy it until we reach
+	     its end, find an error, or come back to the initial shift
+	     state.  For C-like styles, if the sequence has
+	     unprintable characters, escape the whole sequence, since
+	     we can't easily escape single characters within it.  */
+	  {
+	    /* Length of multibyte sequence found so far.  */
+	    size_t m = 0;
+
+	    int printable = 1;
+	    mbstate_t mbstate;
+	    memset (&mbstate, 0, sizeof mbstate);
+
+	    if (argsize == (size_t) -1)
+	      argsize = strlen (arg);
+
+	    do
+	      {
+		wchar_t w;
+		size_t bytes = mbrtowc (&w, &arg[i + m],
+					argsize - (i + m), &mbstate);
+		if (bytes == 0)
+		  break;
+		else if (bytes == (size_t) -1)
+		  {
+		    printable = 0;
+		    break;
+		  }
+		else if (bytes == (size_t) -2)
+		  {
+		    printable = 0;
+		    while (i + m < argsize && arg[i + m])
+		      m++;
+		    break;
+		  }
+		else
+		  {
+		    if (! iswprint (w))
+		      printable = 0;
+		    m += bytes;
+		  }
+	      }
+	    while (! mbsinit (&mbstate));
+
+	    if (m <= 1)
+	      {
+		/* Escape a unibyte character like a multibyte
+		   sequence if using backslash escapes, and if the
+		   character is not printable.  */
+		m = backslash_escapes && ! ISPRINT (c);
+		printable = 0;
+	      }
+
+	    if (m)
+	      {
+		/* Output a multibyte sequence, or an escaped
+		   unprintable unibyte character.  */
+		size_t imax = i + m - 1;
+
+		for (;;)
+		  {
+		    if (backslash_escapes && ! printable)
+		      {
+			STORE ('\\');
+			STORE ('0' + (c >> 6));
+			STORE ('0' + ((c >> 3) & 7));
+			c = '0' + (c & 7);
+		      }
+		    if (i == imax)
+		      break;
+		    STORE (c);
+		    c = arg[++i];
+		  }
+
+		goto store_c;
+	      }
+	  }
+	}
+
+      if (! (backslash_escapes
+	     && o->quote_these_too[c / INT_BITS] & (1 << (c % INT_BITS))))
+	goto store_c;
+
+    store_escape:
+      STORE ('\\');
+
+    store_c:
+      STORE (c);
+    }
+
+  if (quote_string)
+    for (; *quote_string; quote_string++)
+      STORE (*quote_string);
+
+  if (len < buffersize)
+    buffer[len] = '\0';
+  return len;
+
+ use_shell_always_quoting_style:
+  return quotearg_buffer_restyled (buffer, buffersize, arg, argsize,
+				   shell_always_quoting_style, o);
+}
+
+/* Place into buffer BUFFER (of size BUFFERSIZE) a quoted version of
    argument ARG (of size ARGSIZE), using O to control quoting.
    If O is null, use the default.
    Terminate the output with a null character, and return the written
@@ -159,161 +448,9 @@
 		 char const *arg, size_t argsize,
 		 struct quoting_options const *o)
 {
-  unsigned char c;
-  size_t i;
-  size_t len = 0;
-  char const *quote_string;
-  size_t quote_string_len;
   struct quoting_options const *p = o ? o : &default_quoting_options;
-  enum quoting_style quoting_style = p->style;
-#define STORE(c) \
-    do \
-      { \
-	if (len < buffersize) \
-	  buffer[len] = (c); \
-	  len++; \
-      } \
-    while (0)
-
-  switch (quoting_style)
-    {
-    case shell_quoting_style:
-      if (! (argsize == (size_t) -1 ? arg[0] == '\0' : argsize == 0))
-	{
-	  switch (arg[0])
-	    {
-	    case '#': case '~':
-	      break;
-
-	    default:
-	      for (i = 0; ; i++)
-		{
-		  if (argsize == (size_t) -1 ? arg[i] == '\0' : i == argsize)
-		    goto done;
-
-		  c = arg[i];
-
-		  switch (c)
-		    {
-		    case '\t': case '\n': case ' ':
-		    case '!': /* special in csh */
-		    case '"': case '$': case '&': case '\'':
-		    case '(': case ')': case '*': case ';':
-		    case '<': case '>': case '?': case '[': case '\\':
-		    case '^': /* special in old /bin/sh, e.g. SunOS 4.1.4 */
-		    case '`': case '|':
-		      goto needs_quoting;
-		    }
-
-		  if (p->quote_these_too[c / INT_BITS] & (1 << (c % INT_BITS)))
-		    goto needs_quoting;
-
-		  STORE (c);
-		}
-	    needs_quoting:;
-
-	      len = 0;
-	      break;
-	    }
-	}
-      /* Fall through.  */
-
-    case shell_always_quoting_style:
-      STORE ('\'');
-      quote_string = "'";
-      quote_string_len = 1;
-      break;
-
-    case c_quoting_style:
-      STORE ('"');
-      quote_string = "\"";
-      quote_string_len = 1;
-      break;
-
-    case locale_quoting_style:
-      for (quote_string = _("`"); *quote_string; quote_string++)
-	STORE (*quote_string);
-      quote_string = _("'");
-      quote_string_len = strlen (quote_string);
-      break;
-
-    default:
-      quote_string = 0;
-      quote_string_len = 0;
-      break;
-    }
-
-  for (i = 0;  ! (argsize == (size_t) -1 ? arg[i] == '\0' : i == argsize);  i++)
-    {
-      c = arg[i];
-
-      switch (quoting_style)
-	{
-	case literal_quoting_style:
-	  break;
-
-	case shell_quoting_style:
-	case shell_always_quoting_style:
-	  if (c == '\'')
-	    {
-	      STORE ('\'');
-	      STORE ('\\');
-	      STORE ('\'');
-	    }
-	  break;
-
-	case c_quoting_style:
-	case escape_quoting_style:
-	case locale_quoting_style:
-	  switch (c)
-	    {
-	    case '?': /* Do not generate trigraphs.  */
-	    case '\\': goto store_escape;
-	      /* Not all C compilers know what \a means.  */
-	    case   7 : c = 'a'; goto store_escape;
-	    case '\b': c = 'b'; goto store_escape;
-	    case '\f': c = 'f'; goto store_escape;
-	    case '\n': c = 'n'; goto store_escape;
-	    case '\r': c = 'r'; goto store_escape;
-	    case '\t': c = 't'; goto store_escape;
-	    case '\v': c = 'v'; goto store_escape;
-
-	    case ' ': break;
-
-	    default:
-	      if (quote_string_len
-		  && strncmp (arg + i, quote_string, quote_string_len) == 0)
-		goto store_escape;
-	      if (!ISGRAPH (c))
-		{
-		  STORE ('\\');
-		  STORE ('0' + (c >> 6));
-		  STORE ('0' + ((c >> 3) & 7));
-		  c = '0' + (c & 7);
-		  goto store_c;
-		}
-	      break;
-	    }
-
-	  if (! (p->quote_these_too[c / INT_BITS] & (1 << (c % INT_BITS))))
-	    goto store_c;
-
-	store_escape:
-	  STORE ('\\');
-	}
-
-    store_c:
-      STORE (c);
-    }
-
-  if (quote_string)
-    for (; *quote_string; quote_string++)
-      STORE (*quote_string);
-
- done:
-  if (len < buffersize)
-    buffer[len] = '\0';
-  return len;
+  return quotearg_buffer_restyled (buffer, buffersize, arg, argsize,
+				   p->style, p);
 }
 
 /* Use storage slot N to return a quoted version of the string ARG.