changeset 14410:0a972f366396

regex-quote: New API. * lib/regex-quote.h: Include <stdbool.h>. (struct regex_quote_spec): New type. (regex_quote_spec_posix, regex_quote_spec_gnu, regex_quote_spec_pcre): New declarations. (regex_quote_length, regex_quote_copy, regex_quote): Take a 'const struct regex_quote_spec *' argument. * lib/regex-quote.c (RE_*, PCRE_*): New macros. (pcre_special): New constant. (regex_quote_spec_posix, regex_quote_spec_gnu, regex_quote_spec_pcre): New functions. (regex_quote_length, regex_quote_copy, regex_quote): Take a 'const struct regex_quote_spec *' argument. * modules/regex-quote (Depends-on): Add stdbool. * tests/test-regex-quote.c (check): Update for new API. Add test for anchored results. * NEWS: Mention the API change. Reported by Reuben Thomas and Eric Blake.
author Bruno Haible <bruno@clisp.org>
date Tue, 08 Mar 2011 10:09:47 +0100
parents ee532a615968
children 6e0c19bf3f95
files ChangeLog NEWS lib/regex-quote.c lib/regex-quote.h modules/regex-quote tests/test-regex-quote.c
diffstat 6 files changed, 264 insertions(+), 42 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,24 @@
+2011-03-08  Bruno Haible  <bruno@clisp.org>
+
+	regex-quote: New API.
+	* lib/regex-quote.h: Include <stdbool.h>.
+	(struct regex_quote_spec): New type.
+	(regex_quote_spec_posix, regex_quote_spec_gnu, regex_quote_spec_pcre):
+	New declarations.
+	(regex_quote_length, regex_quote_copy, regex_quote): Take a
+	'const struct regex_quote_spec *' argument.
+	* lib/regex-quote.c (RE_*, PCRE_*): New macros.
+	(pcre_special): New constant.
+	(regex_quote_spec_posix, regex_quote_spec_gnu, regex_quote_spec_pcre):
+	New functions.
+	(regex_quote_length, regex_quote_copy, regex_quote): Take a
+	'const struct regex_quote_spec *' argument.
+	* modules/regex-quote (Depends-on): Add stdbool.
+	* tests/test-regex-quote.c (check): Update for new API. Add test for
+	anchored results.
+	* NEWS: Mention the API change.
+	Reported by Reuben Thomas and Eric Blake.
+
 2011-03-06  Bruno Haible  <bruno@clisp.org>
 
 	regex-quote: Fix creation of POSIX extended regular expressions.
--- a/NEWS
+++ b/NEWS
@@ -12,6 +12,10 @@
 
 Date        Modules         Changes
 
+2011-03-08  regex-quote     The last argument is no longer an 'int cflags'
+                            but instead a pointer to a previously constructed
+                            'struct regex_quote_spec'.
+
 2011-02-25  dirname         These modules no longer put #defines for the
             dirname-lgpl    following symbols into <config.h>: ISSLASH,
             backupfile      FILE_SYSTEM_ACCEPTS_DRIVE_LETTER_PREFIX,
--- a/lib/regex-quote.c
+++ b/lib/regex-quote.c
@@ -31,56 +31,186 @@
 /* Characters that are special in an ERE.  */
 static const char ere_special[] = "$^.*[]\\+?{}()|";
 
-size_t
-regex_quote_length (const char *string, int cflags)
+struct regex_quote_spec
+regex_quote_spec_posix (int cflags, bool anchored)
+{
+  struct regex_quote_spec result;
+
+  strcpy (result.special, cflags != 0 ? ere_special : bre_special);
+  result.multibyte = true;
+  result.anchored = anchored;
+
+  return result;
+}
+
+/* Syntax bit values, defined in GNU <regex.h>.  We don't include it here,
+   otherwise this module would need to depend on gnulib module 'regex'.  */
+#define RE_BK_PLUS_QM    0x00000002
+#define RE_INTERVALS     0x00000200
+#define RE_LIMITED_OPS   0x00000400
+#define RE_NEWLINE_ALT   0x00000800
+#define RE_NO_BK_BRACES  0x00001000
+#define RE_NO_BK_PARENS  0x00002000
+#define RE_NO_BK_VBAR    0x00008000
+
+struct regex_quote_spec
+regex_quote_spec_gnu (unsigned long /*reg_syntax_t*/ syntax, bool anchored)
 {
-  const char *special = (cflags != 0 ? ere_special : bre_special);
+  struct regex_quote_spec result;
+  char *p;
+
+  p = result.special;
+  memcpy (p, bre_special, sizeof (bre_special) - 1);
+  p += sizeof (bre_special) - 1;
+  if ((syntax & RE_LIMITED_OPS) == 0 && (syntax & RE_BK_PLUS_QM) == 0)
+    {
+      *p++ = '+';
+      *p++ = '?';
+    }
+  if ((syntax & RE_INTERVALS) != 0 && (syntax & RE_NO_BK_BRACES) != 0)
+    {
+      *p++ = '{';
+      *p++ = '}';
+    }
+  if ((syntax & RE_NO_BK_PARENS) != 0)
+    {
+      *p++ = '(';
+      *p++ = ')';
+    }
+  if ((syntax & RE_LIMITED_OPS) == 0 && (syntax & RE_NO_BK_VBAR) != 0)
+    *p++ = '|';
+  if ((syntax & RE_NEWLINE_ALT) != 0)
+    *p++ = '\n';
+  *p = '\0';
+
+  result.multibyte = true;
+  result.anchored = anchored;
+
+  return result;
+}
+
+/* Characters that are special in a PCRE.  */
+static const char pcre_special[] = "$^.*[]\\+?{}()|";
+
+/* Options bit values, defined in <pcre.h>.  We don't include it here, because
+   it is not a standard header.  */
+#define PCRE_ANCHORED 0x00000010
+#define PCRE_EXTENDED 0x00000008
+
+struct regex_quote_spec
+regex_quote_spec_pcre (int options, bool anchored)
+{
+  struct regex_quote_spec result;
+  char *p;
+
+  p = result.special;
+  memcpy (p, pcre_special, sizeof (pcre_special) - 1);
+  p += sizeof (pcre_special) - 1;
+  if (options & PCRE_EXTENDED) /* also escape whitespace and '#' */
+    {
+      *p++ = ' ';
+      *p++ = '\t';
+      *p++ = '\n';
+      *p++ = '\v';
+      *p++ = '\f';
+      *p++ = '\r';
+      *p++ = '#';
+    }
+  *p = '\0';
+
+  /* PCRE regular expressions consist of UTF-8 characters if options contains
+     PCRE_UTF8 and of single bytes otherwise.  */
+  result.multibyte = false;
+  /* If options contains PCRE_ANCHORED, the anchoring is implicit.  */
+  result.anchored = (options & PCRE_ANCHORED ? 0 : anchored);
+
+  return result;
+}
+
+size_t
+regex_quote_length (const char *string, const struct regex_quote_spec *spec)
+{
+  const char *special = spec->special;
   size_t length;
-  mbui_iterator_t iter;
 
   length = 0;
-  for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter))
+  if (spec->anchored)
+    length += 2; /* for '^' at the beginning and '$' at the end */
+  if (spec->multibyte)
     {
-      /* We know that special contains only ASCII characters.  */
-      if (mb_len (mbui_cur (iter)) == 1
-          && strchr (special, * mbui_cur_ptr (iter)))
-        length += 1;
-      length += mb_len (mbui_cur (iter));
+      mbui_iterator_t iter;
+
+      for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter))
+        {
+          /* We know that special contains only ASCII characters.  */
+          if (mb_len (mbui_cur (iter)) == 1
+              && strchr (special, * mbui_cur_ptr (iter)))
+            length += 1;
+          length += mb_len (mbui_cur (iter));
+        }
     }
+  else
+    {
+      const char *iter;
+
+      for (iter = string; *iter != '\0'; iter++)
+        {
+          if (strchr (special, *iter))
+            length += 1;
+          length += 1;
+        }
+    }
+
   return length;
 }
 
-/* Copies the quoted string to p and returns the incremented p.
-   There must be room for regex_quote_length (string, cflags) + 1 bytes at p.
- */
 char *
-regex_quote_copy (char *p, const char *string, int cflags)
+regex_quote_copy (char *p, const char *string, const struct regex_quote_spec *spec)
 {
-  const char *special = (cflags != 0 ? ere_special : bre_special);
-  mbui_iterator_t iter;
+  const char *special = spec->special;
+
+  if (spec->anchored)
+    *p++ = '^';
+  if (spec->multibyte)
+    {
+      mbui_iterator_t iter;
 
-  for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter))
+      for (mbui_init (iter, string); mbui_avail (iter); mbui_advance (iter))
+        {
+          /* We know that special contains only ASCII characters.  */
+          if (mb_len (mbui_cur (iter)) == 1
+              && strchr (special, * mbui_cur_ptr (iter)))
+            *p++ = '\\';
+          memcpy (p, mbui_cur_ptr (iter), mb_len (mbui_cur (iter)));
+          p += mb_len (mbui_cur (iter));
+        }
+    }
+  else
     {
-      /* We know that special contains only ASCII characters.  */
-      if (mb_len (mbui_cur (iter)) == 1
-          && strchr (special, * mbui_cur_ptr (iter)))
-        *p++ = '\\';
-      memcpy (p, mbui_cur_ptr (iter), mb_len (mbui_cur (iter)));
-      p += mb_len (mbui_cur (iter));
+      const char *iter;
+
+      for (iter = string; *iter != '\0'; iter++)
+        {
+          if (strchr (special, *iter))
+            *p++ = '\\';
+          *p++ = *iter; /* iter is advanced by the for statement only */
+        }
     }
+  if (spec->anchored)
+    *p++ = '$';
+
   return p;
 }
 
-/* Returns the freshly allocated quoted string.  */
 char *
-regex_quote (const char *string, int cflags)
+regex_quote (const char *string, const struct regex_quote_spec *spec)
 {
-  size_t length = regex_quote_length (string, cflags);
+  size_t length = regex_quote_length (string, spec);
   char *result = XNMALLOC (length + 1, char);
   char *p;
 
   p = result;
-  p = regex_quote_copy (p, string, cflags);
+  p = regex_quote_copy (p, string, spec);
   *p = '\0';
   return result;
 }
--- a/lib/regex-quote.h
+++ b/lib/regex-quote.h
@@ -15,27 +15,74 @@
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
 
+#ifndef _REGEX_QUOTE_H
+#define _REGEX_QUOTE_H
+
 #include <stddef.h>
+#include <stdbool.h>
+
 
-/* regex_quote converts a literal string to a regular expression that will
-   look for this literal string.
-   cflags can be 0 or REG_EXTENDED.
+/* Specifies a quotation task for converting a fixed string to a regular
+   expression pattern.  */
+struct regex_quote_spec
+{
+  /* True if the regular expression pattern consists of multibyte characters
+     (in the encoding given by the LC_CTYPE category of the locale),
+     false if it consists of single bytes or UTF-8 characters.  */
+  unsigned int /*bool*/ multibyte : 1;
+  /* True if the regular expression pattern shall match only entire lines.  */
+  unsigned int /*bool*/ anchored : 1;
+  /* Set of characters that need to be escaped (all ASCII), as a
+     NUL-terminated string.  */
+  char special[30 + 1];
+};
+
+
+/* Creates a quotation task that produces a POSIX regular expression, that is,
+   a pattern that can be compiled with regcomp().
+   CFLAGS can be 0 or REG_EXTENDED.
    If it is 0, the result is a Basic Regular Expression (BRE)
    <http://www.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_03>.
    If it is REG_EXTENDED, the result is an Extended Regular Expression (ERE)
    <http://www.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04>.
-   The result is not anchored; if you want it to match only complete lines,
-   you need to add "^" at the beginning of the result and "$" at the end of the
-   result.
- */
+   If ANCHORED is false, the regular expression will match substrings of lines.
+   If ANCHORED is true, it will match only complete lines.  */
+extern struct regex_quote_spec
+       regex_quote_spec_posix (int cflags, bool anchored);
+
+/* Creates a quotation task that produces a regular expression that can be
+   compiled with the GNU API function re_compile_pattern().
+   SYNTAX describes the syntax of the regular expression (such as
+   RE_SYNTAX_POSIX_BASIC, RE_SYNTAX_POSIX_EXTENDED, RE_SYNTAX_EMACS, all
+   defined in <regex.h>).  It must be the same value as 're_syntax_options'
+   at the moment of the re_compile_pattern() call.
+   If ANCHORED is false, the regular expression will match substrings of lines.
+   If ANCHORED is true, it will match only complete lines.  */
+extern struct regex_quote_spec
+       regex_quote_spec_gnu (unsigned long /*reg_syntax_t*/ syntax, bool anchored);
+
+/* Creates a quotation task that produces a PCRE regular expression, that is,
+   a pattern that can be compiled with pcre_compile().
+   OPTIONS is the same value as the second argument passed to pcre_compile().
+   If ANCHORED is false, the regular expression will match substrings of lines.
+   If ANCHORED is true, it will match only complete lines.  */
+extern struct regex_quote_spec
+       regex_quote_spec_pcre (int options, bool anchored);
+
 
 /* Returns the number of bytes needed for the quoted string.  */
-extern size_t regex_quote_length (const char *string, int cflags);
+extern size_t
+       regex_quote_length (const char *string, const struct regex_quote_spec *spec);
 
 /* Copies the quoted string to p and returns the incremented p.
-   There must be room for regex_quote_length (string, cflags) + 1 bytes at p.
- */
-extern char * regex_quote_copy (char *p, const char *string, int cflags);
+   There must be room for regex_quote_length (string, spec) + 1 bytes at p.  */
+extern char *
+       regex_quote_copy (char *p,
+                         const char *string, const struct regex_quote_spec *spec);
 
 /* Returns the freshly allocated quoted string.  */
-extern char * regex_quote (const char *string, int cflags);
+extern char *
+       regex_quote (const char *string, const struct regex_quote_spec *spec);
+
+
+#endif /* _REGEX_QUOTE_H */
--- a/modules/regex-quote
+++ b/modules/regex-quote
@@ -6,6 +6,7 @@
 lib/regex-quote.c
 
 Depends-on:
+stdbool
 xalloc
 mbuiter
 
--- a/tests/test-regex-quote.c
+++ b/tests/test-regex-quote.c
@@ -29,18 +29,37 @@
 static void
 check (const char *literal, int cflags, const char *expected)
 {
+  struct regex_quote_spec spec;
   char *result;
   size_t length;
 
-  result = regex_quote (literal, cflags);
+  spec = regex_quote_spec_posix (cflags, false);
+  result = regex_quote (literal, &spec);
   ASSERT (strcmp (result, expected) == 0);
-  length = regex_quote_length (literal, cflags);
+  length = regex_quote_length (literal, &spec);
   ASSERT (length == strlen (result));
   free (result);
 
   result = (char *) xmalloc (1 + length + 1 + 1);
   result[0] = '^';
-  strcpy (regex_quote_copy (result + 1, literal, cflags), "$");
+  strcpy (regex_quote_copy (result + 1, literal, &spec), "$");
+  {
+    regex_t regex;
+    regmatch_t match[1];
+
+    ASSERT (regcomp (&regex, result, cflags) == 0);
+
+    ASSERT (regexec (&regex, literal, 1, match, 0) == 0);
+    ASSERT (match[0].rm_so == 0);
+    ASSERT (match[0].rm_eo == strlen (literal));
+    regfree (&regex);
+  }
+  free (result);
+
+  spec = regex_quote_spec_posix (cflags, true);
+  result = regex_quote (literal, &spec);
+  length = regex_quote_length (literal, &spec);
+  ASSERT (length == strlen (result));
   {
     regex_t regex;
     regmatch_t match[1];