changeset 15542:9db32cabeacf

Fix backslash handling in regexp pattern (Bug #37092) * NEWS: Give an example of how escape sequence processing in single-quoted regular expressions works. * libinterp/corefcn/regexp.cc(do_regexp_string_escapes): Rename to do_regexp_ptn_string_escapes. Only sequence to expand is '\b' for backspace. Others are handled by PCRE. * libinterp/corefcn/regexp.cc(do_regexp_rep_string_escapes): New function to do escape sequence processing for the replacement string since the sequences to expand differ from that of the regexp pattern. * liboctave/util/regexp.cc(regexp::replace): Process backslashes in replacement string so that '\$1' results in '$1' rather than replacement with first capture buffer.
author Rik <rik@octave.org>
date Wed, 17 Oct 2012 20:13:19 -0700
parents d8e3111b1890
children 0272bb60408a
files NEWS libinterp/corefcn/regexp.cc liboctave/util/regexp.cc
diffstat 3 files changed, 155 insertions(+), 83 deletions(-) [+]
line wrap: on
line diff
--- a/NEWS
+++ b/NEWS
@@ -56,9 +56,16 @@
     where the pattern is actually the assertion '^' or start-of-line.
 
  ** For compatibility with Matlab, the regexp, regexpi, and regexprep
-    functions now process backslash escapes in single-quoted pattern
+    functions now process backslash escape sequences in single-quoted pattern
     strings.  In addition, the regexprep function now processes backslash
-    escapes in single-quoted replacement strings.
+    escapes in single-quoted replacement strings.  For example,
+    
+    regexprep (str, '\t', '\n')
+
+    would search the variable str for a TAB character (escape sequence \t)
+    and replace it with a NEWLINE (escape sequence \n).  Previously the
+    expression would have searched for a literal '\' followed by 't' and
+    replaced the two characters with the sequence '\', 'n'.
 
  ** Redundant terminal comma accepted by parser
 
--- a/libinterp/corefcn/regexp.cc
+++ b/libinterp/corefcn/regexp.cc
@@ -45,12 +45,12 @@
 #include "utils.h"
 
 // Replace backslash escapes in a string with the real values.  We need
-// this special function instead of the one in utils.cc because the set
-// of escape sequences used in regexps is different from those used in
-// the *printf functions.
+// two special functions instead of the one in utils.cc because the set
+// of escape sequences used for regexp patterns and replacement strings
+// is different from those used in the *printf functions.
 
 static std::string
-do_regexp_string_escapes (const std::string& s)
+do_regexp_ptn_string_escapes (const std::string& s)
 {
   std::string retval;
 
@@ -66,11 +66,56 @@
         {
           switch (s[++j])
             {
-            case '$':
-              retval[i] = '$';
+            case 'b': // backspace
+              retval[i] = '\b';
               break;
 
-            case 'a':
+#if 0
+// FIXME : To be complete, we need to handle \oN, \o{N}.
+//         The PCRE library already handles \N where N
+//         is an octal number.  New code needs to merely
+//         replace \oN or \o{N} with \N.
+            case 'o': // octal number
+#endif
+
+            default:  // pass escape sequence through
+              retval[i] = '\\';
+              retval[++i] = s[j];
+              break;
+            }
+        }
+      else
+        {
+          retval[i] = s[j];
+        }
+
+      i++;
+      j++;
+    }
+
+  retval.resize (i);
+
+  return retval;
+}
+
+static std::string
+do_regexp_rep_string_escapes (const std::string& s)
+{
+  std::string retval;
+
+  size_t i = 0;
+  size_t j = 0;
+  size_t len = s.length ();
+
+  retval.resize (len);
+
+  while (j < len)
+    {
+      if (s[j] == '\\' && j+1 < len)
+        {
+          switch (s[++j])
+            {
+            case 'a': // alarm
               retval[i] = '\a';
               break;
 
@@ -98,10 +143,6 @@
               retval[i] = '\v';
               break;
 
-            case '\\': // backslash
-              retval[i] = '\\';
-              break;
-
 #if 0
 // FIXME -- to be complete, we need to handle \oN, \o{N}, \xN, and
 // \x{N}.  Hex digits may be upper or lower case.  Brackets are
@@ -110,8 +151,8 @@
             case 'o': // octal number
             case 'x': // hex number
 #endif
-
-            default:
+ 
+            default:  // pass escape sequence through
               retval[i] = '\\';
               retval[++i] = s[j];
               break;
@@ -205,7 +246,7 @@
     return retval;
   // Matlab compatibility.
   if (args(1).is_sq_string ())
-    pattern = do_regexp_string_escapes (pattern);
+    pattern = do_regexp_ptn_string_escapes (pattern);
 
   regexp::opts options;
   options.case_insensitive (case_insensitive);
@@ -1196,14 +1237,14 @@
     return retval;
   // Matlab compatibility.
   if (args(1).is_sq_string ())
-    pattern = do_regexp_string_escapes (pattern);
+    pattern = do_regexp_ptn_string_escapes (pattern);
 
   std::string replacement = args(2).string_value ();
   if (error_state)
     return retval;
   // Matlab compatibility.
   if (args(2).is_sq_string ())
-    replacement = do_regexp_string_escapes (replacement);
+    replacement = do_regexp_rep_string_escapes (replacement);
 
   // Pack options excluding 'tokenize' and various output
   // reordering strings into regexp arg list
--- a/liboctave/util/regexp.cc
+++ b/liboctave/util/regexp.cc
@@ -446,55 +446,80 @@
   return retval;
 }
 
+// Declare rep_token_t used in processing replacement string
+typedef struct
+  {
+    size_t pos;
+    int num;
+  } rep_token_t;
+
+
 std::string
 regexp::replace (const std::string& buffer, const std::string& replacement)
 {
   std::string retval;
 
+  regexp::match_data rx_lst = match (buffer);
+
+  size_t num_matches = rx_lst.size ();
+
+  if (num_matches == 0)
+    {
+      retval = buffer;
+      return retval;
+    }
+
   // Identify replacement tokens; build a vector of group numbers in
   // the replacement string so that we can quickly calculate the size
   // of the replacement.
 
-  int tokens = 0;
-  for (size_t i=1; i < replacement.size (); i++)
+  // FIXME: All code assumes that only 10 tokens ($0-$9) exist.
+  //        $11 represents $1 followed by the character '1' rather than
+  //        the eleventh capture buffer.
+
+  std::string repstr = replacement;
+  std::vector<rep_token_t> tokens;
+  tokens.reserve (5);  // Reserve memory for 5 pattern replacements
+
+  for (size_t i=0; i < repstr.size (); i++)
     {
-      if (replacement[i-1]=='$' && isdigit (replacement[i]))
+      if (repstr[i] == '\\')
         {
-          tokens++;
-          i++;
+          if (i < repstr.size () - 1 && repstr[i+1] == '$')
+            {
+              repstr.erase (i,1);  // erase backslash
+              i++;                 // skip over '$'
+              continue;
+            }
+          if (i < repstr.size () - 1 && repstr[i+1] == '\\')
+            {
+              repstr.erase (i,1);  // erase 1st backslash
+              continue;
+            }
         }
-    }
-  std::vector<int> token (tokens);
+      else if (repstr[i] == '$')
+        {
+          if (i < repstr.size () - 1 && isdigit (repstr[i+1]))
+            {
+              rep_token_t tmp_token;
 
-  int kk = 0;
-  for (size_t i = 1; i < replacement.size (); i++)
-    {
-      if (replacement[i-1]=='$' && isdigit (replacement[i]))
-        {
-          token[kk++] = replacement[i]-'0';
-          i++;
+              tmp_token.pos = i;
+              tmp_token.num = repstr[i+1]-'0';
+              tokens.push_back (tmp_token);
+            }
         }
     }
 
-  regexp::match_data rx_lst = match (buffer);
-
-  size_t sz = rx_lst.size ();
+  std::string rep;
+  int num_tokens = tokens.size ();
 
-  if (sz == 0)
-    {
-      retval = buffer;
-      return retval;
-    }
-
-  std::string rep;
-
-  if (tokens > 0)
+  if (num_tokens > 0)
     {
       // Determine replacement length
-      const size_t replen = replacement.size () - 2*tokens;
+      const size_t replen = repstr.size () - 2*num_tokens;
       int delta = 0;
       regexp::match_data::const_iterator p = rx_lst.begin ();
-      for (size_t i = 0; i < sz; i++)
+      for (size_t i = 0; i < num_matches; i++)
         {
           OCTAVE_QUIT;
 
@@ -503,13 +528,13 @@
 
           const Matrix pairs (p->token_extents ());
           size_t pairlen = 0;
-          for (int j = 0; j < tokens; j++)
+          for (int j = 0; j < num_tokens; j++)
             {
-              if (token[j] == 0)
+              if (tokens[j].num == 0)
                 pairlen += static_cast<size_t> (end - start) + 1;
-              else if (token[j] <= pairs.rows ())
-                pairlen += static_cast<size_t> (pairs(token[j]-1,1)
-                                                - pairs(token[j]-1,0)) + 1;
+              else if (tokens[j].num <= pairs.rows ())
+                pairlen += static_cast<size_t> (pairs(tokens[j].num-1,1)
+                                                - pairs(tokens[j].num-1,0)) + 1;
             }
           delta += (static_cast<int> (replen + pairlen)
                     - static_cast<int> (end - start + 1));
@@ -520,7 +545,7 @@
       rep.reserve (buffer.size () + delta);
       size_t from = 0;
       p = rx_lst.begin ();
-      for (size_t i = 0; i < sz; i++)
+      for (size_t i = 0; i < num_matches; i++)
         {
           OCTAVE_QUIT;
 
@@ -531,51 +556,50 @@
           rep.append (&buffer[from], static_cast<size_t> (start - 1) - from);
           from = static_cast<size_t> (end - 1) + 1;
 
-          for (size_t j = 1; j < replacement.size (); j++)
+          size_t cur_pos = 0;
+
+          for (int j = 0; j < num_tokens; j++)
             {
-              if (replacement[j-1]=='$' && isdigit (replacement[j]))
+              rep.append (&repstr[cur_pos], (tokens[j].pos) - cur_pos);
+              cur_pos = tokens[j].pos+2;
+
+              int k = tokens[j].num;
+              if (k == 0)
                 {
-                  int k = replacement[j]-'0';
-                  if (k == 0)
-                    {
-                      // replace with entire match
-                      rep.append (&buffer[static_cast<size_t> (end - 1)],
-                                  static_cast<size_t> (end - start) + 1);
-                    }
-                  else if (k <= pairs.rows ())
-                    {
-                      // replace with group capture
-                      rep.append (&buffer[static_cast<size_t> (pairs(k-1,0)-1)],
-                                  static_cast<size_t> (pairs(k-1,1)
-                                                       - pairs(k-1,0)) + 1);
-                    }
-                  else
-                    {
-                      // replace with nothing
-                    }
-                  j++;
+                  // replace with entire match
+                  rep.append (&buffer[static_cast<size_t> (end - 1)],
+                              static_cast<size_t> (end - start) + 1);
+                }
+              else if (k <= pairs.rows ())
+                {
+                  // replace with group capture
+                  rep.append (&buffer[static_cast<size_t> (pairs(k-1,0)-1)],
+                              static_cast<size_t> (pairs(k-1,1)
+                                                   - pairs(k-1,0)) + 1);
                 }
               else
-                rep.append (1, replacement[j-1]);
+                {
+                  // replace with nothing
+                }
+            }
+          if (cur_pos < repstr.size ())
+            rep.append (&repstr[cur_pos], repstr.size () - cur_pos);
 
-              if (j+1 == replacement.size ())
-                rep.append (1, replacement[j]);
-            }
           p++;
         }
       rep.append (&buffer[from], buffer.size () - from);
     }
   else
     {
-      // Determine replacement length
-      const size_t replen = replacement.size ();
+      // Determine repstr length
+      const size_t replen = repstr.size ();
       int delta = 0;
       regexp::match_data::const_iterator p = rx_lst.begin ();
-      for (size_t i = 0; i < sz; i++)
+      for (size_t i = 0; i < num_matches; i++)
         {
           OCTAVE_QUIT;
           delta += static_cast<int> (replen)
-            - static_cast<int> (p->end () - p->start () + 1);
+                   - static_cast<int> (p->end () - p->start () + 1);
           p++;
         }
 
@@ -583,13 +607,13 @@
       rep.reserve (buffer.size () + delta);
       size_t from = 0;
       p = rx_lst.begin ();
-      for (size_t i = 0; i < sz; i++)
+      for (size_t i = 0; i < num_matches; i++)
         {
           OCTAVE_QUIT;
           rep.append (&buffer[from],
                       static_cast<size_t> (p->start () - 1) - from);
           from = static_cast<size_t> (p->end () - 1) + 1;
-          rep.append (replacement);
+          rep.append (repstr);
           p++;
         }
       rep.append (&buffer[from], buffer.size () - from);