diff liboctave/util/regexp.cc @ 15541:9db32cabeacf

Fix backslash handling in regexp pattern (Bug #37092) * NEWS: Give an example of how escape sequence processing in single-quoted regular expressions works. * libinterp/corefcn/regexp.cc(do_regexp_string_escapes): Rename to do_regexp_ptn_string_escapes. Only sequence to expand is '\b' for backspace. Others are handled by PCRE. * libinterp/corefcn/regexp.cc(do_regexp_rep_string_escapes): New function to do escape sequence processing for the replacement string since the sequences to expand differ from that of the regexp pattern. * liboctave/util/regexp.cc(regexp::replace): Process backslashes in replacement string so that '\$1' results in '$1' rather than replacement with first capture buffer.
author Rik <rik@octave.org>
date Wed, 17 Oct 2012 20:13:19 -0700
parents 648dabbb4c6b
children 1e9a6285acc4
line wrap: on
line diff
--- a/liboctave/util/regexp.cc
+++ b/liboctave/util/regexp.cc
@@ -446,55 +446,80 @@
   return retval;
 }
 
+// rep_token_t: one "$N" reference found while scanning the replacement string
+typedef struct
+  {
+    size_t pos;  // index of the '$' within the processed replacement string
+    int num;     // digit following the '$'; 0 = entire match, else capture group
+  } rep_token_t;
+
+
 std::string
 regexp::replace (const std::string& buffer, const std::string& replacement)
 {
   std::string retval;
 
+  regexp::match_data rx_lst = match (buffer);
+
+  size_t num_matches = rx_lst.size ();
+
+  if (num_matches == 0)
+    {
+      retval = buffer;
+      return retval;
+    }
+
   // Identify replacement tokens; build a vector of group numbers in
   // the replacement string so that we can quickly calculate the size
   // of the replacement.
 
-  int tokens = 0;
-  for (size_t i=1; i < replacement.size (); i++)
+  // FIXME: All code assumes that only 10 tokens ($0-$9) exist.
+  //        $11 represents $1 followed by the character '1' rather than
+  //        the eleventh capture buffer.
+
+  std::string repstr = replacement;
+  std::vector<rep_token_t> tokens;
+  tokens.reserve (5);  // Reserve memory for 5 pattern replacements
+
+  for (size_t i=0; i < repstr.size (); i++)
     {
-      if (replacement[i-1]=='$' && isdigit (replacement[i]))
+      if (repstr[i] == '\\')
         {
-          tokens++;
-          i++;
+          if (i < repstr.size () - 1 && repstr[i+1] == '$')
+            {
+              repstr.erase (i,1);  // erase backslash
+              i++;                 // skip over '$'
+              continue;
+            }
+          if (i < repstr.size () - 1 && repstr[i+1] == '\\')
+            {
+              repstr.erase (i,1);  // erase 1st backslash
+              continue;
+            }
         }
-    }
-  std::vector<int> token (tokens);
+      else if (repstr[i] == '$')
+        {
+          if (i < repstr.size () - 1 && isdigit (repstr[i+1]))
+            {
+              rep_token_t tmp_token;
 
-  int kk = 0;
-  for (size_t i = 1; i < replacement.size (); i++)
-    {
-      if (replacement[i-1]=='$' && isdigit (replacement[i]))
-        {
-          token[kk++] = replacement[i]-'0';
-          i++;
+              tmp_token.pos = i;
+              tmp_token.num = repstr[i+1]-'0';
+              tokens.push_back (tmp_token);
+            }
         }
     }
 
-  regexp::match_data rx_lst = match (buffer);
-
-  size_t sz = rx_lst.size ();
+  std::string rep;
+  int num_tokens = tokens.size ();
 
-  if (sz == 0)
-    {
-      retval = buffer;
-      return retval;
-    }
-
-  std::string rep;
-
-  if (tokens > 0)
+  if (num_tokens > 0)
     {
       // Determine replacement length
-      const size_t replen = replacement.size () - 2*tokens;
+      const size_t replen = repstr.size () - 2*num_tokens;
       int delta = 0;
       regexp::match_data::const_iterator p = rx_lst.begin ();
-      for (size_t i = 0; i < sz; i++)
+      for (size_t i = 0; i < num_matches; i++)
         {
           OCTAVE_QUIT;
 
@@ -503,13 +528,13 @@
 
           const Matrix pairs (p->token_extents ());
           size_t pairlen = 0;
-          for (int j = 0; j < tokens; j++)
+          for (int j = 0; j < num_tokens; j++)
             {
-              if (token[j] == 0)
+              if (tokens[j].num == 0)
                 pairlen += static_cast<size_t> (end - start) + 1;
-              else if (token[j] <= pairs.rows ())
-                pairlen += static_cast<size_t> (pairs(token[j]-1,1)
-                                                - pairs(token[j]-1,0)) + 1;
+              else if (tokens[j].num <= pairs.rows ())
+                pairlen += static_cast<size_t> (pairs(tokens[j].num-1,1)
+                                                - pairs(tokens[j].num-1,0)) + 1;
             }
           delta += (static_cast<int> (replen + pairlen)
                     - static_cast<int> (end - start + 1));
@@ -520,7 +545,7 @@
       rep.reserve (buffer.size () + delta);
       size_t from = 0;
       p = rx_lst.begin ();
-      for (size_t i = 0; i < sz; i++)
+      for (size_t i = 0; i < num_matches; i++)
         {
           OCTAVE_QUIT;
 
@@ -531,51 +556,50 @@
           rep.append (&buffer[from], static_cast<size_t> (start - 1) - from);
           from = static_cast<size_t> (end - 1) + 1;
 
-          for (size_t j = 1; j < replacement.size (); j++)
+          size_t cur_pos = 0;
+
+          for (int j = 0; j < num_tokens; j++)
             {
-              if (replacement[j-1]=='$' && isdigit (replacement[j]))
+              rep.append (&repstr[cur_pos], (tokens[j].pos) - cur_pos);
+              cur_pos = tokens[j].pos+2;
+
+              int k = tokens[j].num;
+              if (k == 0)
                 {
-                  int k = replacement[j]-'0';
-                  if (k == 0)
-                    {
-                      // replace with entire match
-                      rep.append (&buffer[static_cast<size_t> (end - 1)],
-                                  static_cast<size_t> (end - start) + 1);
-                    }
-                  else if (k <= pairs.rows ())
-                    {
-                      // replace with group capture
-                      rep.append (&buffer[static_cast<size_t> (pairs(k-1,0)-1)],
-                                  static_cast<size_t> (pairs(k-1,1)
-                                                       - pairs(k-1,0)) + 1);
-                    }
-                  else
-                    {
-                      // replace with nothing
-                    }
-                  j++;
+                  // replace with entire match
+                  rep.append (&buffer[static_cast<size_t> (end - 1)],
+                              static_cast<size_t> (end - start) + 1);
+                }
+              else if (k <= pairs.rows ())
+                {
+                  // replace with group capture
+                  rep.append (&buffer[static_cast<size_t> (pairs(k-1,0)-1)],
+                              static_cast<size_t> (pairs(k-1,1)
+                                                   - pairs(k-1,0)) + 1);
                 }
               else
-                rep.append (1, replacement[j-1]);
+                {
+                  // replace with nothing
+                }
+            }
+          if (cur_pos < repstr.size ())
+            rep.append (&repstr[cur_pos], repstr.size () - cur_pos);
 
-              if (j+1 == replacement.size ())
-                rep.append (1, replacement[j]);
-            }
           p++;
         }
       rep.append (&buffer[from], buffer.size () - from);
     }
   else
     {
-      // Determine replacement length
-      const size_t replen = replacement.size ();
+      // Determine repstr length
+      const size_t replen = repstr.size ();
       int delta = 0;
       regexp::match_data::const_iterator p = rx_lst.begin ();
-      for (size_t i = 0; i < sz; i++)
+      for (size_t i = 0; i < num_matches; i++)
         {
           OCTAVE_QUIT;
           delta += static_cast<int> (replen)
-            - static_cast<int> (p->end () - p->start () + 1);
+                   - static_cast<int> (p->end () - p->start () + 1);
           p++;
         }
 
@@ -583,13 +607,13 @@
       rep.reserve (buffer.size () + delta);
       size_t from = 0;
       p = rx_lst.begin ();
-      for (size_t i = 0; i < sz; i++)
+      for (size_t i = 0; i < num_matches; i++)
         {
           OCTAVE_QUIT;
           rep.append (&buffer[from],
                       static_cast<size_t> (p->start () - 1) - from);
           from = static_cast<size_t> (p->end () - 1) + 1;
-          rep.append (replacement);
+          rep.append (repstr);
           p++;
         }
       rep.append (&buffer[from], buffer.size () - from);