Mercurial > hg > octave-lyh
diff liboctave/regexp.cc @ 14024:fc9f204faea0
refactor regexp (bug #34440)
* liboctave/regexp.h, liboctave/regexp.cc: New files.
Provide classes and functions for regular expressions.
Adapted from src/DLD-FUNCTIONS/regexp.cc.
* regex-match.h, regex-match.cc: Delete
* liboctave/Makefile.am (INCS, LIBOCTAVE_CXX_SOURCES): Update.
* variables.cc (name_matches_any_pattern): Use new regexp class.
* symtab.h (symbol_table::regexp_global_variables,
symbol_table::do_clear_variable_regexp, symbol_table::do_regexp):
Likewise.
* DLD-FUNCTIONS/regexp.cc (parse_options): New function.
(octregexp, octcellregexp, octregexprep): Extract matching code for
use in new regexp class. Use new regexp class to provide required
functionality.
author | John W. Eaton <jwe@octave.org> |
---|---|
date | Sun, 11 Dec 2011 22:19:57 -0500 |
parents | liboctave/regex-match.cc@12df7854fa7c |
children | 9867be070ee1 |
line wrap: on
line diff
copy from liboctave/regex-match.cc copy to liboctave/regexp.cc --- a/liboctave/regex-match.cc +++ b/liboctave/regexp.cc @@ -1,6 +1,8 @@ /* -Copyright (C) 2008-2011 David Bateman +Copyright (C) 2011 John W. Eaton +Copyright (C) 2005-2011 David Bateman +Copyright (C) 2002-2005 Paul Kienzle This file is part of Octave. @@ -24,126 +26,550 @@ #include <config.h> #endif +#include <list> +#include <sstream> +#include <string> #include <vector> -#include <iostream> -#include <string> -#include "regex-match.h" -#include "str-vec.h" -#include "oct-locbuf.h" +#include <pcre.h> -regex_match& -regex_match::operator = (const regex_match& gm) -{ - if (this != &gm) - { -#if HAVE_REGEX - for (int i = 0; i < pat.length (); i++) - regfree (compiled +i); - delete [] compiled; -#endif - pat = gm.pat; - case_insen = gm.case_insen; - init (); - } - return *this; -} +#include "Matrix.h" +#include "base-list.h" +#include "lo-error.h" +#include "oct-locbuf.h" +#include "quit.h" +#include "regexp.h" +#include "str-vec.h" -regex_match::~regex_match (void) -{ -#if HAVE_REGEX - for (int i = 0; i < pat.length (); i++) - regfree (compiled +i); - delete [] compiled; -#endif -} +// Define the maximum number of retries for a pattern that possibly +// results in an infinite recursion. +#define PCRE_MATCHLIMIT_MAX 10 +// FIXME -- should this be configurable? +#define MAXLOOKBEHIND 10 + +static bool lookbehind_warned = false; + +// FIXME -- don't bother collecting and composing return values the user +// doesn't want. void -regex_match::set_pattern (const std::string& p) +regexp::free (void) { -#if HAVE_REGEX - for (int i = 0; i < pat.length (); i++) - regfree (compiled +i); - delete [] compiled; -#endif - pat = p; - init (); -} - -void -regex_match::set_pattern (const string_vector& p) -{ -#if HAVE_REGEX - for (int i = 0; i < pat.length (); i++) - regfree (compiled +i); - delete [] compiled; -#endif - pat = p; - init (); + if (data) + pcre_free (static_cast<pcre *> (data)); } void -regex_match::init (void) +regexp::compile_internal (void) { -#ifdef HAVE_REGEX - int npat = pat.length (); - int err = 0; - int i; + // If we had a previously compiled pattern, release it. + free (); + + size_t max_length = MAXLOOKBEHIND; + + size_t pos = 0; + size_t new_pos; + int inames = 0; + std::ostringstream buf; + + while ((new_pos = pattern.find ("(?", pos)) != std::string::npos) + { + if (pattern.at (new_pos + 2) == '<' + && !(pattern.at (new_pos + 3) == '=' + || pattern.at (new_pos + 3) == '!')) + { + // The syntax of named tokens in pcre is "(?P<name>...)" while + // we need a syntax "(?<name>...)", so fix that here. Also an + // expression like + // "(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)" + // should be perfectly legal, while pcre does not allow the same + // named token name on both sides of the alternative. Also fix + // that here by replacing name tokens by dummy names, and dealing + // with the dummy names later. + + size_t tmp_pos = pattern.find_first_of ('>', new_pos); + + if (tmp_pos == std::string::npos) + { + (*current_liboctave_error_handler) + ("regexp: syntax error in pattern"); + return; + } + + std::string tmp_name = + pattern.substr (new_pos+3, tmp_pos-new_pos-3); + + bool found = false; + + for (int i = 0; i < nnames; i++) + { + if (named_pats(i) == tmp_name) + { + named_idx.resize (dim_vector (inames+1, 1)); + named_idx(inames) = i; + found = true; + break; + } + } + + if (! found) + { + named_idx.resize (dim_vector (inames+1, 1)); + named_idx(inames) = nnames; + named_pats.append (tmp_name); + nnames++; + } + + if (new_pos - pos > 0) + buf << pattern.substr (pos, new_pos-pos); + if (inames < 10) + buf << "(?P<n00" << inames++; + else if (inames < 100) + buf << "(?P<n0" << inames++; + else + buf << "(?P<n" << inames++; - compiled = new regex_t [npat]; + pos = tmp_pos; + } + else if (pattern.at (new_pos + 2) == '<') + { + // Find lookbehind operators of arbitrary length (ie like + // "(?<=[a-z]*)") and replace with a maximum length operator + // as PCRE can not yet handle arbitrary length lookahead + // operators. Use the string length as the maximum length to + // avoid issues. + + int brackets = 1; + size_t tmp_pos1 = new_pos + 2; + size_t tmp_pos2 = tmp_pos1; + + while (tmp_pos1 <= pattern.length () && brackets > 0) + { + char ch = pattern.at (tmp_pos1); + + if (ch == '(') + brackets++; + else if (ch == ')') + { + if (brackets > 1) + tmp_pos2 = tmp_pos1; + + brackets--; + } + + tmp_pos1++; + } + + if (brackets != 0) + { + buf << pattern.substr (pos, new_pos - pos) << "(?"; + pos = new_pos + 2; + } + else + { + size_t tmp_pos3 = pattern.find_first_of ("*+", tmp_pos2); - for (i = 0; i < npat; i++) - { - err = regcomp (compiled + i, pat(i).c_str (), - (REG_NOSUB | REG_EXTENDED | - (case_insen ? REG_ICASE : 0))); - if (err) - break; + if (tmp_pos3 != std::string::npos && tmp_pos3 < tmp_pos1) + { + if (!lookbehind_warned) + { + lookbehind_warned = true; + (*current_liboctave_warning_handler) + ("%s: arbitrary length lookbehind patterns are only supported up to length %d", + who.c_str (), MAXLOOKBEHIND); + } + + buf << pattern.substr (pos, new_pos - pos) << "("; + + size_t i; + + if (pattern.at (tmp_pos3) == '*') + i = 0; + else + i = 1; + + for (; i < max_length + 1; i++) + { + buf << pattern.substr (new_pos, tmp_pos3 - new_pos) + << "{" << i << "}"; + buf << pattern.substr (tmp_pos3 + 1, + tmp_pos1 - tmp_pos3 - 1); + if (i != max_length) + buf << "|"; + } + buf << ")"; + } + else + buf << pattern.substr (pos, tmp_pos1 - pos); + + pos = tmp_pos1; + } + } + else + { + buf << pattern.substr (pos, new_pos - pos) << "(?"; + pos = new_pos + 2; + } + } - if (err) + buf << pattern.substr (pos); + + const char *err; + int erroffset; + std::string buf_str = buf.str (); + + int pcre_options + = ((options.case_insensitive () ? PCRE_CASELESS : 0) + | (options.dotexceptnewline () ? 0 : PCRE_DOTALL) + | (options.lineanchors () ? PCRE_MULTILINE : 0) + | (options.freespacing () ? PCRE_EXTENDED : 0)); + + data = pcre_compile (buf_str.c_str (), pcre_options, &err, &erroffset, 0); + + if (! data) + (*current_liboctave_error_handler) + ("%s: %s at position %d of expression", who.c_str (), + err, erroffset); +} + +regexp::match_data +regexp::match (const std::string& buffer) +{ + regexp::match_data retval; + + std::list<regexp::match_element> lst; + + int subpatterns; + int namecount; + int nameentrysize; + char *nametable; + size_t idx = 0; + + pcre *re = static_cast <pcre *> (data); + + pcre_fullinfo (re, 0, PCRE_INFO_CAPTURECOUNT, &subpatterns); + pcre_fullinfo (re, 0, PCRE_INFO_NAMECOUNT, &namecount); + pcre_fullinfo (re, 0, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize); + pcre_fullinfo (re, 0, PCRE_INFO_NAMETABLE, &nametable); + + OCTAVE_LOCAL_BUFFER (int, ovector, (subpatterns+1)*3); + OCTAVE_LOCAL_BUFFER (int, nidx, namecount); + + for (int i = 0; i < namecount; i++) { - int len = regerror (err, compiled + i, 0, 0); - OCTAVE_LOCAL_BUFFER (char, errmsg, len); - regerror(err, compiled + i, errmsg, len); - (*current_liboctave_error_handler) ("%s in pattern (%s)", errmsg, - pat(i).c_str()); + // Index of subpattern in first two bytes MSB first of name. + // Extract index. + nidx[i] = (static_cast<int> (nametable[i*nameentrysize])) << 8 + | static_cast<int> (nametable[i*nameentrysize+1]); + } + + while (true) + { + OCTAVE_QUIT; + + int matches = pcre_exec (re, 0, buffer.c_str (), + buffer.length (), idx, + (idx ? PCRE_NOTBOL : 0), + ovector, (subpatterns+1)*3); + + if (matches == PCRE_ERROR_MATCHLIMIT) + { + // Try harder; start with default value for MATCH_LIMIT + // and increase it. + (*current_liboctave_warning_handler) + ("your pattern caused PCRE to hit its MATCH_LIMIT; trying harder now, but this will be slow"); + + pcre_extra pe; + + pcre_config (PCRE_CONFIG_MATCH_LIMIT, + static_cast <void *> (&pe.match_limit)); + + pe.flags = PCRE_EXTRA_MATCH_LIMIT; + + int i = 0; + while (matches == PCRE_ERROR_MATCHLIMIT + && i++ < PCRE_MATCHLIMIT_MAX) + { + OCTAVE_QUIT; + + pe.match_limit *= 10; + matches = pcre_exec (re, &pe, buffer.c_str (), + buffer.length (), idx, + (idx ? PCRE_NOTBOL : 0), + ovector, (subpatterns+1)*3); + } + } - for (int j = 0; j < i + 1; j++) - regfree (compiled + j); + if (matches < 0 && matches != PCRE_ERROR_NOMATCH) + { + (*current_liboctave_error_handler) + ("%s: internal error calling pcre_exec; error code from pcre_exec is %i", + who.c_str (), matches); + return retval; + } + else if (matches == PCRE_ERROR_NOMATCH) + break; + else if (ovector[1] <= ovector[0]) + { + // Zero sized match. Skip to next char. + idx = ovector[0] + 1; + if (idx < buffer.length ()) + continue; + else + break; + } + else + { + int pos_match = 0; + Matrix token_extents (matches-1, 2); + + for (int i = 1; i < matches; i++) + { + if (ovector[2*i] >= 0 && ovector[2*i+1] > 0 + && (i == 1 || ovector[2*i] != ovector[2*i-2] + || ovector[2*i-1] != ovector[2*i+1]) + && ovector[2*i] >= 0 && ovector[2*i+1] > 0) + { + token_extents(pos_match,0) = double (ovector[2*i]+1); + token_extents(pos_match++,1) = double (ovector[2*i+1]); + } + } + + token_extents.resize (pos_match, 2); + + double start = double (ovector[0]+1); + double end = double (ovector[1]); + + const char **listptr; + int status = pcre_get_substring_list (buffer.c_str (), ovector, + matches, &listptr); + + if (status == PCRE_ERROR_NOMEMORY) + { + (*current_liboctave_error_handler) + ("%s: cannot allocate memory in pcre_get_substring_list", + who.c_str ()); + return retval; + } + + string_vector tokens (pos_match); + string_vector named_tokens (nnames); + int pos_offset = 0; + pos_match = 0; + + for (int i = 1; i < matches; i++) + { + if (ovector[2*i] >= 0 && ovector[2*i+1] > 0) + { + if (i == 1 || ovector[2*i] != ovector[2*i-2] + || ovector[2*i-1] != ovector[2*i+1]) + { + if (namecount > 0) + named_tokens(named_idx(i-pos_offset-1)) = + std::string (*(listptr+nidx[i-pos_offset-1])); + + tokens(pos_match++) = std::string (*(listptr+i)); + } + else + pos_offset++; + } + } + + std::string match_string = std::string (*listptr); + + pcre_free_substring_list (listptr); + + regexp::match_element new_elem (named_tokens, tokens, match_string, + token_extents, start, end); + lst.push_back (new_elem); + idx = ovector[1]; + + if (options.once () || idx >= buffer.length ()) + break; + } } -#else - (*current_liboctave_error_handler) - ("regex not available in this version of Octave"); -#endif + + retval = regexp::match_data (lst, named_pats); + + return retval; } bool -regex_match::match (const std::string& s) +regexp::is_match (const std::string& buffer) { -#if HAVE_REGEX - int npat = pat.length (); - - const char *str = s.c_str (); + regexp::match_data rx_lst = match (buffer); - for (int i = 0; i < npat; i++) - if (regexec (compiled + i, str, 0, 0, 0) == 0) - return true; -#endif + regexp::match_data::const_iterator p = rx_lst.begin (); - return false; + std::string match_string = p->match_string (); + + return ! match_string.empty (); } Array<bool> -regex_match::match (const string_vector& s) +regexp::is_match (const string_vector& buffer) { - int n = s.length (); + octave_idx_type len = buffer.length (); - Array<bool> retval (dim_vector (n, 1)); + Array<bool> retval (len, 1); - for (int i = 0; i < n; i++) - retval(i) = match (s[i]); + for (octave_idx_type i = 0; i < buffer.length (); i++) + retval(i) = is_match (buffer(i)); return retval; } + +std::string +regexp::replace (const std::string& buffer, const std::string& replacement) +{ + std::string retval; + + // Identify replacement tokens; build a vector of group numbers in + // the replacement string so that we can quickly calculate the size + // of the replacement. + + int tokens = 0; + for (size_t i=1; i < replacement.size (); i++) + { + if (replacement[i-1]=='$' && isdigit (replacement[i])) + { + tokens++; + i++; + } + } + std::vector<int> token (tokens); + + int kk = 0; + for (size_t i = 1; i < replacement.size (); i++) + { + if (replacement[i-1]=='$' && isdigit (replacement[i])) + { + token[kk++] = replacement[i]-'0'; + i++; + } + } + + regexp::match_data rx_lst = match (buffer); + + size_t sz = rx_lst.size (); + + if (sz == 0) + { + retval = buffer; + return retval; + } + + std::string rep; + + if (tokens > 0) + { + // Determine replacement length + const size_t replen = replacement.size () - 2*tokens; + int delta = 0; + regexp::match_data::const_iterator p = rx_lst.begin (); + for (size_t i = 0; i < sz; i++) + { + OCTAVE_QUIT; + + double start = p->start (); + double end = p->end (); + + const Matrix pairs (p->token_extents ()); + size_t pairlen = 0; + for (int j = 0; j < tokens; j++) + { + if (token[j] == 0) + pairlen += static_cast<size_t> (end - start) + 1; + else if (token[j] <= pairs.rows ()) + pairlen += static_cast<size_t> (pairs(token[j]-1,1) + - pairs(token[j]-1,0)) + 1; + } + delta += (static_cast<int> (replen + pairlen) + - static_cast<int> (end - start + 1)); + p++; + } + + // Build replacement string + rep.reserve (buffer.size () + delta); + size_t from = 0; + p = rx_lst.begin (); + for (size_t i = 0; i < sz; i++) + { + OCTAVE_QUIT; + + double start = p->start (); + double end = p->end (); + + const Matrix pairs (p->token_extents ()); + rep.append (&buffer[from], static_cast<size_t> (start - 1) - from); + from = static_cast<size_t> (end - 1) + 1; + + for (size_t j = 1; j < replacement.size (); j++) + { + if (replacement[j-1]=='$' && isdigit (replacement[j])) + { + int k = replacement[j]-'0'; + if (k == 0) + { + // replace with entire match + rep.append (&buffer[static_cast<size_t> (end - 1)], + static_cast<size_t> (end - start) + 1); + } + else if (k <= pairs.rows ()) + { + // replace with group capture + rep.append (&buffer[static_cast<size_t> (pairs(k-1,0)-1)], + static_cast<size_t> (pairs(k-1,1) + - pairs(k-1,0)) + 1); + } + else + { + // replace with nothing + } + j++; + } + else + rep.append (1, replacement[j-1]); + + if (j+1 == replacement.size ()) + rep.append (1, replacement[j]); + } + p++; + } + rep.append (&buffer[from], buffer.size () - from); + } + else + { + // Determine replacement length + const size_t replen = replacement.size (); + int delta = 0; + regexp::match_data::const_iterator p = rx_lst.begin (); + for (size_t i = 0; i < sz; i++) + { + OCTAVE_QUIT; + delta += static_cast<int> (replen) + - static_cast<int> (p->end () - p->start () + 1); + p++; + } + + // Build replacement string + rep.reserve (buffer.size () + delta); + size_t from = 0; + p = rx_lst.begin (); + for (size_t i = 0; i < sz; i++) + { + OCTAVE_QUIT; + rep.append (&buffer[from], + static_cast<size_t> (p->start () - 1) - from); + from = static_cast<size_t> (p->end () - 1) + 1; + rep.append (replacement); + p++; + } + rep.append (&buffer[from], buffer.size () - from); + } + + retval = rep; + return retval; +}