Mercurial > hg > octave-lyh
view src/DLD-FUNCTIONS/regexp.cc @ 7520:b166043585a8
use 0 instead of NULL
author | John W. Eaton <jwe@octave.org> |
---|---|
date | Fri, 22 Feb 2008 16:44:03 -0500 |
parents | e4398e3903be |
children | eb9ccb44ea41 |
line wrap: on
line source
/* Copyright (C) 2005, 2006, 2007 David Bateman Copyright (C) 2002, 2003, 2004, 2005 Paul Kienzle This file is part of Octave. Octave is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. Octave is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with Octave; see the file COPYING. If not, see <http://www.gnu.org/licenses/>. */ #ifdef HAVE_CONFIG_H #include <config.h> #endif #include <algorithm> #include <sstream> #include "defun-dld.h" #include "error.h" #include "gripes.h" #include "oct-obj.h" #include "utils.h" #include "Cell.h" #include "oct-map.h" #include "str-vec.h" #include "quit.h" #include "parse.h" #if defined (HAVE_PCRE) #include <pcre.h> #elif defined (HAVE_REGEX) #if defined (__MINGW32__) #define __restrict #endif #if defined (HAVE_SYS_TYPES_H) #include <sys/types.h> #endif #include <regex.h> #endif // The regexp is constructed as a linked list to avoid resizing the // return values in arrays at each new match. // FIXME don't bother collecting and composing return values the user // doesn't want. class regexp_elem { public: regexp_elem (const string_vector& _named_token, const Cell& _t, const std::string& _m, const Matrix& _te, double _s, double _e) : named_token (_named_token), t (_t), m (_m), te (_te), s (_s), e (_e) { } regexp_elem (const regexp_elem &a) : named_token (a.named_token), t (a.t), m (a.m), te (a.te), s (a.s), e (a.e) { } string_vector named_token; Cell t; std::string m; Matrix te; double s; double e; }; typedef std::list<regexp_elem>::const_iterator const_iterator; static int octregexp_list (const octave_value_list &args, const std::string &nm, bool case_insensitive, std::list<regexp_elem> &lst, string_vector &named, int &nopts) { int sz = 0; #if defined (HAVE_REGEX) || defined (HAVE_PCRE) int nargin = args.length(); bool once = false; bool lineanchors = false; bool dotexceptnewline = false; bool freespacing = false; nopts = nargin - 2; std::string buffer = args(0).string_value (); if (error_state) { gripe_wrong_type_arg (nm.c_str(), args(0)); return 0; } std::string pattern = args(1).string_value (); if (error_state) { gripe_wrong_type_arg (nm.c_str(), args(1)); return 0; } for (int i = 2; i < nargin; i++) { std::string str = args(i).string_value(); if (error_state) { error ("%s: optional arguments must be strings", nm.c_str()); break; } std::transform (str.begin (), str.end (), str.begin (), tolower); if (str.find("once", 0) == 0) { once = true; nopts--; } else if (str.find("matchcase", 0) == 0) { case_insensitive = false; nopts--; } else if (str.find("ignorecase", 0) == 0) { case_insensitive = true; nopts--; } else if (str.find("dotall", 0) == 0) { dotexceptnewline = false; nopts--; } else if (str.find("stringanchors", 0) == 0) { lineanchors = false; nopts--; } else if (str.find("literalspacing", 0) == 0) { freespacing = false; nopts--; } #if HAVE_PCRE // Only accept these options with pcre else if (str.find("dotexceptnewline", 0) == 0) { dotexceptnewline = true; nopts--; } else if (str.find("lineanchors", 0) == 0) { lineanchors = true; nopts--; } else if (str.find("freespacing", 0) == 0) { freespacing = true; nopts--; } else if (str.find("start", 0) && str.find("end", 0) && str.find("tokenextents", 0) && str.find("match", 0) && str.find("tokens", 0) && str.find("names", 0)) error ("%s: unrecognized option", nm.c_str()); #else else if (str.find("names", 0) == 0 || str.find("dotexceptnewline", 0) == 0 || str.find("lineanchors", 0) == 0 || str.find("freespacing", 0) == 0) error ("%s: %s not implemented in this version", str.c_str(), nm.c_str()); else if (str.find("start", 0) && str.find("end", 0) && str.find("tokenextents", 0) && str.find("match", 0) && str.find("tokens", 0)) error ("%s: unrecognized option", nm.c_str()); #endif } if (!error_state) { Cell t; std::string m; double s, e; // named tokens "(?<name>...)" are only treated with PCRE not regex. #if HAVE_PCRE // The syntax of named tokens in pcre is "(?P<name>...)" while we need // a syntax "(?<name>...)", so fix that here. Also an expression like // "(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)" should // be perfectly legal, while pcre does not allow the same named token // name on both sides of the alternative. Also fix that here by replacing // name tokens by dummy names, and dealing with the dummy names later. size_t pos = 0; size_t new_pos; int nnames = 0; int inames = 0; std::ostringstream buf; Array<int> named_idx; while ((new_pos = pattern.find ("(?<",pos)) != NPOS) { size_t tmp_pos = pattern.find_first_of ('>',new_pos); if (tmp_pos == NPOS) { error ("syntax error in pattern"); break; } std::string tmp_name = pattern.substr(new_pos+3,tmp_pos-new_pos-3); bool found = false; for (int i = 0; i < nnames; i++) if (named(i) == tmp_name) { named_idx.resize(inames+1); named_idx(inames) = i; found = true; break; } if (! found) { named_idx.resize(inames+1); named_idx(inames) = nnames; named.append(tmp_name); nnames++; } if (new_pos - pos > 0) buf << pattern.substr(pos,new_pos-pos); if (inames < 10) buf << "(?P<n00" << inames++; else if (inames < 100) buf << "(?P<n0" << inames++; else buf << "(?P<n" << inames++; pos = tmp_pos; } buf << pattern.substr(pos); if (error_state) return 0; // Compile expression pcre *re; const char *err; int erroffset; std::string buf_str = buf.str (); re = pcre_compile (buf_str.c_str (), (case_insensitive ? PCRE_CASELESS : 0) | (dotexceptnewline ? 0 : PCRE_DOTALL) | (lineanchors ? PCRE_MULTILINE : 0) | (freespacing ? PCRE_EXTENDED : 0), &err, &erroffset, 0); if (re == 0) { error("%s: %s at position %d of expression", nm.c_str(), err, erroffset); return 0; } int subpatterns; int namecount; int nameentrysize; char *nametable; int idx = 0; pcre_fullinfo(re, 0, PCRE_INFO_CAPTURECOUNT, &subpatterns); pcre_fullinfo(re, 0, PCRE_INFO_NAMECOUNT, &namecount); pcre_fullinfo(re, 0, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize); pcre_fullinfo(re, 0, PCRE_INFO_NAMETABLE, &nametable); OCTAVE_LOCAL_BUFFER(int, ovector, (subpatterns+1)*3); OCTAVE_LOCAL_BUFFER(int, nidx, namecount); for (int i = 0; i < namecount; i++) { // Index of subpattern in first two bytes MSB first of name. // Extract index. nidx[i] = (static_cast<int>(nametable[i*nameentrysize])) << 8 | static_cast<int>(nametable[i*nameentrysize+1]); } while(true) { OCTAVE_QUIT; int matches = pcre_exec(re, 0, buffer.c_str(), buffer.length(), idx, (idx ? PCRE_NOTBOL : 0), ovector, (subpatterns+1)*3); if (matches < 0 && matches != PCRE_ERROR_NOMATCH) { error ("%s: internal error calling pcre_exec", nm.c_str()); pcre_free(re); return 0; } else if (matches == PCRE_ERROR_NOMATCH) break; else if (ovector[1] <= ovector[0]) break; else { int pos_match = 0; Matrix te(matches-1,2); for (int i = 1; i < matches; i++) { if (ovector[2*i] >= 0 && ovector[2*i+1] > 0) { te(pos_match,0) = double (ovector[2*i]+1); te(pos_match++,1) = double (ovector[2*i+1]); } } te.resize(pos_match,2); s = double (ovector[0]+1); e = double (ovector[1]); const char **listptr; int status = pcre_get_substring_list(buffer.c_str(), ovector, matches, &listptr); if (status == PCRE_ERROR_NOMEMORY) { error("%s: cannot allocate memory in pcre_get_substring_list", nm.c_str()); pcre_free(re); return 0; } Cell cell_t (dim_vector(1,pos_match)); pos_match = 0; for (int i = 1; i < matches; i++) if (ovector[2*i] >= 0 && ovector[2*i+1] > 0) cell_t(pos_match++) = std::string(*(listptr+i)); m = std::string(*listptr); t = cell_t; string_vector named_tokens(nnames); if (namecount > 0) for (int i = 1; i < matches; i++) { if (ovector[2*i] >= 0 && ovector[2*i+1] > 0) { named_tokens(named_idx(i-1)) = std::string(*(listptr+nidx[i-1])); } } pcre_free_substring_list(listptr); regexp_elem new_elem (named_tokens, t, m, te, s, e); lst.push_back (new_elem); idx = ovector[1]; sz++; if (once) break; } } pcre_free(re); #else regex_t compiled; int err=regcomp(&compiled, pattern.c_str(), REG_EXTENDED | (case_insensitive ? REG_ICASE : 0)); if (err) { int len = regerror(err, &compiled, 0, 0); OCTAVE_LOCAL_BUFFER (char, errmsg, len); regerror(err, &compiled, errmsg, len); error("%s: %s in pattern (%s)", nm.c_str(), errmsg, pattern.c_str()); regfree(&compiled); return 0; } int subexpr = 1; int idx = 0; for (unsigned int i=0; i < pattern.length(); i++) subexpr += ( pattern[i] == '(' ? 1 : 0 ); OCTAVE_LOCAL_BUFFER (regmatch_t, match, subexpr ); while(true) { OCTAVE_QUIT; if (regexec(&compiled, buffer.c_str() + idx, subexpr, match, (idx ? REG_NOTBOL : 0)) == 0) { // Count actual matches int matches = 0; while (matches < subexpr && match[matches].rm_so >= 0) matches++; s = double (match[0].rm_so+1+idx); e = double (match[0].rm_eo+idx); Matrix te(matches-1,2); for (int i = 1; i < matches; i++) { te(i-1,0) = double (match[i].rm_so+1+idx); te(i-1,1) = double (match[i].rm_eo+idx); } m = buffer.substr (match[0].rm_so+idx, match[0].rm_eo-match[0].rm_so); Cell cell_t (dim_vector(1,matches-1)); for (int i = 1; i < matches; i++) cell_t(i-1) = buffer.substr (match[i].rm_so+idx, match[i].rm_eo-match[i].rm_so); t = cell_t; idx += match[0].rm_eo; string_vector sv; regexp_elem new_elem (sv, t, m, te, s, e); lst.push_back (new_elem); sz++; if (once) break; } else break; } regfree(&compiled); #endif } #else error ("%s: not available in this version of Octave", nm.c_str()); #endif return sz; } static octave_value_list octregexp (const octave_value_list &args, int nargout, const std::string &nm, bool case_insensitive) { octave_value_list retval; int nargin = args.length(); std::list<regexp_elem> lst; string_vector named; int nopts; int sz = octregexp_list (args, nm, case_insensitive, lst, named, nopts); if (! error_state) { // Converted the linked list in the correct form for the return values octave_idx_type i = 0; #ifdef HAVE_PCRE Octave_map nmap; if (sz == 1) { for (int j = 0; j < named.length(); j++) nmap.assign (named(j), lst.begin()->named_token(j)); retval(5) = nmap; } else { for (int j = 0; j < named.length (); j++) { i = 0; Cell tmp(dim_vector (1, sz)); for (const_iterator p = lst.begin(); p != lst.end(); p++) tmp(i++) = p->named_token(j); nmap.assign (named(j), octave_value (tmp)); } retval(5) = nmap; } #else retval(5) = Octave_map(); #endif Cell t (dim_vector(1, sz)); i = 0; for (const_iterator p = lst.begin(); p != lst.end(); p++) t(i++) = p->t; retval(4) = t; Cell m (dim_vector(1, sz)); i = 0; for (const_iterator p = lst.begin(); p != lst.end(); p++) m(i++) = p->m; retval(3) = m; Cell te (dim_vector(1, sz)); i = 0; for (const_iterator p = lst.begin(); p != lst.end(); p++) te(i++) = p->te; retval(2) = te; NDArray e (dim_vector(1, sz)); i = 0; for (const_iterator p = lst.begin(); p != lst.end(); p++) e(i++) = p->e; retval(1) = e; NDArray s (dim_vector(1, sz)); i = 0; for (const_iterator p = lst.begin(); p != lst.end(); p++) s(i++) = p->s; retval(0) = s; // Alter the order of the output arguments if (nopts > 0) { int n = 0; octave_value_list new_retval; new_retval.resize(nargout); OCTAVE_LOCAL_BUFFER (int, arg_used, 6); for (int j = 0; j < 6; j++) arg_used[j] = false; for (int j = 2; j < nargin; j++) { int k = 0; std::string str = args(j).string_value(); std::transform (str.begin (), str.end (), str.begin (), tolower); if (str.find("once", 0) == 0 || str.find("stringanchors", 0) == 0 || str.find("lineanchors", 0) == 0 || str.find("matchcase", 0) == 0 || str.find("ignorecase", 0) == 0 || str.find("dotall", 0) == 0 || str.find("dotexceptnewline", 0) == 0 || str.find("literalspacing", 0) == 0 || str.find("freespacing", 0) == 0 ) continue; else if (str.find("start", 0) == 0) k = 0; else if (str.find("end", 0) == 0) k = 1; else if (str.find("tokenextents", 0) == 0) k = 2; else if (str.find("match", 0) == 0) k = 3; else if (str.find("tokens", 0) == 0) k = 4; else if (str.find("names", 0) == 0) k = 5; new_retval(n++) = retval(k); arg_used[k] = true; if (n == nargout) break; } // Fill in the rest of the arguments if (n < nargout) { for (int j = 0; j < 6; j++) { if (! arg_used[j]) new_retval(n++) = retval(j); } } retval = new_retval; } } return retval; } static octave_value_list octcellregexp (const octave_value_list &args, int nargout, const std::string &nm, bool case_insensitive) { octave_value_list retval; if (args(0).is_cell()) { OCTAVE_LOCAL_BUFFER (Cell, newretval, nargout); octave_value_list new_args = args; Cell cellstr = args(0).cell_value(); if (args(1).is_cell()) { Cell cellpat = args(1).cell_value(); if (cellpat.numel() == 1) { for (int j = 0; j < nargout; j++) newretval[j].resize(cellstr.dims()); new_args(1) = cellpat(0); for (octave_idx_type i = 0; i < cellstr.numel (); i++) { new_args(0) = cellstr(i); octave_value_list tmp = octregexp (new_args, nargout, nm, case_insensitive); if (error_state) break; for (int j = 0; j < nargout; j++) newretval[j](i) = tmp(j); } } else if (cellstr.numel() == 1) { for (int j = 0; j < nargout; j++) newretval[j].resize(cellpat.dims()); new_args(0) = cellstr(0); for (octave_idx_type i = 0; i < cellpat.numel (); i++) { new_args(1) = cellpat(i); octave_value_list tmp = octregexp (new_args, nargout, nm, case_insensitive); if (error_state) break; for (int j = 0; j < nargout; j++) newretval[j](i) = tmp(j); } } else if (cellstr.numel() == cellpat.numel()) { if (cellstr.dims() != cellpat.dims()) error ("%s: Inconsistent cell array dimensions", nm.c_str()); else { for (int j = 0; j < nargout; j++) newretval[j].resize(cellstr.dims()); for (octave_idx_type i = 0; i < cellstr.numel (); i++) { new_args(0) = cellstr(i); new_args(1) = cellpat(i); octave_value_list tmp = octregexp (new_args, nargout, nm, case_insensitive); if (error_state) break; for (int j = 0; j < nargout; j++) newretval[j](i) = tmp(j); } } } else error ("regexp: cell array arguments must be scalar or equal size"); } else { for (int j = 0; j < nargout; j++) newretval[j].resize(cellstr.dims()); for (octave_idx_type i = 0; i < cellstr.numel (); i++) { new_args(0) = cellstr(i); octave_value_list tmp = octregexp (new_args, nargout, nm, case_insensitive); if (error_state) break; for (int j = 0; j < nargout; j++) newretval[j](i) = tmp(j); } } if (!error_state) for (int j = 0; j < nargout; j++) retval(j) = octave_value (newretval[j]); } else if (args(1).is_cell()) { OCTAVE_LOCAL_BUFFER (Cell, newretval, nargout); octave_value_list new_args = args; Cell cellpat = args(1).cell_value(); for (int j = 0; j < nargout; j++) newretval[j].resize(cellpat.dims()); for (octave_idx_type i = 0; i < cellpat.numel (); i++) { new_args(1) = cellpat(i); octave_value_list tmp = octregexp (new_args, nargout, nm, case_insensitive); if (error_state) break; for (int j = 0; j < nargout; j++) newretval[j](i) = tmp(j); } if (!error_state) for (int j = 0; j < nargout; j++) retval(j) = octave_value (newretval[j]); } else retval = octregexp (args, nargout, nm, case_insensitive); return retval; } DEFUN_DLD (regexp, args, nargout, "-*- texinfo -*-\n\ @deftypefn {Loadable Function} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}] =} regexp (@var{str}, @var{pat})\n\ @deftypefnx {Loadable Function} {[@dots{}] =} regexp (@var{str}, @var{pat}, @var{opts}, @dots{})\n\ \n\ Regular expression string matching. Matches @var{pat} in @var{str} and\n\ returns the position and matching substrings or empty values if there are\n\ none.\n\ \n\ The matched pattern @var{pat} can include any of the standard regex\n\ operators, including:\n\ \n\ @table @code\n\ @item .\n\ Match any character\n\ @item * + ? @{@}\n\ Repetition operators, representing\n\ @table @code\n\ @item *\n\ Match zero or more times\n\ @item +\n\ Match one or more times\n\ @item ?\n\ Match zero or one times\n\ @item @{@}\n\ Match range operator, which is of the form @code{@{@var{n}@}} to match exactly\n\ @var{n} times, @code{@{@var{m},@}} to match @var{m} or more times,\n\ @code{@{@var{m},@var{n}@}} to match between @var{m} and @var{n} times.\n\ @end table\n\ @item [@dots{}] [^@dots{}]\n\ List operators, where for example @code{[ab]c} matches @code{ac} and @code{bc}\n\ @item ()\n\ Grouping operator\n\ @item |\n\ Alternation operator. Match one of a choice of regular expressions. The\n\ alternatives must be delimited by the grouping operator @code{()} above\n\ @item ^ $\n\ Anchoring operator. @code{^} matches the start of the string @var{str} and\n\ @code{$} the end\n\ @end table\n\ \n\ In addition the following escaped characters have special meaning. It should\n\ be noted that it is recommended to quote @var{pat} in single quotes rather\n\ than double quotes, to avoid the escape sequences being interpreted by octave\n\ before being passed to @code{regexp}.\n\ \n\ @table @code\n\ @item \\b\n\ Match a word boundary\n\ @item \\B\n\ Match within a word\n\ @item \\w\n\ Matches any word character\n\ @item \\W\n\ Matches any non word character\n\ @item \\<\n\ Matches the beginning of a word\n\ @item \\>\n\ Matches the end of a word\n\ @item \\s\n\ Matches any whitespace character\n\ @item \\S\n\ Matches any non whitespace character\n\ @item \\d\n\ Matches any digit\n\ @item \\D\n\ Matches any non-digit\n\ @end table\n\ \n\ The outputs of @code{regexp} by default are in the order as given below\n\ \n\ @table @asis\n\ @item @var{s}\n\ The start indices of each of the matching substrings\n\ \n\ @item @var{e}\n\ The end indices of each matching substring\n\ \n\ @item @var{te}\n\ The extents of each of the matched token surrounded by @code{(@dots{})} in\n\ @var{pat}.\n\ \n\ @item @var{m}\n\ A cell array of the text of each match.\n\ \n\ @item @var{t}\n\ A cell array of the text of each token matched.\n\ \n\ @item @var{nm}\n\ A structure containing the text of each matched named token, with the name\n\ being used as the fieldname. A named token is denoted as\n\ @code{(?<name>@dots{})}\n\ @end table\n\ \n\ Particular output arguments or the order of the output arguments can be\n\ selected by additional @var{opts} arguments. These are strings and the\n\ correspondence between the output arguments and the optional argument\n\ are\n\ \n\ @multitable @columnfractions 0.2 0.3 0.3 0.2\n\ @item @tab 'start' @tab @var{s} @tab\n\ @item @tab 'end' @tab @var{e} @tab\n\ @item @tab 'tokenExtents' @tab @var{te} @tab\n\ @item @tab 'match' @tab @var{m} @tab\n\ @item @tab 'tokens' @tab @var{t} @tab\n\ @item @tab 'names' @tab @var{nm} @tab\n\ @end multitable\n\ \n\ A further optional argument is 'once', that limits the number of returned\n\ matches to the first match. Additional arguments are\n\ \n\ @table @asis\n\ @item matchcase\n\ Make the matching case sensitive.\n\ @item ignorecase\n\ Make the matching case insensitive.\n\ @item stringanchors\n\ Match the anchor characters at the beginning and end of the string.\n\ @item lineanchors\n\ Match the anchor characters at the beginning and end of the line.\n\ @item dotall\n\ The character @code{.} matches the newline character.\n\ @item dotexceptnewline\n\ The character @code{.} matches all but the newline character.\n\ @item freespacing\n\ The pattern can include arbitrary whitespace and comments starting with\n\ @code{#}.\n\ @item literalspacing\n\ The pattern is taken literally.\n\ @end table\n\ @end deftypefn") { octave_value_list retval; int nargin = args.length(); if (nargin < 2) print_usage (); else if (args(0).is_cell() || args(1).is_cell()) retval = octcellregexp (args, nargout, "regexp", false); else retval = octregexp (args, nargout, "regexp", false); return retval; } /* ## seg-fault test %!assert(regexp("abcde","."),[1,2,3,4,5]) ## Check that anchoring of pattern works correctly %!assert(regexp('abcabc','^abc'),1); %!assert(regexp('abcabc','abc$'),4); %!assert(regexp('abcabc','^abc$'),zeros(1,0)); %!test %! [s, e, te, m, t] = regexp(' No Match ', 'f(.*)uck'); %! assert (s,zeros(1,0)) %! assert (e,zeros(1,0)) %! assert (te,cell(1,0)) %! assert (m, cell(1,0)) %! assert (t, cell(1,0)) %!test %! [s, e, te, m, t] = regexp(' FiRetrUck ', 'f(.*)uck'); %! assert (s,zeros(1,0)) %! assert (e,zeros(1,0)) %! assert (te,cell(1,0)) %! assert (m, cell(1,0)) %! assert (t, cell(1,0)) %!test %! [s, e, te, m, t] = regexp(' firetruck ', 'f(.*)uck'); %! assert (s,2) %! assert (e,10) %! assert (te{1},[3,7]) %! assert (m{1}, 'firetruck') %! assert (t{1}{1}, 'iretr') %!test %! [s, e, te, m, t] = regexp('short test string','\w*r\w*'); %! assert (s,[1,12]) %! assert (e,[5,17]) %! assert (size(te), [1,2]) %! assert (isempty(te{1})) %! assert (isempty(te{2})) %! assert (m{1},'short') %! assert (m{2},'string') %! assert (size(t), [1,2]) %! assert (isempty(t{1})) %! assert (isempty(t{2})) %!test %! [s, e, te, m, t] = regexp('short test string','\w*r\w*','once'); %! assert (s,1) %! assert (e,5) %! assert (size(te), [1,1]) %! assert (isempty(te{1})) %! assert (m{1},'short') %! ## Matlab gives [1,0] here but that seems wrong. %! assert (size(t), [1,1]) %!test %! [m, te, e, s, t] = regexp('short test string','\w*r\w*','once', 'match', 'tokenExtents', 'end', 'start', 'tokens'); %! assert (s,1) %! assert (e,5) %! assert (size(te), [1,1]) %! assert (isempty(te{1})) %! assert (m{1},'short') %! ## Matlab gives [1,0] here but that seems wrong. %! assert (size(t), [1,1]) %!testif HAVE_PCRE %! ## This test is expected to fail if PCRE is not installed %! [s, e, te, m, t, nm] = regexp('short test string','(?<word1>\w*t)\s*(?<word2>\w*t)'); %! assert (s,1) %! assert (e,10) %! assert (size(te), [1,1]) %! assert (te{1}, [1 5; 7, 10]) %! assert (m{1},'short test') %! assert (size(t),[1,1]) %! assert (t{1}{1},'short') %! assert (t{1}{2},'test') %! assert (size(nm), [1,1]) %! assert (!isempty(fieldnames(nm))) %! assert (sort(fieldnames(nm)),{'word1';'word2'}) %! assert (nm.word1,'short') %! assert (nm.word2,'test') %!testif HAVE_PCRE %! ## This test is expected to fail if PCRE is not installed %! [nm, m, te, e, s, t] = regexp('short test string','(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens'); %! assert (s,1) %! assert (e,10) %! assert (size(te), [1,1]) %! assert (te{1}, [1 5; 7, 10]) %! assert (m{1},'short test') %! assert (size(t),[1,1]) %! assert (t{1}{1},'short') %! assert (t{1}{2},'test') %! assert (size(nm), [1,1]) %! assert (!isempty(fieldnames(nm))) %! assert (sort(fieldnames(nm)),{'word1';'word2'}) %! assert (nm.word1,'short') %! assert (nm.word2,'test') %!testif HAVE_PCRE %! ## This test is expected to fail if PCRE is not installed %! [t, nm] = regexp("John Davis\nRogers, James",'(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)','tokens','names'); %! assert (size(t), [1,2]); %! assert (t{1}{1},'John'); %! assert (t{1}{2},'Davis'); %! assert (t{2}{1},'Rogers'); %! assert (t{2}{2},'James'); %! assert (size(nm), [1,1]); %! assert (nm.first{1},'John'); %! assert (nm.first{2},'James'); %! assert (nm.last{1},'Davis'); %! assert (nm.last{2},'Rogers'); %!assert(regexp("abc\nabc",'.'),[1:7]) %!assert(regexp("abc\nabc",'.','dotall'),[1:7]) %!testif HAVE_PCRE %! assert(regexp("abc\nabc",'(?s).'),[1:7]) %! assert(regexp("abc\nabc",'.','dotexceptnewline'),[1,2,3,5,6,7]) %! assert(regexp("abc\nabc",'(?-s).'),[1,2,3,5,6,7]) %!assert(regexp("caseCaSe",'case'),1) %!assert(regexp("caseCaSe",'case',"matchcase"),1) %!assert(regexp("caseCaSe",'case',"ignorecase"),[1,5]) %!testif HAVE_PCRE %! assert(regexp("caseCaSe",'(?-i)case'),1) %! assert(regexp("caseCaSe",'(?i)case'),[1,5]) %!assert (regexp("abc\nabc",'c$'),7) %!assert (regexp("abc\nabc",'c$',"stringanchors"),7) %!testif HAVE_PCRE %! assert (regexp("abc\nabc",'(?-m)c$'),7) %! assert (regexp("abc\nabc",'c$',"lineanchors"),[3,7]) %! assert (regexp("abc\nabc",'(?m)c$'),[3,7]) %!assert (regexp("this word",'s w'),4) %!assert (regexp("this word",'s w','literalspacing'),4) %!testif HAVE_PCRE %! assert (regexp("this word",'(?-x)s w','literalspacing'),4) %! assert (regexp("this word",'s w','freespacing'),zeros(1,0)) %! assert (regexp("this word",'(?x)s w'),zeros(1,0)) %!error regexp('string', 'tri', 'BadArg'); %!error regexp('string'); %!assert(regexp({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'},'-'),{6;[1,5,9];zeros(1,0)}) %!assert(regexp({'asdfg-dfd','-dfd-dfd-','qasfdfdaq'},'-'),{6,[1,5,9],zeros(1,0)}) %!assert(regexp({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'},{'-';'f';'q'}),{6;[3,7];[1,9]}) %!assert(regexp('Strings',{'t','s'}),{2,7}) */ DEFUN_DLD (regexpi, args, nargout, "-*- texinfo -*-\n\ @deftypefn {Loadable Function} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}] =} regexpi (@var{str}, @var{pat})\n\ @deftypefnx {Loadable Function} {[@dots{}] =} regexpi (@var{str}, @var{pat}, @var{opts}, @dots{})\n\ \n\ Case insensitive regular expression string matching. Matches @var{pat} in\n\ @var{str} and returns the position and matching substrings or empty values\n\ if there are none. See @code{regexp} for more details\n\ @end deftypefn") { octave_value_list retval; int nargin = args.length(); if (nargin < 2) print_usage (); else if (args(0).is_cell() || args(1).is_cell()) retval = octcellregexp (args, nargout, "regexpi", true); else retval = octregexp (args, nargout, "regexpi", true); return retval; } /* ## seg-fault test %!assert(regexpi("abcde","."),[1,2,3,4,5]) ## Check that anchoring of pattern works correctly %!assert(regexpi('abcabc','^abc'),1); %!assert(regexpi('abcabc','abc$'),4); %!assert(regexpi('abcabc','^abc$'),zeros(1,0)); %!test %! [s, e, te, m, t] = regexpi(' No Match ', 'f(.*)uck'); %! assert (s,zeros(1,0)) %! assert (e,zeros(1,0)) %! assert (te,cell(1,0)) %! assert (m, cell(1,0)) %! assert (t, cell(1,0)) %!test %! [s, e, te, m, t] = regexpi(' FiRetrUck ', 'f(.*)uck'); %! assert (s,2) %! assert (e,10) %! assert (te{1},[3,7]) %! assert (m{1}, 'FiRetrUck') %! assert (t{1}{1}, 'iRetr') %!test %! [s, e, te, m, t] = regexpi(' firetruck ', 'f(.*)uck'); %! assert (s,2) %! assert (e,10) %! assert (te{1},[3,7]) %! assert (m{1}, 'firetruck') %! assert (t{1}{1}, 'iretr') %!test %! [s, e, te, m, t] = regexpi('ShoRt Test String','\w*r\w*'); %! assert (s,[1,12]) %! assert (e,[5,17]) %! assert (size(te), [1,2]) %! assert (isempty(te{1})) %! assert (isempty(te{2})) %! assert (m{1},'ShoRt') %! assert (m{2},'String') %! assert (size(t), [1,2]) %! assert (isempty(t{1})) %! assert (isempty(t{2})) %!test %! [s, e, te, m, t] = regexpi('ShoRt Test String','\w*r\w*','once'); %! assert (s,1) %! assert (e,5) %! assert (size(te), [1,1]) %! assert (isempty(te{1})) %! assert (m{1},'ShoRt') %! ## Matlab gives [1,0] here but that seems wrong. %! assert (size(t), [1,1]) %!test %! [m, te, e, s, t] = regexpi('ShoRt Test String','\w*r\w*','once', 'match', 'tokenExtents', 'end', 'start', 'tokens'); %! assert (s,1) %! assert (e,5) %! assert (size(te), [1,1]) %! assert (isempty(te{1})) %! assert (m{1},'ShoRt') %! ## Matlab gives [1,0] here but that seems wrong. %! assert (size(t), [1,1]) %!testif HAVE_PCRE %! ## This test is expected to fail if PCRE is not installed %! [s, e, te, m, t, nm] = regexpi('ShoRt Test String','(?<word1>\w*t)\s*(?<word2>\w*t)'); %! assert (s,1) %! assert (e,10) %! assert (size(te), [1,1]) %! assert (te{1}, [1 5; 7, 10]) %! assert (m{1},'ShoRt Test') %! assert (size(t),[1,1]) %! assert (t{1}{1},'ShoRt') %! assert (t{1}{2},'Test') %! assert (size(nm), [1,1]) %! assert (!isempty(fieldnames(nm))) %! assert (sort(fieldnames(nm)),{'word1';'word2'}) %! assert (nm.word1,'ShoRt') %! assert (nm.word2,'Test') %!testif HAVE_PCRE %! ## This test is expected to fail if PCRE is not installed %! [nm, m, te, e, s, t] = regexpi('ShoRt Test String','(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens'); %! assert (s,1) %! assert (e,10) %! assert (size(te), [1,1]) %! assert (te{1}, [1 5; 7, 10]) %! assert (m{1},'ShoRt Test') %! assert (size(t),[1,1]) %! assert (t{1}{1},'ShoRt') %! assert (t{1}{2},'Test') %! assert (size(nm), [1,1]) %! assert (!isempty(fieldnames(nm))) %! assert (sort(fieldnames(nm)),{'word1';'word2'}) %! assert (nm.word1,'ShoRt') %! assert (nm.word2,'Test') %!assert(regexpi("abc\nabc",'.'),[1:7]) %!assert(regexpi("abc\nabc",'.','dotall'),[1:7]) %!testif HAVE_PCRE %! assert(regexpi("abc\nabc",'(?s).'),[1:7]) %! assert(regexpi("abc\nabc",'.','dotexceptnewline'),[1,2,3,5,6,7]) %! assert(regexpi("abc\nabc",'(?-s).'),[1,2,3,5,6,7]) %!assert(regexpi("caseCaSe",'case'),[1,5]) %!assert(regexpi("caseCaSe",'case',"matchcase"),1) %!assert(regexpi("caseCaSe",'case',"ignorecase"),[1,5]) %!testif HAVE_PCRE %! assert(regexpi("caseCaSe",'(?-i)case'),1) %! assert(regexpi("caseCaSe",'(?i)case'),[1,5]) %!assert (regexpi("abc\nabc",'c$'),7) %!assert (regexpi("abc\nabc",'c$',"stringanchors"),7) %!testif HAVE_PCRE %! assert (regexpi("abc\nabc",'(?-m)c$'),7) %! assert (regexpi("abc\nabc",'c$',"lineanchors"),[3,7]) %! assert (regexpi("abc\nabc",'(?m)c$'),[3,7]) %!assert (regexpi("this word",'s w'),4) %!assert (regexpi("this word",'s w','literalspacing'),4) %!testif HAVE_PCRE %! assert (regexpi("this word",'(?-x)s w','literalspacing'),4) %! assert (regexpi("this word",'s w','freespacing'),zeros(1,0)) %! assert (regexpi("this word",'(?x)s w'),zeros(1,0)) %!error regexpi('string', 'tri', 'BadArg'); %!error regexpi('string'); %!assert(regexpi({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'},'-'),{6;[1,5,9];zeros(1,0)}) %!assert(regexpi({'asdfg-dfd','-dfd-dfd-','qasfdfdaq'},'-'),{6,[1,5,9],zeros(1,0)}) %!assert(regexpi({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'},{'-';'f';'q'}),{6;[3,7];[1,9]}) %!assert(regexpi('Strings',{'t','s'}),{2,[1,7]}) */ static octave_value octregexprep (const octave_value_list &args, const std::string &nm) { octave_value retval; int nargin = args.length(); // Make sure we have string,pattern,replacement const std::string buffer = args(0).string_value (); if (error_state) return retval; const std::string pattern = args(1).string_value (); if (error_state) return retval; const std::string replacement = args(2).string_value (); if (error_state) return retval; // Pack options excluding 'tokenize' and various output // reordering strings into regexp arg list octave_value_list regexpargs(nargin-1,octave_value()); regexpargs(0) = args(0); regexpargs(1) = args(1); int len=2; for (int i = 3; i < nargin; i++) { const std::string opt = args(i).string_value(); if (opt != "tokenize" && opt != "start" && opt != "end" && opt != "tokenextents" && opt != "match" && opt != "tokens" && opt != "names" && opt != "warnings") { regexpargs(len++) = args(i); } } regexpargs.resize(len); // Identify replacement tokens; build a vector of group numbers in // the replacement string so that we can quickly calculate the size // of the replacement. int tokens = 0; for (size_t i=1; i < replacement.size(); i++) { if (replacement[i-1]=='$' && isdigit(replacement[i])) { tokens++, i++; } } std::vector<int> token(tokens); int kk = 0; for (size_t i = 1; i < replacement.size(); i++) { if (replacement[i-1]=='$' && isdigit(replacement[i])) { token[kk++] = replacement[i]-'0'; i++; } } // Perform replacement std::string rep; if (tokens > 0) { std::list<regexp_elem> lst; string_vector named; int nopts; int sz = octregexp_list (regexpargs, nm , false, lst, named, nopts); if (error_state) return retval; if (sz == 0) { retval = args(0); return retval; } // Determine replacement length const size_t replen = replacement.size() - 2*tokens; int delta = 0; const_iterator p = lst.begin(); for (int i = 0; i < sz; i++) { OCTAVE_QUIT; const Matrix pairs(p->te); size_t pairlen = 0; for (int j = 0; j < tokens; j++) { if (token[j] == 0) pairlen += static_cast<size_t>(p->e - p->s) + 1; else if (token[j] <= pairs.rows()) pairlen += static_cast<size_t>(pairs(token[j]-1,1) - pairs(token[j]-1,0)) + 1; } delta += static_cast<int>(replen + pairlen) - static_cast<int>(p->e - p->s + 1); p++; } // Build replacement string rep.reserve(buffer.size()+delta); size_t from = 0; p = lst.begin(); for (int i=0; i < sz; i++) { OCTAVE_QUIT; const Matrix pairs(p->te); rep.append(&buffer[from], static_cast<size_t>(p->s - 1) - from); from = static_cast<size_t>(p->e - 1) + 1; for (size_t j = 1; j < replacement.size(); j++) { if (replacement[j-1]=='$' && isdigit(replacement[j])) { int k = replacement[j]-'0'; if (k == 0) { // replace with entire match rep.append(&buffer[static_cast<size_t>(p->e - 1)], static_cast<size_t>(p->e - p->s) + 1); } else if (k <= pairs.rows()) { // replace with group capture rep.append(&buffer[static_cast<size_t>(pairs(k-1,0)-1)], static_cast<size_t>(pairs(k-1,1) - pairs(k-1,0))+1); } else { // replace with nothing } j++; } else { rep.append(1,replacement[j-1]); } if (j+1 == replacement.size()) { rep.append(1,replacement[j]); } } p++; } rep.append(&buffer[from],buffer.size()-from); } else { std::list<regexp_elem> lst; string_vector named; int nopts; int sz = octregexp_list (regexpargs, nm, false, lst, named,nopts); if (error_state) return retval; if (sz == 0) { retval = args(0); return retval; } // Determine replacement length const size_t replen = replacement.size(); int delta = 0; const_iterator p = lst.begin(); for (int i = 0; i < sz; i++) { OCTAVE_QUIT; delta += static_cast<int>(replen) - static_cast<int>(p->e - p->s + 1); p++; } // Build replacement string rep.reserve(buffer.size()+delta); size_t from = 0; p = lst.begin(); for (int i=0; i < sz; i++) { OCTAVE_QUIT; rep.append(&buffer[from], static_cast<size_t>(p->s - 1) - from); from = static_cast<size_t>(p->e - 1) + 1; rep.append(replacement); p++; } rep.append(&buffer[from],buffer.size()-from); } retval = rep; return retval; } DEFUN_DLD (regexprep, args, , "-*- texinfo -*-\n\ @deftypefn {Loadable Function} {@var{string} =} regexprep (@var{string}, @var{pat}, @var{repstr}, @var{options})\n\ Replace matches of @var{pat} in @var{string} with @var{repstr}.\n\ \n\ \n\ The replacement can contain @code{$i}, which substitutes\n\ for the ith set of parentheses in the match string. E.g.,\n\ @example\n\ \n\ regexprep(\"Bill Dunn\",'(\\w+) (\\w+)','$2, $1')\n\ \n\ @end example\n\ returns \"Dunn, Bill\"\n\ \n\ @var{options} may be zero or more of\n\ @table @samp\n\ \n\ @item once\n\ Replace only the first occurrence of @var{pat} in the result.\n\ \n\ @item warnings\n\ This option is present for compatibility but is ignored.\n\ \n\ @item ignorecase or matchcase\n\ Ignore case for the pattern matching (see @code{regexpi}).\n\ Alternatively, use (?i) or (?-i) in the pattern.\n\ \n\ @item lineanchors and stringanchors\n\ Whether characters ^ and $ match the beginning and ending of lines.\n\ Alternatively, use (?m) or (?-m) in the pattern.\n\ \n\ @item dotexceptnewline and dotall\n\ Whether . matches newlines in the string.\n\ Alternatively, use (?s) or (?-s) in the pattern.\n\ \n\ @item freespacing or literalspacing\n\ Whether whitespace and # comments can be used to make the regular expression more readable.\n\ Alternatively, use (?x) or (?-x) in the pattern.\n\ \n\ @end table\n\ @seealso{regexp,regexpi}\n\ @end deftypefn") { octave_value_list retval; int nargin = args.length(); if (nargin < 3) { print_usage (); return retval; } if (args(0).is_cell() || args(1).is_cell() || args(2).is_cell()) { Cell str; Cell pat; Cell rep; dim_vector dv0; dim_vector dv1(1,1); if (args(0).is_cell()) str = args(0).cell_value(); else str = Cell (args(0)); if (args(1).is_cell()) pat = args(1).cell_value(); else pat = Cell (args(1)); if (args(2).is_cell()) rep = args(2).cell_value(); else rep = Cell (args(2)); dv0 = str.dims(); if (pat.numel() != 1) { dv1 = pat.dims(); if (rep.numel() != 1 && dv1 != rep.dims()) error ("regexprep: Inconsistent cell array dimensions"); } else if (rep.numel() != 1) dv1 = rep.dims(); if (!error_state) { Cell ret (dv0); octave_value_list new_args = args; for (octave_idx_type i = 0; i < dv0.numel(); i++) { new_args(0) = str(i); if (pat.numel() == 1) new_args(1) = pat(0); if (rep.numel() == 1) new_args(2) = rep(0); for (octave_idx_type j = 0; j < dv1.numel(); j++) { if (pat.numel() != 1) new_args(1) = pat(j); if (rep.numel() != 1) new_args(2) = rep(j); new_args(0) = octregexprep (new_args, "regexprep"); if (error_state) break; } if (error_state) break; ret(i) = new_args(0); } if (!error_state) retval = octave_value (ret); } } else retval = octregexprep (args, "regexprep"); return retval; } /* %!test # Replace with empty %! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>'; %! t = regexprep(xml,'<[!?][^>]*>',''); %! assert(t,' <tag v="hello">some stuff</tag>') %!test # Replace with non-empty %! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>'; %! t = regexprep(xml,'<[!?][^>]*>','?'); %! assert(t,'? <tag v="hello">some stuff?</tag>') %!test # Check that 'tokenize' is ignored %! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>'; %! t = regexprep(xml,'<[!?][^>]*>','','tokenize'); %! assert(t,' <tag v="hello">some stuff</tag>') %!testif HAVE_PCRE # Capture replacement %! data = "Bob Smith\nDavid Hollerith\nSam Jenkins"; %! result = "Smith, Bob\nHollerith, David\nJenkins, Sam"; %! t = regexprep(data,'(?m)^(\w+)\s+(\w+)$','$2, $1'); %! assert(t,result) # Return the original if no match %!assert(regexprep('hello','world','earth'),'hello') ## Test a general replacement %!assert(regexprep("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_"), "a_b_c_d_e_f_g"); ## Make sure it works at the beginning and end %!assert(regexprep("a[b]c{d}e-f=g", "a", "_"), "_[b]c{d}e-f=g"); %!assert(regexprep("a[b]c{d}e-f=g", "g", "_"), "a[b]c{d}e-f=_"); ## Options %!assert(regexprep("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_", "once"), "a_b]c{d}e-f=g"); %!assert(regexprep("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "ignorecase"), "a_b_c_d_e_f_g"); ## Option combinations %!assert(regexprep("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "once", "ignorecase"), "a_b]c{d}e-f=g"); ## End conditions on replacement %!assert(regexprep("abc","(b)",".$1"),"a.bc"); %!assert(regexprep("abc","(b)","$1"),"abc"); %!assert(regexprep("abc","(b)","$1."),"ab.c"); %!assert(regexprep("abc","(b)","$1.."),"ab..c"); ## Test cell array arguments %!assert(regexprep("abc",{"b","a"},"?"),{"??c"}) %!assert(regexprep({"abc","cba"},"b","?"),{"a?c","c?a"}) %!assert(regexprep({"abc","cba"},{"b","a"},{"?","!"}),{"!?c","c?!"}) */ /* ;;; Local Variables: *** ;;; mode: C++ *** ;;; End: *** */