# HG changeset patch # User Rik # Date 1333817033 25200 # Node ID 6d5c951ec5204b469a08fd0cf0429dc6b7d27bb0 # Parent 8150ccfffa22e215b822123c57a6d91e3d4347de Add 'emptymatch', 'noemptymatch' options to regular expressions. * NEWS: Announce new options. * liboctave/regexp.cc (regexp::match): Add processing option for zero length matches. * liboctave/regexp.h (class opts): Add emptymatch option to constructors, setter/getter routines, private variable. * DLD-FUNCTIONS/regexp.cc (parse_options): Add emptymatch to options parsing routine. * DLD-FUNCTIONS/regexp.cc (octregexp): Ignore emptymatch when determining output ordering of arguments. * DLD-FUNCTIONS/regexp.cc (Fregexp): Add new options to docstring. Add %!tests for new behavior. * DLD-FUNCTIONS/regexp.cc (Fregexprep): Add %!tests for new behavior. diff --git a/NEWS b/NEWS --- a/NEWS +++ b/NEWS @@ -1,6 +1,18 @@ Summary of important user-visible changes for version 3.8: --------------------------------------------------------- + ** 'emptymatch', 'noemptymatch' options added to regular expressions. + + With this addition Octave now accepts the entire set of Matlab options + for regular expressions. 'noemptymatch' is the default, but 'emptymatch' + has certain uses where you need to match an assertion rather than actual + characters. For example, + + regexprep ('World', '^', 'Hello ', 'emptymatch') + => Hello World + + where the pattern is actually the assertion '^' or start-of-line. + ** Redundant terminal comma accepted by parser A redundant terminal comma is now accepted in matrix diff --git a/liboctave/regexp.cc b/liboctave/regexp.cc --- a/liboctave/regexp.cc +++ b/liboctave/regexp.cc @@ -315,9 +315,9 @@ } else if (matches == PCRE_ERROR_NOMATCH) break; - else if (ovector[1] <= ovector[0]) + else if (ovector[1] <= ovector[0] && ! options.emptymatch ()) { - // Zero sized match. Skip to next char. + // Zero length match. Skip to next char. 
idx = ovector[0] + 1; if (idx < buffer.length ()) continue; @@ -400,7 +400,16 @@ regexp::match_element new_elem (named_tokens, tokens, match_string, token_extents, start, end); lst.push_back (new_elem); - idx = ovector[1]; + + if (ovector[1] <= ovector[0]) + { + // Zero length match. Skip to next char. + idx = ovector[0] + 1; + if (idx <= buffer.length ()) + continue; + } + else + idx = ovector[1]; if (options.once () || idx >= buffer.length ()) break; diff --git a/liboctave/regexp.h b/liboctave/regexp.h --- a/liboctave/regexp.h +++ b/liboctave/regexp.h @@ -95,11 +95,13 @@ opts (void) : x_case_insensitive (false), x_dotexceptnewline (false), - x_freespacing (false), x_lineanchors (false), x_once (false) { } + x_emptymatch (false), x_freespacing (false), x_lineanchors (false), + x_once (false) { } opts (const opts& o) : x_case_insensitive (o.x_case_insensitive), x_dotexceptnewline (o.x_dotexceptnewline), + x_emptymatch (o.x_emptymatch), x_freespacing (o.x_freespacing), x_lineanchors (o.x_lineanchors), x_once (o.x_once) @@ -111,6 +113,7 @@ { x_case_insensitive = o.x_case_insensitive; x_dotexceptnewline = o.x_dotexceptnewline; + x_emptymatch = o.x_emptymatch; x_freespacing = o.x_freespacing; x_lineanchors = o.x_lineanchors; x_once = o.x_once; @@ -123,12 +126,14 @@ void case_insensitive (bool val) { x_case_insensitive = val; } void dotexceptnewline (bool val) { x_dotexceptnewline = val; } + void emptymatch (bool val) { x_emptymatch = val; } void freespacing (bool val) { x_freespacing = val; } void lineanchors (bool val) { x_lineanchors = val; } void once (bool val) { x_once = val; } bool case_insensitive (void) const { return x_case_insensitive; } bool dotexceptnewline (void) const { return x_dotexceptnewline; } + bool emptymatch (void) const { return x_emptymatch; } bool freespacing (void) const { return x_freespacing; } bool lineanchors (void) const { return x_lineanchors; } bool once (void) const { return x_once; } @@ -137,6 +142,7 @@ bool x_case_insensitive; bool 
x_dotexceptnewline; + bool x_emptymatch; bool x_freespacing; bool x_lineanchors; bool x_once; diff --git a/src/DLD-FUNCTIONS/regexp.cc b/src/DLD-FUNCTIONS/regexp.cc --- a/src/DLD-FUNCTIONS/regexp.cc +++ b/src/DLD-FUNCTIONS/regexp.cc @@ -77,12 +77,16 @@ options.lineanchors (false); else if (str.find ("literalspacing", 0) == 0) options.freespacing (false); + else if (str.find ("noemptymatch", 0) == 0) + options.emptymatch (false); else if (str.find ("dotexceptnewline", 0) == 0) options.dotexceptnewline (true); else if (str.find ("lineanchors", 0) == 0) options.lineanchors (true); else if (str.find ("freespacing", 0) == 0) options.freespacing (true); + else if (str.find ("emptymatch", 0) == 0) + options.emptymatch (true); else if (str.find ("start", 0) == 0 || str.find ("end", 0) == 0 || str.find ("tokenextents", 0) == 0 @@ -257,7 +261,9 @@ || str.find ("dotall", 0) == 0 || str.find ("dotexceptnewline", 0) == 0 || str.find ("literalspacing", 0) == 0 - || str.find ("freespacing", 0) == 0) + || str.find ("freespacing", 0) == 0 + || str.find ("noemptymatch", 0) == 0 + || str.find ("emptymatch", 0) == 0) continue; else if (str.find ("start", 0) == 0) k = 0; @@ -488,8 +494,8 @@ operators. For example, a template for a floating point number might be\n\ @code{[-+.\\d]+}.\n\ \n\ -@item ()\n\ -Grouping operator\n\ +@item () (?:)\n\ +Grouping operator. The first form, parentheses only, also creates a token.\n\ \n\ @item |\n\ Alternation operator. Match one of a choice of regular expressions. The\n\ @@ -562,7 +568,8 @@ @code{(?@dots{})}.\n\ \n\ @item sp\n\ -A cell array of the text not returned by match.\n\ +A cell array of the text not returned by match, i.e., what remains if you\n\ +split the string based on @var{pat}.\n\ @end table\n\ \n\ Particular output arguments, or the order of the output arguments, can be\n\ @@ -630,6 +637,15 @@ \n\ Alternatively, use (?x) in the pattern.\n\ \n\ +@item noemptymatch\n\ +Zero-length matches are not returned. 
(default)\n\ +\n\ +@item emptymatch\n\ +Return zero-length matches.\n\ +\n\ +@code{regexp ('a', 'b*', 'emptymatch')} returns @code{[1 2]} because there are\n\ +zero or more 'b' characters at positions 1 and end-of-string.\n\ +\n\ @end table\n\ @seealso{regexpi, strfind, regexprep}\n\ @end deftypefn") @@ -810,6 +826,46 @@ %! assert (regexp ("this word", 's w', 'freespacing'), zeros (1,0)); %! assert (regexp ("this word", '(?x)s w'), zeros (1,0)); +%!test +%! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '[VOCT]*', 'noemptymatch'); +%! assert (s, [1 5]); +%! assert (e, [3 5]); +%! assert (te, { zeros(0,2), zeros(0,2) }); +%! assert (m, { "OCT", "V" }); +%! assert (t, { cell(1,0), cell(1,0) }); +%! assert (isempty (fieldnames (nm))); +%! assert (sp, { "", "A", "E" }); + +%!test +%! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '([VOCT]*)', 'noemptymatch'); +%! assert (s, [1 5]); +%! assert (e, [3 5]); +%! assert (te, { [1 3], [5 5] }); +%! assert (m, { "OCT", "V" }); +%! assert (t, { {"OCT"}, {"V"} }); +%! assert (isempty (fieldnames (nm))); +%! assert (sp, { "", "A", "E" }); + +%!test +%! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '[VOCT]*', 'emptymatch'); +%! assert (s, [1 4 5 6 7]); +%! assert (e, [3 3 5 5 6]); +%! assert (te, repmat ({zeros(0,2)}, [1, 5])); +%! assert (m, { "OCT", "", "V", "", "" }); +%! assert (t, repmat({cell(1,0)}, [1, 5])); +%! assert (isempty (fieldnames (nm))); +%! assert (sp, { "", "", "A", "", "E", "" }); + +%!test +%! [s, e, te, m, t, nm, sp] = regexp ('OCTAVE', '([VOCT]*)', 'emptymatch'); +%! assert (s, [1 4 5 6 7]); +%! assert (e, [3 3 5 5 6]); +%! assert (te, { [1 3], [4 3], [5 5], [6 5], [7 6] }); +%! assert (m, { "OCT", "", "V", "", "" }); +%! assert (t, { {"OCT"}, {""}, {"V"}, {""}, {""} }); +%! assert (isempty (fieldnames (nm))); +%! 
assert (sp, { "", "", "A", "", "E", "" }); + %!error regexp ('string', 'tri', 'BadArg') %!error regexp ('string') @@ -1213,6 +1269,10 @@ ## Return the original if no match %!assert (regexprep ('hello', 'world', 'earth'), 'hello') +## Test emptymatch +%!assert (regexprep ('World', '^', 'Hello '), 'World') +%!assert (regexprep ('World', '^', 'Hello ', 'emptymatch'), 'Hello World') + ## Test a general replacement %!assert (regexprep ("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_"), "a_b_c_d_e_f_g")