Mercurial > hg > octave-nkf
view scripts/strings/strsplit.m @ 16557:d50bca1cdc22
Support escaped characters in sq_strings for strsplit.m.
* scripts/strings/strsplit.m: Remove private function regexp2simple() and
replace with regexprep(). Apply do_string_escapes() to sq_strings. Improve
doc-string. Add tests.
author | Ben Abbott <bpabbott@mac.com> |
---|---|
date | Tue, 23 Apr 2013 20:26:07 -0400 |
parents | 03a28487fa9d |
children | 9ce08a1efc29 |
line wrap: on
line source
## Copyright (C) 2009-2012 Jaroslav Hajek
##
## This file is part of Octave.
##
## Octave is free software; you can redistribute it and/or modify it
## under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 3 of the License, or (at
## your option) any later version.
##
## Octave is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Octave; see the file COPYING.  If not, see
## <http://www.gnu.org/licenses/>.

## -*- texinfo -*-
## @deftypefn  {Function File} {[@var{cstr}] =} strsplit (@var{s})
## @deftypefnx {Function File} {[@var{cstr}] =} strsplit (@var{s}, @var{del})
## @deftypefnx {Function File} {[@var{cstr}] =} strsplit (@var{s}, @var{del}, @var{collapsedelimiters})
## @deftypefnx {Function File} {[@var{cstr}] =} strsplit (@dots{}, @var{name}, @var{value})
## @deftypefnx {Function File} {[@var{cstr}, @var{matches}] =} strsplit (@dots{})
## Split the string @var{s} using the delimiters specified by @var{del}
## and return a cell array of strings.  For a single delimiter, @var{del}
## may be a string, or a scalar cell-string.  For multiple delimiters,
## @var{del} must be a cell-string array.  Unless @var{collapsedelimiters} is
## specified to be @var{false}, consecutive delimiters are collapsed into one.
##
## If @var{s} is a character matrix, its rows are joined into a single
## string, separated by the first delimiter, before splitting.
##
## The second output, @var{matches}, returns the delimiters which were matched
## in the original string.  When @var{collapsedelimiters} is @var{true}, each
## run of consecutive delimiters is returned as a single element.
##
## Example:
##
## @example
## @group
## strsplit ("a b c")
##       @result{}
##           @{
##             [1,1] = a
##             [1,2] = b
##             [1,3] = c
##           @}
##
## strsplit ("a,b,c", ",")
##       @result{}
##           @{
##             [1,1] = a
##             [1,2] = b
##             [1,3] = c
##           @}
##
## strsplit ("a foo b,bar c", @{" ", ",", "foo", "bar"@})
##       @result{}
##           @{
##             [1,1] = a
##             [1,2] = b
##             [1,3] = c
##           @}
##
## strsplit ("a,,b, c", @{",", " "@}, false)
##       @result{}
##           @{
##             [1,1] = a
##             [1,2] =
##             [1,3] = b
##             [1,4] =
##             [1,5] = c
##           @}
##
## @end group
## @end example
##
## Supported @var{name}/@var{value} pair arguments are:
##
## @itemize
## @item @var{collapsedelimiters} may take the value of @var{true} or
## @var{false} with the default being @var{true}.
##
## @item @var{delimitertype} may take the value of @code{legacy},
## @code{simple} or @code{regularexpression}.
## If @var{delimitertype} is equal to @code{legacy}, each individual
## character of @var{del} is used to split the input.  For both @code{simple}
## and @code{regularexpression}, the string is split at the boundaries of the
## delimiter string.  If @var{del} is a cell-string, then the string
## is split at the boundaries of each of the cells' strings.  @code{simple}
## delimiters may contain escaped characters, but are otherwise treated as
## literal strings.
##
## If the specified delimiters are single characters, the default
## @var{delimitertype} is @code{legacy}.  Otherwise the default
## @var{delimitertype} is @code{simple}.
## @end itemize
##
## Example:
##
## @example
## @group
## strsplit ("a foo b,bar c", ",|\\s|foo|bar", "delimitertype", "regularexpression")
##       @result{}
##           @{
##             [1,1] = a
##             [1,2] = b
##             [1,3] = c
##           @}
##
## strsplit ("a,,b, c", "[, ]", false, "delimitertype", "regularexpression")
##       @result{}
##           @{
##             [1,1] = a
##             [1,2] =
##             [1,3] = b
##             [1,4] =
##             [1,5] = c
##           @}
##
## strsplit ("a,,b, c", ", ", false, "delimitertype", "legacy")
##       @result{}
##           @{
##             [1,1] = a
##             [1,2] =
##             [1,3] = b
##             [1,4] =
##             [1,5] = c
##           @}
##
## strsplit ("a,\t,b, c", @{',', '\s'@}, "delimitertype", "regularexpression")
##       @result{}
##           @{
##             [1,1] = a
##             [1,2] = b
##             [1,3] = c
##           @}
## @end group
## @end example
##
## @seealso{strjoin, strtok, regexp}
## @end deftypefn

function [result, matches] = strsplit (str, del, varargin)

  ## Validate the argument count first; everything below reads STR and DEL.
  if (nargin < 1)
    print_usage ();
  endif

  args.collapsedelimiters = true;
  args.delimitertype = "default";

  ## REG holds the leading non-string arguments, PARAMS the name/value pairs.
  [reg, params] = parseparams (varargin);

  if (numel (reg) > 1)
    print_usage ();
  elseif (numel (reg) == 1)
    if (islogical (reg{1}) || isnumeric (reg{1}))
      args.collapsedelimiters = reg{1};
    else
      print_usage ();
    endif
  endif

  fields = fieldnames (args);
  for n = 1:2:numel (params)
    if (any (strcmpi (params{n}, fields)))
      if (n == numel (params))
        ## A parameter name was given without a value.
        print_usage ();
      endif
      args.(lower (params{n})) = params{n+1};
    elseif (ischar (params{n}))
      ## Report the offending entry of PARAMS (not VARARGIN, whose indices
      ## are shifted by the entries that went into REG).
      error ("strsplit:invalid_parameter_name",
             "strsplit: Invalid parameter name, `%s'", params{n});
    else
      print_usage ();
    endif
  endfor

  if (strcmpi (args.delimitertype, "default"))
    if (nargin == 1 || numel (del) == 1
        || (nargin > 1 && (islogical (del) || isnumeric (del)))
        || (iscell (del) && all (cellfun (@numel, del) < 2)))
      ## For single character delimiters, default to "legacy"
      args.delimitertype = "legacy";
    else
      ## For multi-character delimiters, default to "simple"
      args.delimitertype = "simple";
    endif
  endif

  ## Save the length of the "delimitertype" parameter.  It is used with
  ## strncmpi so that abbreviations such as "l", "s" or "r" are accepted.
  length_deltype = numel (args.delimitertype);
  is_simple = strncmpi (args.delimitertype, "simple", length_deltype);
  is_legacy = strncmpi (args.delimitertype, "legacy", length_deltype);

  if (nargin == 1 || (nargin > 1 && (islogical (del) || isnumeric (del))))
    if (nargin > 1)
      ## Second input is the "collapsedelimiters" parameter
      args.collapsedelimiters = del;
    endif
    ## Set proper default for the delimiter type
    if (is_simple)
      del = {" ","\f","\n","\r","\t","\v"};
    elseif (is_legacy)
      del = " \f\n\r\t\v";
    else
      del = "\\s";
    endif
  endif

  if (! ischar (str) || (! ischar (del) && ! iscellstr (del)))
    error ("strsplit: S and DEL must be string values");
  elseif (! isscalar (args.collapsedelimiters))
    error ("strsplit: COLLAPSEDELIMITERS must be a scalar value");
  endif

  if (is_simple)
    ## Expand escape sequences such as '\t' given in single-quoted strings.
    if (iscellstr (del))
      del = cellfun (@do_string_escapes, del, "uniformoutput", false);
    else
      del = do_string_escapes (del);
    endif
  endif

  if (rows (str) > 1)
    ## Flatten a character matrix into one row, joining the rows with a
    ## delimiter.  This is done before "simple" delimiters are escaped for
    ## regexp, otherwise the rows would be joined with a backslash.
    if (iscellstr (del))
      tmp = del{1};
    elseif (is_simple)
      tmp = del;
    else
      tmp = del(1);
    endif
    str = [str, repmat(tmp,rows(str),1)];
    str = reshape (str.', 1, numel (str));
    ## Drop the delimiter appended after the last row.
    str(end-numel(tmp)+1:end) = [];
  endif

  if (is_simple)
    ## Escape every non-word character so that the delimiters are matched
    ## literally by regexp.
    del = regexprep (del, '([^\w])', '\\$1');
  endif

  if (isempty (str))
    result = {str};
    ## Nothing can match in an empty string.
    matches = cell (1, 0);
  elseif (is_legacy)
    ## Legacy splitting is fast
    if (! ischar (del))
      if (iscell (del) && all (cellfun (@numel, del) < 2))
        del = [del{:}];
      else
        error ("strsplit:legacy_delimiter_must_be_char", "%s %s", "strsplit: for DELIMITERTYPE = ""legacy"" ", "DEL must be a string, or a cell array scalar character elements.");
      endif
    endif
    if (strcmp (typeinfo (del), "sq_string"))
      del = do_string_escapes (del);
    endif
    ## Split str at each character contained in del
    if (isscalar (del))
      ## Single separator
      idx = find (str == del);
    else
      ## Multiple separators
      idx = strchr (str, del);
    endif

    ## Get substring lengths.
    if (isempty (idx))
      strlens = length (str);
    else
      strlens = [idx(1)-1, diff(idx)-1, numel(str)-idx(end)];
    endif
    if (nargout > 1)
      ## Grab the separators
      matches = num2cell (str(idx)(:)).';
      if (args.collapsedelimiters)
        ## Collapse the consecutive delimiters.  Walk backwards so that
        ## deleting an element does not shift the ones still to be visited.
        ## TODO - is there a vectorized way?
        for m = numel (matches):-1:2
          if (strlens(m) == 0)
            matches{m-1} = [matches{m-1:m}];
            matches(m) = [];
          endif
        endfor
      endif
    endif
    ## Remove separators.
    str(idx) = [];
    if (args.collapsedelimiters)
      ## Omit zero lengths.
      strlens = strlens(strlens != 0);
    endif

    ## Convert!
    result = mat2cell (str, 1, strlens);
  elseif (strncmpi (args.delimitertype, "regularexpression", length_deltype)
          || is_simple)
    if (iscellstr (del))
      ## Combine the delimiters into a single alternation.
      del = sprintf ('%s|', del{:});
      del(end) = [];
    endif
    if (args.collapsedelimiters)
      del = ["(", del, ")+"];
    endif
    [result, ~, ~, ~, matches] = regexp (str, del, "split");
  else
    error ("strsplit:invalid_delimitertype",
           "strsplit: Invalid DELIMITERTYPE");
  endif

endfunction


% Mimic the old strsplit()
%!assert (cellfun (@numel, strsplit (["a,b,c";"1,2 "], ",")), [1 1 2 1 4])

%!shared str
%! str = "The rain in Spain stays mainly in the plain.";
% Split on all whitespace.
%!assert (strsplit (str), {"The", "rain", "in", "Spain", "stays", ...
%! "mainly", "in", "the", "plain."})
% Split on "ain".
%!assert (strsplit (str, "ain"), {"The r", " in Sp", " stays m", ...
%! "ly in the pl", "."})
% Split on " " and "ain" (treating multiple delimiters as one).
%!test
%! s = strsplit (str, '\s|ain', true, "delimitertype", "r");
%! assert (s, {"The", "r", "in", "Sp", "stays", "m", "ly", "in", "the", "pl", "."})
%!test
%! s = strsplit (str, "\\s|ain", true, "delimitertype", "r");
%! assert (s, {"The", "r", "in", "Sp", "stays", "m", "ly", "in", "the", "pl", "."})
%!test
%! [s, m] = strsplit (str, {"\\s", "ain"}, true, "delimitertype", "r");
%! assert (s, {"The", "r", "in", "Sp", "stays", "m", "ly", "in", "the", "pl", "."})
%! assert (m, {" ", "ain ", " ", "ain ", " ", "ain", " ", " ", " ", "ain"})
% Split on " " and "ain", and treat multiple delimiters separately.
%!test
%! [s, m] = strsplit (str, {" ", "ain"}, "collapsedelimiters", false);
%! assert (s, {"The", "r", "", "in", "Sp", "", "stays", "m", "ly", "in", "the", "pl", "."})
%! assert (m, {" ", "ain", " ", " ", "ain", " ", " ", "ain", " ", " ", " ", "ain"})

%!assert (strsplit ("road to hell"), {"road", "to", "hell"})
%!assert (strsplit ("road to hell", " "), {"road", "to", "hell"})
%!assert (strsplit ("road to^hell", {" ","^"}), {"road", "to", "hell"})
%!assert (strsplit ("road to--hell", {" ","-"}, true), {"road", "to", "hell"})
%!assert (strsplit (["a,bc,,de"], ",", false, "delimitertype", "s"), {"a", "bc", "", "de"})
%!assert (strsplit (["a,bc,,de"], ",", false), {"a", "bc", char(ones(1,0)), "de"})
%!assert (strsplit (["a,bc,de"], ",", true), {"a", "bc", "de"})
%!assert (strsplit (["a,bc,de"], {","," "}, true), {"a", "bc", "de"})

%!assert (strsplit ("road to hell", " ", "delimitertype", "r"), {"road", "to", "hell"})
%!assert (strsplit ("road to^hell", '\^| ', "delimitertype", "r"), {"road", "to", "hell"})
%!assert (strsplit ("road to^hell", "[ ^]", "delimitertype", "r"), {"road", "to", "hell"})
%!assert (strsplit ("road to--hell", "[ -]", false, "delimitertype", "r"), {"road", "", "", "to", "", "hell"})
%!assert (strsplit (["a,bc,de"], ",", "delimitertype", "r"), {"a", "bc", "de"})
%!assert (strsplit (["a,bc,,de"], ",", false, "delimitertype", "r"), {"a", "bc", "", "de"})
%!assert (strsplit (["a,bc,de"], ",", true, "delimitertype", "r"), {"a", "bc", "de"})
%!assert (strsplit (["a,bc,de"], "[, ]", true, "delimitertype", "r"), {"a", "bc", "de"})
%!assert (strsplit ("hello \t world", 1, "delimitertype", "r"), {"hello", "world"});

%!assert (strsplit ("road to hell", " ", false, "delimitertype", "l"), {"road", "to", "hell"})
%!assert (strsplit ("road to^hell", " ^", false, "delimitertype", "l"), {"road", "to", "hell"})
%!assert (strsplit ("road to--hell", " -", true, "delimitertype", "l"), {"road", "to", "hell"})
%!assert (strsplit (["a,bc";",de"], ",", false, "delimitertype", "l"), {"a", "bc", char(ones(1,0)), "de "})
%!assert (strsplit (["a,bc";",de"], ",", true, "delimitertype", "l"), {"a", "bc", "de "})
%!assert (strsplit (["a,bc";",de"], ", ", true, "delimitertype", "l"), {"a", "bc", "de"})

%!assert (strsplit ("foo\tbar", '\t', "delimitertype", "l"), {"foo", "bar"})
%!assert (strsplit ("foo\tbar", '\t', "delimitertype", "r"), {"foo", "bar"})
%!assert (strsplit ("foo\tbar", '\t', "delimitertype", "s"), {"foo", "bar"})

## Test "match" for consecutive delimiters
%!test
%! [a, m] = strsplit ("a\t \nb", '\s', 'delimitertype', 'regularexpression',
%!   'collapsedelimiters', false);
%! assert (a, {"a", "", "", "b"})
%! assert (m, {"\t", " ", "\n"})
%!test
%! [a, m] = strsplit ("a\t \nb", '\s', false, 'delimitertype', 'regularexpression');
%! assert (a, {"a", "", "", "b"})
%! assert (m, {"\t", " ", "\n"})
%!test
%! [a, m] = strsplit ("a\t \nb", '\s', "delimitertype", "regularexpression");
%! assert (a, {"a", "b"})
%! assert (m, {"\t \n"})
%!test
%! [a, m] = strsplit ("a\t \nb", {"\t", " ", "\n"}, "delimitertype", "simple");
%! assert (a, {"a", "b"})
%! assert (m, {"\t \n"})
%!test
%! [a, m] = strsplit ("a123b", "123", "delimitertype", "legacy");
%! assert (a, {"a", "b"})
%! assert (m, {"123"})
%!test
%! [s, m] = strsplit ("hello \t world", 1);
%! assert (s, {"hello", "world"});
%! assert (m, {" \t "});

## Rows of a character matrix are joined with the unescaped "simple" delimiter
%!assert (strsplit (["a,b";"c,d"], ",", "delimitertype", "s"), {"a", "b", "c", "d"})

## Empty input with two outputs
%!test
%! [s, m] = strsplit ("");
%! assert (s, {""});
%! assert (iscell (m) && isempty (m));

%% Test input validation
%!error strsplit ()
%!error strsplit ("abc", "b", true, 4)
%!error <S and DEL must be string values> strsplit (123, "b")
%!error <COLLAPSEDELIMITERS must be a scalar value> strsplit ("abc", "def", ones (3,3))
%!error <Invalid parameter name> strsplit ("abc", "b", true, "foo", 1)