Mercurial > hg > octave-nkf
diff scripts/strings/strsplit.m @ 16727:3ce3579405f8
Remove the "legacy" delimiter type from strsplit().
* strings/strsplit.m: Since ostrsplit.m has been added to Octave, remove the
support for the "legacy" delimiter type from strsplit(). Bug #39010.
author | Ben Abbott <bpabbott@mac.com> |
---|---|
date | Thu, 06 Jun 2013 11:03:22 +0800 |
parents | 9ce08a1efc29 |
children | 7268845c0a1e |
line wrap: on
line diff
--- a/scripts/strings/strsplit.m +++ b/scripts/strings/strsplit.m @@ -19,18 +19,17 @@ ## -*- texinfo -*- ## @deftypefn {Function File} {[@var{cstr}] =} strsplit (@var{s}) ## @deftypefnx {Function File} {[@var{cstr}] =} strsplit (@var{s}, @var{del}) -## @deftypefnx {Function File} {[@var{cstr}] =} strsplit (@var{s}, @var{del}, @var{collapsedelimiters}) ## @deftypefnx {Function File} {[@var{cstr}] =} strsplit (@dots{}, @var{name}, @var{value}) ## @deftypefnx {Function File} {[@var{cstr}, @var{matches}] =} strsplit (@dots{}) ## Split the string @var{s} using the delimiters specified by @var{del} -## and return a cell array of strings. For a single delimiter, @var{del} -## may be a string, or a scalar cell-string. For multible delimiters, -## @var{del} must be a cell-string array. Unless @var{collapsedelimiters} is -## specified to be @var{false}, consecutive delimiters are collapsed into one. +## and return a cell-string array of sub-strings. If a delmiter is not +## specified the string, @var{s}, is split at whitespace. The delimiter, +## @var{del} may be a string, a scalar cell-string, or cell-string array. +## @var{del} must be a cell-string array. By default, consecutive +## delimiters in the input string, @var{s}, are collapsed into one. ## ## The second output, @var{matches}, returns the delmiters which were matched -## in the original string. The matched delimiters are uneffected by the -## @var{collapsedelimiters}. +## in the original string. ## ## Example: ## @@ -76,21 +75,11 @@ ## Supported @var{name}/@var{value} pair arguments are; ## ## @itemize -## @item @var{collapsedelimiters} may take the value of @var{true} or @var{false} -## with the default being @var{false}. -## @item @var{delimitertype} may take the value of @code{legacy}, -## @code{simple} or @code{regularexpression}. -## If @var{delimitertype} is equal to @code{legacy}, each individual -## character of @var{del} is used to split the input. 
For both @code{simple} -## and @code{regularexpression}, the string is split at the boundaries of the -## delimiter string. If @var{delimiter} is a cell-string, then the string -## is split at the boundaries of each of the cells' strings. @var{simple} -## delimiters may contain escaped characters, but are otherwise treated as -## literal strings. -## -## If the specified delimiters are single characters, the default is -## @var{delimitertype} is @code{legacy}. Otherwise the default -## @var{delimitertype} is @code{simple}. +## @item @var{collapsedelimiters} may take the value of @var{true} or +## @var{false} with the default being @var{false}. +## @item @var{delimitertype} may take the value of @code{simple} or +## @code{regularexpression}. The default is @var{delimitertype} is +## @code{simple}. ## @end itemize ## ## Example: @@ -115,16 +104,6 @@ ## [1,5] = c ## @} ## -## strsplit ("a,,b, c", ", ", false, "delimitertype", "legacy") -## @result{} -## @{ -## [1,1] = a -## [1,2] = -## [1,3] = b -## [1,4] = -## [1,5] = c -## @} -## ## strsplit ("a,\t,b, c", @{',', '\s'@}, "delimitertype", "regularexpression") ## @result{} ## @{ @@ -132,10 +111,21 @@ ## [1,2] = b ## [1,3] = c ## @} +## +## strsplit ("a,\t,b, c", @{',', ' ', '\t'@}, "collapsedelimiters", false) +## @result{} +## @{ +## [1,1] = a +## [1,2] = +## [1,3] = +## [1,4] = b +## [1,5] = +## [1,6] = c +## @} ## @end group ## @end example ## -## @seealso{strjoin, strtok, regexp} +## @seealso{ostrsplit, strjoin, strtok, regexp} ## @end deftypefn function [result, matches] = strsplit (str, del, varargin) @@ -169,15 +159,7 @@ endfor if (strcmpi (args.delimitertype, "default")) - if (nargin == 1 || numel (del) == 1 - || (nargin > 1 && (islogical (del) || isnumeric (del))) - || iscell (del) && all (cellfun (@numel, del) < 2)) - ## For single character delimiters, default to "legacy" - args.delimitertype = "legacy"; - else - ## For multi-character delimiters, default to "simple" - args.delimitertype = "simple"; - endif 
+ args.delimitertype = "simple"; endif # Save the length of the "delimitertype" parameter @@ -191,8 +173,6 @@ ## Set proper default for the delimiter type if (strncmpi (args.delimitertype, "simple", numel (args.delimitertype))) del = {" ","\f","\n","\r","\t","\v"}; - elseif (strncmpi (args.delimitertype, "legacy", numel (args.delimitertype))) - del = " \f\n\r\t\v"; else del = "\\s"; endif @@ -210,70 +190,12 @@ else del = do_string_escapes (del); endif + % This is clumsy, but needed for multi-row strings del = regexprep (del, '([^\w])', '\\$1'); endif - if (rows (str) > 1) - tmp = char (del(1)); - str = [str, repmat(tmp,rows(str),1)]; - str = reshape (str.', 1, numel (str)); - str(end-numel(tmp)+1:end) = []; - endif - if (isempty (str)) result = {str}; - elseif (strncmpi (args.delimitertype, "legacy", length_deltype)) - ## Legacy splitting is fast - if (! ischar (del)) - if (iscell (del) && all (cellfun (@numel, del) < 2)) - del = [del{:}]; - else - error ("strsplit:legacy_delimiter_must_be_char", - "%s %s", "strsplit: for DELIMITERTYPE = ""legacy"" ", - "DEL must be a string, or a cell array scalar character elements.") - endif - endif - if (strcmp (typeinfo (del), "sq_string")) - del = do_string_escapes (del); - endif - ## Split str at each character contained in del - if (isscalar (del)) - ## Single separator - idx = find (str == del); - else - ## Multiple separators - idx = strchr (str, del); - endif - - ## Get substring lengths. - if (isempty (idx)) - strlens = length (str); - else - strlens = [idx(1)-1, diff(idx)-1, numel(str)-idx(end)]; - endif - if (nargout > 1) - ## Grab the separators - matches = num2cell (str(idx)(:)).'; - if (args.collapsedelimiters) - ## Collapse the consequtive delimiters - ## TODO - is there a vectorized way? - for m = numel(matches):-1:2 - if (strlens(m) == 0) - matches{m-1} = [matches{m-1:m}]; - matches(m) = []; - endif - end - endif - endif - ## Remove separators. 
- str(idx) = []; - if (args.collapsedelimiters) - ## Omit zero lengths. - strlens = strlens(strlens != 0); - endif - - ## Convert! - result = mat2cell (str, 1, strlens); elseif (strncmpi (args.delimitertype, "regularexpression", length_deltype) || strncmpi (args.delimitertype, "simple", length_deltype)) if (iscellstr (del)) @@ -290,9 +212,6 @@ endif endfunction -% Mimic the old strsplit() -%!assert (cellfun (@numel, strsplit (["a,b,c";"1,2 "], ",")), [1 1 2 1 4]) - %!shared str %! str = "The rain in Spain stays mainly in the plain."; % Split on all whitespace. @@ -323,7 +242,7 @@ %!assert (strsplit ("road to^hell", {" ","^"}), {"road", "to", "hell"}) %!assert (strsplit ("road to--hell", {" ","-"}, true), {"road", "to", "hell"}) %!assert (strsplit (["a,bc,,de"], ",", false, "delimitertype", "s"), {"a", "bc", "", "de"}) -%!assert (strsplit (["a,bc,,de"], ",", false), {"a", "bc", char(ones(1,0)), "de"}) +%!assert (strsplit (["a,bc,,de"], ",", false), {"a", "bc", "", "de"}) %!assert (strsplit (["a,bc,de"], ",", true), {"a", "bc", "de"}) %!assert (strsplit (["a,bc,de"], {","," "}, true), {"a", "bc", "de"}) @@ -337,14 +256,6 @@ %!assert (strsplit (["a,bc,de"], "[, ]", true, "delimitertype", "r"), {"a", "bc", "de"}) %!assert (strsplit ("hello \t world", 1, "delimitertype", "r"), {"hello", "world"}); -%!assert (strsplit ("road to hell", " ", false, "delimitertype", "l"), {"road", "to", "hell"}) -%!assert (strsplit ("road to^hell", " ^", false, "delimitertype", "l"), {"road", "to", "hell"}) -%!assert (strsplit ("road to--hell", " -", true, "delimitertype", "l"), {"road", "to", "hell"}) -%!assert (strsplit (["a,bc";",de"], ",", false, "delimitertype", "l"), {"a", "bc", char(ones(1,0)), "de "}) -%!assert (strsplit (["a,bc";",de"], ",", true, "delimitertype", "l"), {"a", "bc", "de "}) -%!assert (strsplit (["a,bc";",de"], ", ", true, "delimitertype", "l"), {"a", "bc", "de"}) - -%!assert (strsplit ("foo\tbar", '\t', "delimitertype", "l"), {"foo", "bar"}) %!assert (strsplit 
("foo\tbar", '\t', "delimitertype", "r"), {"foo", "bar"}) %!assert (strsplit ("foo\tbar", '\t', "delimitertype", "s"), {"foo", "bar"}) @@ -367,14 +278,16 @@ %! assert (a, {"a", "b"}) %! assert (m, {"\t \n"}) %!test -%! [a, m] = strsplit ("a123b", "123", "delimitertype", "legacy"); -%! assert (a, {"a", "b"}) -%! assert (m, {"123"}) -%!test %! [s, m] = strsplit ("hello \t world", 1); %! assert (s, {"hello", "world"}); %! assert (m, {" \t "}); +%% Compatibility +%! assert (strsplit ("", "a"), {""}) +%! assert (strsplit ("a", "a"), {"", ""}) +%! assert (strsplit ("aa", "a"), {"", ""}) +%! assert (strsplit ("aaa", "a"), {"", ""}) + %% Test input validation %!error strsplit () %!error strsplit ("abc", "b", true, 4)