Mercurial > hg > octave-nkf
changeset 12877:ddea3962b024
Various improvements to strread, textread, textscan functions
Eliminate redundant code, do better input validation, use
one-line code idioms where possible, and improve documentation.
* strread.m: Validate 'treatasempty' option. Remove redundant
code to find nfields. Initialize litptr.
* textread.m: Only read enough of file to do format_repeat_count
operations. Improve documentation and use one-line code idioms.
Add new tests.
* textscan.m: Only read enough of file to do format_repeat_count
operations. Improve documentation and use one-line code idioms.
Add new tests.
author | Rik <octave@nomad.inbox5.com> |
---|---|
date | Sun, 24 Jul 2011 14:15:31 -0700 |
parents | 29cd5a828bb2 |
children | 875c735c0929 |
files | scripts/io/strread.m scripts/io/textread.m scripts/io/textscan.m |
diffstat | 3 files changed, 133 insertions(+), 115 deletions(-) [+] |
line wrap: on
line diff
--- a/scripts/io/strread.m +++ b/scripts/io/strread.m @@ -26,10 +26,9 @@ ## ## The string @var{str} is split into words that are repeatedly matched to the ## specifiers in @var{format}. The first word is matched to the first -## specifier, -## the second to the second specifier and so forth. If there are more words -## than -## specifiers, the process is repeated until all words have been processed. +## specifier, the second to the second specifier and so forth. If there are +## more words than specifiers, the process is repeated until all words have +## been processed. ## ## The string @var{format} describes how the words in @var{str} should be ## parsed. @@ -224,9 +223,12 @@ case "returnonerror" err_action = varargin{n+1}; case "treatasempty" - empty_str = varargin{n+1}; - if (ischar (empty_str)) - empty_str = {empty_str}; + if (iscellstr (varargin{n+1})) + empty_str = varargin{n+1}; + elseif (ischar (varargin{n+1})) + empty_str = varargin(n+1); + else + error ('strread: "treatasempty" value must be string or cellstr'); endif otherwise warning ('strread: unknown property "%s"', varargin{n}); @@ -234,11 +236,7 @@ endfor ## Parse format string to compare nr. of conversion fields and nargout - idx = strfind (format, "%")'; - specif = format([idx, idx+1]); - nspecif = length (idx); - idx_star = strfind (format, "%*"); - nfields = length (idx) - length (idx_star); + nfields = length (strfind (format, "%")) - length (strfind (format, "%*")); ## If str only has numeric fields, a (default) format ("%f") will do. ## Otherwise: if ((max (nargout, 1) != nfields) && ! strcmp (format, "%f")) @@ -300,6 +298,7 @@ if (! isempty (white_spaces)) ## Check for overlapping whitespaces and delimiters & trim whitespace + ## FIXME: Can this section be replaced by call to setdiff() ? if (! isempty (delimiter_str)) [ovlp, iw] = intersect (white_spaces, delimiter_str); if (! isempty (ovlp)) @@ -362,6 +361,7 @@ ## Replace TreatAsEmpty char sequences by empty strings if (! 
isempty (empty_str)) ## FIXME: There should be a simpler way to do this with cellfun + ## or possibly with regexprep for ii = 1:numel (empty_str) idz = strmatch (empty_str{ii}, words, "exact"); words(idz) = {""}; @@ -393,7 +393,7 @@ ## 1. Assess "period" in the split-up words array ( < num_words_per_line). ## Could be done using EndOfLine but that prohibits EndOfLine = "" option. fmt_in_word = cell (num_words_per_line, 1); - words_period = 1; + words_period = litptr = 1; ## For each literal in turn for ii = 1:numel (idy) fmt_in_word(idy(ii)) = num_words;
--- a/scripts/io/textread.m +++ b/scripts/io/textread.m @@ -19,7 +19,9 @@ ## -*- texinfo -*- ## @deftypefn {Function File} {[@var{a}, @dots{}] =} textread (@var{filename}) ## @deftypefnx {Function File} {[@var{a}, @dots{}] =} textread (@var{filename}, @var{format}) +## @deftypefnx {Function File} {[@var{a}, @dots{}] =} textread (@var{filename}, @var{format}, @var{n}) ## @deftypefnx {Function File} {[@var{a}, @dots{}] =} textread (@var{filename}, @var{format}, @var{prop1}, @var{value1}, @dots{}) +## @deftypefnx {Function File} {[@var{a}, @dots{}] =} textread (@var{filename}, @var{format}, @var{n}, @var{prop1}, @var{value1}, @dots{}) ## Read data from a text file. ## ## The file @var{filename} is read and parsed according to @var{format}. The @@ -39,6 +41,9 @@ ## delimiters. ## @end itemize ## +## The optional input @var{n} specifes the number of times to use +## @var{format} when parsing, i.e., the format repeat count. +## ## @seealso{strread, load, dlmread, fscanf, textscan} ## @end deftypefn @@ -49,7 +54,7 @@ print_usage (); endif - if (!ischar (filename) || !ischar (format)) + if (! ischar (filename) || ! ischar (format)) error ("textread: FILENAME and FORMAT arguments must be strings"); endif @@ -67,47 +72,60 @@ varargin(headerlines:headerlines+1) = []; endif - str = fread (fid, "char=>char").'; + if (nargin > 2 && isnumeric (varargin{1})) + nlines = varargin{1}; + else + nlines = Inf; + endif + + if (isfinite (nlines)) + str = tmp_str = ""; + n = 0; + ## FIXME: Can this be done without slow loop? + while (ischar (tmp_str) && n++ <= nlines) + str = strcat (str, tmp_str); + tmp_str = fgets (fid); + endwhile + else + str = fread (fid, "char=>char").'; + endif fclose (fid); if (isempty (str)) warning ("textread: empty file"); + return; + endif + + endofline = find (strcmpi (varargin, "endofline"), 1); + if (! isempty (endofline)) + ## 'endofline' option set by user. + if (! 
ischar (varargin{endofline + 1})); + error ("textread: character value required for EndOfLine"); + endif else - endofline = find (strcmpi (varargin, "endofline"), 1); - if (! isempty (endofline)) - ## 'endofline' option set by user. - endofline = find (strcmpi (varargin, "endofline"), 1); - if (! ischar (varargin{endofline + 1})); - error ("textscan: character value required for EndOfLine"); - endif + ## Determine EOL from file. Search for EOL candidates in first 3000 chars + eol_srch_len = min (length (str), 3000); + ## First try DOS (CRLF) + if (! isempty (findstr ("\r\n", str(1 : eol_srch_len)))) + eol_char = "\r\n"; + ## Perhaps old Macintosh? (CR) + elseif (! isempty (findstr ("\r", str(1 : eol_srch_len)))) + eol_char = "\r"; + ## Otherwise, use plain UNIX (LF) else - ## Determine EOL from file. Search for EOL candidates in first 3000 chars - eol_srch_len = min (length (str), 3000); - ## First try DOS (CRLF) - if (! isempty (findstr ("\r\n", str(1 : eol_srch_len)))) - eol_char = "\r\n"; - ## Perhaps old Macintosh? (CR) - elseif (! 
isempty (findstr ("\r", str(1 : eol_srch_len)))) - eol_char = "\r"; - ## Otherwise, use plain UNIX (LF) - else - eol_char = "\n"; - endif - ## Set up default endofline param value - nargs = numel (varargin); - varargin(nargs+1:nargs+2) = {'endofline', eol_char}; + eol_char = "\n"; endif + ## Set up default endofline param value + varargin(end+1:end+2) = {'endofline', eol_char}; + endif - ## Set up default whitespace param value if needed - if (isempty (find (strcmpi ('whitespace', varargin)))) - nargs = numel (varargin); - varargin(nargs+1:nargs+2) = {'whitespace', " \b\t"}; - endif + ## Set up default whitespace param value if needed + if (isempty (find (strcmpi ('whitespace', varargin)))) + varargin(end+1:end+2) = {'whitespace', " \b\t"}; + endif - ## Call strread to make it do the real work - [varargout{1:max (nargout, 1)}] = strread (str, format, varargin {:}); - - endif + ## Call strread to make it do the real work + [varargout{1:max (nargout, 1)}] = strread (str, format, varargin {:}); endfunction @@ -125,5 +143,6 @@ %% Test input validation %!error textread () %!error textread (1) -%!error textread ("fname", 1) +%!error <arguments must be strings> textread (1, '%f') +%!error <arguments must be strings> textread ("fname", 1)
--- a/scripts/io/textscan.m +++ b/scripts/io/textscan.m @@ -22,8 +22,8 @@ ## @deftypefnx {Function File} {@var{C} =} textscan (@var{fid}, @var{format}, @var{param}, @var{value}, @dots{}) ## @deftypefnx {Function File} {@var{C} =} textscan (@var{fid}, @var{format}, @var{n}, @var{param}, @var{value}, @dots{}) ## @deftypefnx {Function File} {@var{C} =} textscan (@var{str}, @dots{}) -## @deftypefnx {Function File} {[@var{C}, @var{position}] =} textscan (@dots{}) -## Read data from a text file. +## @deftypefnx {Function File} {[@var{C}, @var{position}] =} textscan (@var{fid}, @dots{}) +## Read data from a text file or string. ## ## The file associated with @var{fid} is read and parsed according to ## @var{format}. The function behaves like @code{strread} except it works by @@ -35,7 +35,7 @@ ## ## @itemize ## @item "headerlines": -## The first @var{value} number of lines of @var{str} are skipped. +## The first @var{value} number of lines of @var{fid} are skipped. ## ## @item "endofline": ## Specify a single character or "\r\n". If no value is given, it will be @@ -47,10 +47,10 @@ ## have been encountered. If set to 0 or false, return an error and no data. ## @end itemize ## -## The optional input, @var{n}, specifes the number of lines to be read from -## the file, associated with @var{fid}. +## The optional input @var{n} specifes the number of times to use +## @var{format} when parsing, i.e., the format repeat count. ## -## The output, @var{C}, is a cell array whose length is given by the number +## The output @var{C} is a cell array whose length is given by the number ## of format specifiers. ## ## The second output, @var{position}, provides the position, in characters, @@ -75,21 +75,19 @@ endif if (! 
ischar (format)) - error ("textscan: FORMAT must be a valid specification"); + error ("textscan: FORMAT must be a string"); endif - if (nargin > 2 && isnumeric (varargin{1})) - nlines = varargin{1}; - args = varargin(2:end); + args = varargin; + if (nargin > 2 && isnumeric (args{1})) + nlines = args{1}; else nlines = Inf; - args = varargin; endif if (! any (strcmpi (args, "emptyvalue"))) ## Matlab returns NaNs for missing values - args{end+1} = "emptyvalue"; - args{end+1} = NaN; + args(end+1:end+2) = {'emptyvalue', NaN}; endif ## Check default parameter values that differ for strread & textread @@ -97,37 +95,34 @@ ipos = find (strcmpi (args, "whitespace")); if (isempty (ipos)) ## Matlab default whitespace = " \b\t" - args{end+1} = "whitespace"; - args{end+1} = " \b\t"; + args(end+1:end+2) = {'whitespace', " \b\t"}; whitespace = " \b\t"; else ## Check if there's at least one string format specifier fmt = strrep (format, "%", " %"); - [~, ~, ~, fmt] = regexp (fmt, '[^ ]+'); + fmt = regexp (fmt, '[^ ]+', 'match'); fmt = strtrim (fmt(strmatch ("%", fmt))) has_str_fmt = all (cellfun ("isempty", strfind (strtrim (fmt(strmatch ("%", fmt))), 's'))); ## If there is a format, AND whitespace value = empty, ## don't add a space (char(32)) to whitespace if (! (isempty (args{ipos+1}) && has_str_fmt)) - args {ipos+1} = unique ([" " whitespace]); + args{ipos+1} = unique ([" ", whitespace]); endif endif if (! any (strcmpi (args, "delimiter"))) ## Matlab says default delimiter = whitespace. ## strread() will pick this up further - args{end+1} = "delimiter"; - args{end+1} = ""; + args(end+1:end+2) = {'delimiter', ""}; endif if (any (strcmpi (args, "returnonerror"))) ## Because of the way strread() reads data (columnwise) this parameter ## can't be neatly implemented. 
strread() will pick it up anyway - warning ('ReturnOnError is not fully implemented'); + warning ('textscan: ReturnOnError is not fully implemented'); else ## Set default value (=true) - args{end+1} = "returnonerror"; - args{end+1} = 1; + args(end+1:end+2) = {"returnonerror", 1}; endif if (ischar (fid)) @@ -145,11 +140,13 @@ args(headerlines:headerlines+1) = []; endif if (isfinite (nlines)) - str = ""; - ## FIXME: Can this be done without slow for loop? - for n = 1:nlines - str = strcat (str, fgets (fid)); - endfor + str = tmp_str = ""; + n = 0; + ## FIXME: Can this be done without slow loop? + while (ischar (tmp_str) && n++ <= nlines) + str = strcat (str, tmp_str); + tmp_str = fgets (fid); + endwhile else str = fread (fid, "char=>char").'; endif @@ -159,53 +156,48 @@ if (isempty (str)) warning ("textscan: no data read"); C = []; - else - ## Check value of 'endofline'. String or file doesn't seem to matter - endofline = find (strcmpi (args, "endofline"), 1); - if (! isempty (endofline)) - if (! ischar (args{endofline + 1})) - error ("textscan: character value required for EndOfLine"); - endif + return; + endif + + ## Check value of 'endofline'. String or file doesn't seem to matter + endofline = find (strcmpi (args, "endofline"), 1); + if (! isempty (endofline)) + if (ischar (args{endofline + 1})) + eol_char = args{endofline + 1}; else - ## Determine EOL from file. Search for EOL candidates in first 3000 chars - BUFLEN = 3000; - ## First try DOS (CRLF) - eol_srch_len = min (length (str), 3000); - if (! isempty (findstr ("\r\n", str(1 : eol_srch_len)))) - eol_char = "\r\n"; - ## Perhaps old Macintosh? (CR) - elseif (! isempty (findstr ("\r", str(1 : eol_srch_len)))) - eol_char = "\r"; - ## Otherwise, use plain UNIX (LF) - else - eol_char = "\n"; - endif - ## Set up the default endofline param value - args{end+1} = "endofline"; - args{end+1} = eol_char; + error ("textscan: character value required for EndOfLine"); endif + else + ## Determine EOL from file. 
Search for EOL candidates in first 3000 chars + eol_srch_len = min (length (str), 3000); + ## First try DOS (CRLF) + if (! isempty (findstr ("\r\n", str(1 : eol_srch_len)))) + eol_char = "\r\n"; + ## Perhaps old Macintosh? (CR) + elseif (! isempty (findstr ("\r", str(1 : eol_srch_len)))) + eol_char = "\r"; + ## Otherwise, use plain UNIX (LF) + else + eol_char = "\n"; + endif + ## Set up the default endofline param value + args(end+1:end+2) = {'endofline', eol_char}; + endif - ## Determine the number of data fields - num_fields = numel (strfind (format, "%")) - ... - numel (idx_star = strfind (format, "%*")); - - ## Strip trailing EOL to avoid returning stray missing values (f. strread) - if (strcmp (str(end-length (eol_char) + 1 : end), eol_char)); - str = str(1 : end-length (eol_char)); - endif + ## Determine the number of data fields + num_fields = numel (strfind (format, "%")) - numel (strfind (format, "%*")); - ## Call strread to make it do the real work - C = cell (1, num_fields); - [C{:}] = strread (str, format, args{:}); + ## Strip trailing EOL to avoid returning stray missing values (f. strread) + if (strcmp (str(end-length (eol_char) + 1 : end), eol_char)); + str(end-length (eol_char) + 1 : end) = ""; + endif - if (ischar (fid) && isfinite (nlines)) - C = cellfun (@(x) x(1:nlines), C, "uniformoutput", false); - endif + ## Call strread to make it do the real work + C = cell (1, num_fields); + [C{:}] = strread (str, format, args{:}); - if (nargout == 2) - position = ftell (fid); - endif - + if (nargout == 2) + position = ftell (fid); endif endfunction @@ -249,3 +241,10 @@ %! assert (a{2}', {'B' 'J' 'R' 'Z'}); %! 
assert (a{3}', [16 241 3 NaN], 1e-5); +%% Test input validation +%!error textscan () +%!error textscan (single (4)) +%!error textscan ({4}) +%!error <must be a string> textscan ("Hello World", 2) +%!error <cannot provide position information> [C, pos] = textscan ("Hello World") +%!error <character value required> textscan ("Hello World", '%s', 'EndOfLine', 3)