Mercurial > hg > octave-lyh
diff scripts/io/textscan.m @ 12866:fe6e2afcd9ee
Revamp strread, textscan, textread functions for Matlab compatibility
Implemented ML-compatible whitespace and delimiter defaults
Implemented ML-compatible options: 'whitespace', 'treatasempty',
format string repeat count, user-specified comment style, uneven-length
output arrays, %n and %u conversion specifiers (provisionally)
Implemented processing of given-width format specifiers
* textscan.m: Add new tests. Implement EndofLine, ReturnOnError, TreatAsEmpty
options. Improve whitespace handling.
* textread.m: Add new tests. Implement EndofLine option. Improve whitespace
handling.
* strread.m: Major rewrite.
author | Philip Nienhuis <prnienhuis@users.sf.net> |
---|---|
date | Fri, 22 Jul 2011 13:05:26 -0700 |
parents | f38cf6224452 |
children | e8c8e118a1e6 |
line wrap: on
line diff
--- a/scripts/io/textscan.m +++ b/scripts/io/textscan.m @@ -28,12 +28,24 @@ ## The file associated with @var{fid} is read and parsed according to ## @var{format}. The function behaves like @code{strread} except it works by ## parsing a file instead of a string. See the documentation of -## @code{strread} for details. In addition to the options supported by -## @code{strread}, this function supports one more: +## @code{strread} for details. +## +## In addition to the options supported by +## @code{strread}, this function supports a few more: +## ## @itemize ## @item "headerlines": +## The first @var{value} number of lines of @var{str} are skipped. +## +## @item "endofline": +## Specify a single character or "\r\n". If no value is given, it will be +## inferred from the file. If set to "" (empty string) EOLs are ignored as +## delimiters. +## +## @item "returnonerror": +## If set to numerical 1 or true (default), return normally when read errors +## have been encountered. If set to 0 or false, return an error and no data. ## @end itemize -## The first @var{value} number of lines of @var{str} are skipped. ## ## The optional input, @var{n}, specifes the number of lines to be read from ## the file, associated with @var{fid}. @@ -47,15 +59,25 @@ ## @seealso{dlmread, fscanf, load, strread, textread} ## @end deftypefn -function [C, p] = textscan (fid, format, varargin) +function [C, position] = textscan (fid, format = "%f", varargin) ## Check input if (nargin < 1) print_usage (); - elseif (nargin == 1 || isempty (format)) + endif + + if (isempty (format)) format = "%f"; endif + if (! (isa (fid, "double") && fid > 0) && ! ischar (fid)) + error ("textscan: first argument must be a file id or character string"); + endif + + if (! 
ischar (format)) + error ("textscan: FORMAT must be a valid specification"); + endif + if (nargin > 2 && isnumeric (varargin{1})) nlines = varargin{1}; args = varargin(2:end); @@ -70,66 +92,132 @@ args{end+1} = NaN; endif - if (isa (fid, "double") && fid > 0 || ischar (fid)) - if (ischar (format)) - if (ischar (fid)) - if (nargout == 2) - error ("textscan: cannot provide position information for character input"); - endif - str = fid; + ## Check default parameter values that differ for strread & textread + + ipos = find (strcmpi (args, "whitespace")); + if (isempty (ipos)) + ## Matlab default whitespace = " \b\t" + args{end+1} = "whitespace"; + args{end+1} = " \b\t"; + whitespace = " \b\t"; + else + ## Check if there's at least one string format specifier + fmt = strrep (format, "%", " %"); + [~, ~, ~, fmt] = regexp (fmt, '[^ ]+'); + fmt = strtrim (fmt(strmatch ("%", fmt))) + has_str_fmt = all (cellfun ("isempty", strfind (strtrim (fmt(strmatch ("%", fmt))), 's'))); + ## If there is a format, AND whitespace value = empty, + ## don't add a space (char(32)) to whitespace + if (! (isempty (args{ipos+1}) && has_str_fmt)) + args {ipos+1} = unique ([" " whitespace]); + endif + endif + + if (! any (strcmpi (args, "delimiter"))) + ## Matlab says default delimiter = whitespace. + ## strread() will pick this up further + args{end+1} = "delimiter"; + args{end+1} = ""; + endif + + if (any (strcmpi (args, "returnonerror"))) + ## Because of the way strread() reads data (columnwise) this parameter + ## can't be neatly implemented. 
strread() will pick it up anyway + warning ('ReturnOnError is not fully implemented'); + else + ## Set default value (=true) + args{end+1} = "returnonerror"; + args{end+1} = 1; + endif + + if (ischar (fid)) + ## Read from a text string + if (nargout == 2) + error ("textscan: cannot provide position information for character input"); + endif + str = fid; + else + ## Skip header lines if requested + headerlines = find (strcmpi (args, "headerlines"), 1); + ## Beware of zero valued headerline, fskipl would skip to EOF + if (! isempty (headerlines) && (args{headerlines + 1} > 0)) + fskipl (fid, varargin{headerlines + 1}); + endif + if (isfinite (nlines)) + str = ""; + ## FIXME: Can this be done without slow for loop? + for n = 1:nlines + str = strcat (str, fgets (fid)); + endfor + else + str = fread (fid, "char=>char").'; + endif + endif + + ## Check for empty result + if (isempty (str)) + warning ("textscan: no data read"); + C = []; + else + ## Check value of 'endofline'. String or file doesn't seem to matter + endofline = find (strcmpi (args, "endofline"), 1); + if (! isempty (endofline)) + if (! ischar (args{endofline + 1})) + error ("textscan: character value required for EndOfLine"); + endif + else + ## Determine EOL from file. Search for EOL candidates in first 3000 chars + BUFLEN = 3000; + ## First try DOS (CRLF) + eol_srch_len = min (length (str), 3000); + if (! isempty (findstr ("\r\n", str(1 : eol_srch_len)))) + eol_char = "\r\n"; + ## Perhaps old Macintosh? (CR) + elseif (! isempty (findstr ("\r", str(1 : eol_srch_len)))) + eol_char = "\r"; + ## Otherwise, use plain UNIX (LF) else - ## Maybe skip header lines - headerlines = find (strcmpi (args, "headerlines"), 1); - if (! 
isempty (headerlines)) - hdr_lines = floor (varargin{headerlines + 1}); - ## Beware of zero valued headerline, fskipl will count lines to EOF - if (hdr_lines > 0) - fskipl (fid, hdr_lines); - endif - endif - if (isfinite (nlines)) - str = ""; - for n = 1:nlines - str = strcat (str, fgets (fid)); - endfor - else - str = fread (fid, "char=>char").'; - endif + eol_char = "\n"; endif + ## Set up the default endofline param value + args{end+1} = "endofline"; + args{end+1} = eol_char; + endif + + ## Determine the number of data fields + num_fields = numel (strfind (format, "%")) - ... + numel (idx_star = strfind (format, "%*")); - ## Determine the number of data fields - num_fields = numel (strfind (format, "%")) - ... - numel (idx_star = strfind (format, "%*")); + ## Strip trailing EOL to avoid returning stray missing values (f. strread) + if (strcmp (str(end-length (eol_char) + 1 : end), eol_char)); + str = str(1 : end-length (eol_char)); + endif - ## Call strread to make it do the real work - C = cell (1, num_fields); - [C{:}] = strread (str, format, args{:}); + ## Call strread to make it do the real work + C = cell (1, num_fields); + [C{:}] = strread (str, format, args{:}); - if (ischar (fid) && isfinite (nlines)) - C = cellfun (@(x) x(1:nlines), C, "uniformoutput", false); - endif + if (ischar (fid) && isfinite (nlines)) + C = cellfun (@(x) x(1:nlines), C, "uniformoutput", false); + endif - if (nargout == 2) - p = ftell (fid); - endif + if (nargout == 2) + position = ftell (fid); + endif - else - error ("textscan: FORMAT must be a valid specification"); - endif - else - error ("textscan: first argument must be a file id or character string"); endif endfunction + %!test %! str = "1, 2, 3, 4\n 5, , , 8\n 9, 10, 11, 12"; %! fmtstr = "%f %d %f %s"; %! c = textscan (str, fmtstr, 2, "delimiter", ",", "emptyvalue", -Inf); -%! assert (isequal (c{1}, [1;5])) +%! assert (isequal (c{1}, [1;5])); %! assert (length (c{1}), 2); -%! assert (iscellstr (c{4})) -%! 
assert (isequal (c{3}, [3; -Inf])) +%! assert (iscellstr (c{4})); +%! assert (isequal (c{3}, [3; -Inf])); %!test %! b = [10:10:100]; @@ -137,7 +225,26 @@ %! str = sprintf ("%g miles/hr = %g kilometers/hr\n", b); %! fmt = "%f miles/hr = %f kilometers/hr"; %! c = textscan (str, fmt); -%! assert (b(1,:)', c{1}) -%! assert (b(2,:)', c{2}) +%! assert (b(1,:)', c{1}); +%! assert (b(2,:)', c{2}); + +#%!test +#%! str = "13, 72, NA, str1, 25\r\n// Middle line\r\n36, na, 05, str3, 6"; +#%! a = textscan(str, '%d %n %f %s %n', 'delimiter', ',','treatAsEmpty', {'NA', 'na'},'commentStyle', '//'); +#%! assert (a{1}, [13; 36]); +#%! assert (a{2}, [72; NaN]); +#%! assert (a{3}, [NaN; 5]); +#%! assert (a{4}, {"str1"; "str3"}); +#%! assert (a{5}, [25; 6]); +%!test +%! str = "Km:10 = hhhBjjj miles16hour\r\n"; +%! str = [str "Km:15 = hhhJjjj miles241hour\r\n"]; +%! str = [str "Km:2 = hhhRjjj miles3hour\r\n"]; +%! str = [str "Km:25 = hhhZ\r\n"]; +%! fmt = "Km:%d = hhh%1sjjj miles%dhour"; +%! a = textscan (str, fmt, 'delimiter', ' '); +%! assert (a{1}', [10 15 2 25], 1e-5); +%! assert (a{2}', {'B' 'J' 'R' 'Z'}); +%! assert (a{3}', [16 241 3 NaN], 1e-5);