diff scripts/io/textscan.m @ 12866:fe6e2afcd9ee

Revamp strread, textscan, textread functions for Matlab compatibility. Implemented ML-compatible whitespace and delimiter defaults. Implemented ML-compatible options: 'whitespace', 'treatasempty', format string repeat count, user-specified comment style, uneven-length output arrays, %n and %u conversion specifiers (provisionally). Implemented processing of given-width format specifiers. * textscan.m: Add new tests. Implement EndofLine, ReturnOnError, TreatAsEmpty options. Improve whitespace handling. * textread.m: Add new tests. Implement EndofLine option. Improve whitespace handling. * strread.m: Major rewrite.
author Philip Nienhuis <prnienhuis@users.sf.net>
date Fri, 22 Jul 2011 13:05:26 -0700
parents f38cf6224452
children e8c8e118a1e6
line wrap: on
line diff
--- a/scripts/io/textscan.m
+++ b/scripts/io/textscan.m
@@ -28,12 +28,24 @@
 ## The file associated with @var{fid} is read and parsed according to
 ## @var{format}.  The function behaves like @code{strread} except it works by
 ## parsing a file instead of a string.  See the documentation of
-## @code{strread} for details.  In addition to the options supported by
-## @code{strread}, this function supports one more:
+## @code{strread} for details.  
+##
+## In addition to the options supported by
+## @code{strread}, this function supports a few more:
+##
 ## @itemize
 ## @item "headerlines":
+## The first @var{value} lines of the file @var{fid} are skipped.
+##
+## @item "endofline":
+## Specify a single character or "\r\n".  If no value is given, it will be
+## inferred from the file.  If set to "" (empty string) EOLs are ignored as
+## delimiters.
+##
+## @item "returnonerror":
+## If set to numerical 1 or true (default), return normally when read errors
+## have been encountered.  If set to 0 or false, return an error and no data.
 ## @end itemize
-## The first @var{value} number of lines of @var{str} are skipped.
 ##
 ## The optional input, @var{n}, specifies the number of lines to be read from
 ## the file, associated with @var{fid}.
@@ -47,15 +59,25 @@
 ## @seealso{dlmread, fscanf, load, strread, textread}
 ## @end deftypefn
 
-function [C, p] = textscan (fid, format, varargin)
+function [C, position] = textscan (fid, format = "%f", varargin)
 
   ## Check input
   if (nargin < 1)
     print_usage ();
-  elseif (nargin == 1 || isempty (format))
+  endif
+
+  if (isempty (format))
     format = "%f";
   endif
 
+  if (! (isa (fid, "double") && fid > 0) && ! ischar (fid))
+    error ("textscan: first argument must be a file id or character string");
+  endif
+
+  if (! ischar (format))
+    error ("textscan: FORMAT must be a valid specification");
+  endif
+
   if (nargin > 2 && isnumeric (varargin{1}))
     nlines = varargin{1};
     args = varargin(2:end);
@@ -70,66 +92,132 @@
     args{end+1} = NaN;
   endif
 
-  if (isa (fid, "double") && fid > 0 || ischar (fid))
-    if (ischar (format))
-      if (ischar (fid))
-        if (nargout == 2)
-          error ("textscan: cannot provide position information for character input");
-        endif
-        str = fid;
+  ## Check default parameter values that differ for strread & textread
+
+  ipos = find (strcmpi (args, "whitespace"));
+  if (isempty (ipos))
+    ## Matlab default whitespace = " \b\t"
+    args{end+1} = "whitespace";
+    args{end+1} = " \b\t";
+    whitespace = " \b\t";
+  else
+    ## Check if there's at least one string format specifier
+    fmt = strrep (format, "%", " %");
+    [~, ~, ~, fmt] = regexp (fmt, '[^ ]+');
+    fmt = strtrim (fmt(strmatch ("%", fmt)))
+    has_str_fmt = all (cellfun ("isempty", strfind (strtrim (fmt(strmatch ("%", fmt))), 's')));
+    ## If there is a format, AND whitespace value = empty, 
+    ## don't add a space (char(32)) to whitespace
+    if (! (isempty (args{ipos+1}) &&  has_str_fmt))
+      args {ipos+1} = unique ([" " whitespace]);
+    endif
+  endif
+
+  if (! any (strcmpi (args, "delimiter")))
+    ## Matlab says default delimiter = whitespace.  
+    ## strread() will pick this up further
+    args{end+1} = "delimiter";
+    args{end+1} = "";
+  endif
+
+  if (any (strcmpi (args, "returnonerror")))
+    ## Because of the way strread() reads data (columnwise) this parameter
+    ## can't be neatly implemented.  strread() will pick it up anyway
+    warning ('ReturnOnError is not fully implemented');
+  else
+    ## Set default value (=true)
+    args{end+1} = "returnonerror";
+    args{end+1} = 1;
+  endif
+
+  if (ischar (fid))
+    ## Read from a text string
+    if (nargout == 2)
+      error ("textscan: cannot provide position information for character input");
+    endif
+    str = fid;
+  else
+    ## Skip header lines if requested
+    headerlines = find (strcmpi (args, "headerlines"), 1);
+    ## Beware of zero valued headerline, fskipl would skip to EOF
+    if (! isempty (headerlines) && (args{headerlines + 1} > 0))
+      fskipl (fid, varargin{headerlines + 1});
+    endif
+    if (isfinite (nlines))
+      str = "";
+      ## FIXME: Can this be done without slow for loop?
+      for n = 1:nlines
+        str = strcat (str, fgets (fid));
+      endfor
+    else
+      str = fread (fid, "char=>char").';
+    endif
+  endif
+
+  ## Check for empty result
+  if (isempty (str))
+    warning ("textscan: no data read");
+    C = [];
+  else
+    ## Check value of 'endofline'.  String or file doesn't seem to matter
+    endofline = find (strcmpi (args, "endofline"), 1);
+    if (! isempty (endofline))
+      if (! ischar (args{endofline + 1})) 
+        error ("textscan: character value required for EndOfLine"); 
+      endif
+    else
+      ## Determine EOL from file.  Search for EOL candidates in first 3000 chars
+      BUFLEN = 3000;
+      ## First try DOS (CRLF)
+      eol_srch_len = min (length (str), 3000);
+      if (! isempty (findstr ("\r\n", str(1 : eol_srch_len))))
+        eol_char = "\r\n";
+      ## Perhaps old Macintosh? (CR)
+      elseif (! isempty (findstr ("\r", str(1 : eol_srch_len))))
+        eol_char = "\r";
+      ## Otherwise, use plain UNIX (LF)
       else
-        ## Maybe skip header lines
-        headerlines = find (strcmpi (args, "headerlines"), 1);
-        if (! isempty (headerlines))
-          hdr_lines = floor (varargin{headerlines + 1});
-          ## Beware of zero valued headerline, fskipl will count lines to EOF
-          if (hdr_lines > 0)
-            fskipl (fid, hdr_lines);
-          endif
-        endif
-        if (isfinite (nlines))
-          str = "";
-          for n = 1:nlines
-            str = strcat (str, fgets (fid));
-          endfor
-            else
-          str = fread (fid, "char=>char").';
-        endif
+        eol_char = "\n";
       endif
+      ## Set up the default endofline param value
+      args{end+1} = "endofline";
+      args{end+1} = eol_char;
+    endif
+
+    ## Determine the number of data fields
+    num_fields = numel (strfind (format, "%")) - ...
+                 numel (idx_star = strfind (format, "%*"));
 
-      ## Determine the number of data fields
-      num_fields = numel (strfind (format, "%")) - ...
-                   numel (idx_star = strfind (format, "%*"));
+    ## Strip trailing EOL to avoid returning stray missing values (f. strread)
+    if (strcmp (str(end-length (eol_char) + 1 : end), eol_char));
+      str = str(1 : end-length (eol_char)); 
+    endif
 
-      ## Call strread to make it do the real work
-      C = cell (1, num_fields);
-      [C{:}] = strread (str, format, args{:});
+    ## Call strread to make it do the real work
+    C = cell (1, num_fields);
+    [C{:}] = strread (str, format, args{:});
 
-      if (ischar (fid) && isfinite (nlines))
-        C = cellfun (@(x) x(1:nlines), C, "uniformoutput", false);
-      endif
+    if (ischar (fid) && isfinite (nlines))
+      C = cellfun (@(x) x(1:nlines), C, "uniformoutput", false);
+    endif
 
-      if (nargout == 2)
-        p = ftell (fid);
-      endif
+    if (nargout == 2)
+      position = ftell (fid);
+    endif
 
-    else
-      error ("textscan: FORMAT must be a valid specification");
-    endif
-  else
-    error ("textscan: first argument must be a file id or character string");
   endif
 
 endfunction
 
+
 %!test
 %! str = "1,  2,  3,  4\n 5,  ,  ,  8\n 9, 10, 11, 12";
 %! fmtstr = "%f %d %f %s";
 %! c = textscan (str, fmtstr, 2, "delimiter", ",", "emptyvalue", -Inf);
-%! assert (isequal (c{1}, [1;5]))
+%! assert (isequal (c{1}, [1;5]));
 %! assert (length (c{1}), 2);
-%! assert (iscellstr (c{4}))
-%! assert (isequal (c{3}, [3; -Inf]))
+%! assert (iscellstr (c{4}));
+%! assert (isequal (c{3}, [3; -Inf]));
 
 %!test
 %! b = [10:10:100];
@@ -137,7 +225,26 @@
 %! str = sprintf ("%g miles/hr = %g kilometers/hr\n", b);
 %! fmt = "%f miles/hr = %f kilometers/hr";
 %! c = textscan (str, fmt);
-%! assert (b(1,:)', c{1})
-%! assert (b(2,:)', c{2})
+%! assert (b(1,:)', c{1});
+%! assert (b(2,:)', c{2});
+
+#%!test
+#%! str = "13, 72, NA, str1, 25\r\n// Middle line\r\n36, na, 05, str3, 6";
+#%! a = textscan(str, '%d %n %f %s %n', 'delimiter', ',','treatAsEmpty', {'NA', 'na'},'commentStyle', '//');
+#%! assert (a{1}, [13; 36]);
+#%! assert (a{2}, [72; NaN]);
+#%! assert (a{3}, [NaN; 5]);
+#%! assert (a{4}, {"str1"; "str3"});
+#%! assert (a{5}, [25; 6]);
 
+%!test
+%! str = "Km:10 = hhhBjjj miles16hour\r\n";
+%! str = [str "Km:15 = hhhJjjj miles241hour\r\n"];
+%! str = [str "Km:2 = hhhRjjj miles3hour\r\n"];
+%! str = [str "Km:25 = hhhZ\r\n"];
+%! fmt = "Km:%d = hhh%1sjjj miles%dhour";
+%! a = textscan (str, fmt, 'delimiter', ' ');
+%! assert (a{1}', [10 15 2 25], 1e-5);
+%! assert (a{2}', {'B' 'J' 'R' 'Z'});
+%! assert (a{3}', [16 241 3 NaN], 1e-5);