view scripts/io/textscan.m @ 12880:ff264eae88cf

textread.m, textscan.m: Read entire file when format repeat count is -1 * textread.m, textscan.m: Read entire file when format repeat count is -1.
author Rik <octave@nomad.inbox5.com>
date Sun, 24 Jul 2011 22:11:26 -0700
parents ddea3962b024
children f5a3f77d51aa
line wrap: on
line source

## Copyright (C) 2010-2011 Ben Abbott <bpabbott@mac.com>
##
## This file is part of Octave.
##
## Octave is free software; you can redistribute it and/or modify it
## under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 3 of the License, or (at
## your option) any later version.
##
## Octave is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Octave; see the file COPYING.  If not, see
## <http://www.gnu.org/licenses/>.

## -*- texinfo -*-
## @deftypefn  {Function File} {@var{C} =} textscan (@var{fid}, @var{format})
## @deftypefnx {Function File} {@var{C} =} textscan (@var{fid}, @var{format}, @var{n})
## @deftypefnx {Function File} {@var{C} =} textscan (@var{fid}, @var{format}, @var{param}, @var{value}, @dots{})
## @deftypefnx {Function File} {@var{C} =} textscan (@var{fid}, @var{format}, @var{n}, @var{param}, @var{value}, @dots{})
## @deftypefnx {Function File} {@var{C} =} textscan (@var{str}, @dots{})
## @deftypefnx {Function File} {[@var{C}, @var{position}] =} textscan (@var{fid}, @dots{})
## Read data from a text file or string.
##
## The file associated with @var{fid} is read and parsed according to
## @var{format}.  The function behaves like @code{strread} except it works by
## parsing a file instead of a string.  See the documentation of
## @code{strread} for details.  
##
## In addition to the options supported by
## @code{strread}, this function supports a few more:
##
## @itemize
## @item "headerlines":
## The first @var{value} number of lines of @var{fid} are skipped.
##
## @item "endofline":
## Specify a single character or "\r\n".  If no value is given, it will be
## inferred from the file.  If set to "" (empty string) EOLs are ignored as
## delimiters.
##
## @item "returnonerror":
## If set to numerical 1 or true (default), return normally when read errors
## have been encountered.  If set to 0 or false, return an error and no data.
## @end itemize
##
## The optional input @var{n} specifies the number of times to use
## @var{format} when parsing, i.e., the format repeat count.
##
## The output @var{C} is a cell array whose length is given by the number
## of format specifiers.
##
## The second output, @var{position}, provides the position, in characters,
## from the beginning of the file.
##
## @seealso{dlmread, fscanf, load, strread, textread}
## @end deftypefn

function [C, position] = textscan (fid, format = "%f", varargin)

  ## Read text from an open file (numeric FID) or directly from a character
  ## string, fill in the option defaults that textscan wants, and hand the
  ## text to strread() for the actual parsing.
  ##
  ## Inputs:
  ##   fid      - positive scalar double file id, or a character string
  ##              holding the data itself.
  ##   format   - format string (defaults to "%f" when omitted or empty).
  ##   varargin - optional numeric repeat count followed by
  ##              parameter/value pairs.
  ## Outputs:
  ##   C        - 1-by-N cell array, one cell per non-skipped format
  ##              specifier; [] when no data could be read.
  ##   position - value of ftell (fid) after reading (file input only).

  ## Check input
  if (nargin < 1)
    print_usage ();
  endif

  if (isempty (format))
    format = "%f";
  endif

  if (! ischar (fid) && ! (isa (fid, "double") && isscalar (fid) && fid > 0))
    error ("textscan: first argument must be a file id or character string");
  endif

  if (! ischar (format))
    error ("textscan: FORMAT must be a string");
  endif

  ## An optional leading numeric argument is the format repeat count.  It is
  ## left in ARGS so that it is forwarded to strread() as well.
  args = varargin;
  if (nargin > 2 && isnumeric (args{1}))
    nlines = args{1};
  else
    nlines = Inf;
  endif

  if (! any (strcmpi (args, "emptyvalue")))
    ## Matlab returns NaNs for missing values
    args(end+1:end+2) = {"emptyvalue", NaN};
  endif

  ## Check default parameter values that differ for strread & textscan

  ipos = find (strcmpi (args, "whitespace"), 1);
  if (isempty (ipos))
    ## Matlab default whitespace = " \b\t"
    args(end+1:end+2) = {"whitespace", " \b\t"};
  else
    ## Check if there's at least one string format specifier (%s, %*s, %5s)
    has_str_fmt = ! isempty (regexp (format, '%[*]?\d*s', "once"));
    ## If there is a string format AND whitespace value = empty,
    ## don't add a space (char(32)) to whitespace
    if (! (isempty (args{ipos+1}) && has_str_fmt))
      args{ipos+1} = unique ([" ", args{ipos+1}]);
    endif
  endif

  if (! any (strcmpi (args, "delimiter")))
    ## Matlab says default delimiter = whitespace.
    ## strread() will pick this up further
    args(end+1:end+2) = {"delimiter", ""};
  endif

  if (any (strcmpi (args, "returnonerror")))
    ## Because of the way strread() reads data (columnwise) this parameter
    ## can't be neatly implemented.  strread() will pick it up anyway
    warning ('textscan: ReturnOnError is not fully implemented');
  else
    ## Set default value (=true)
    args(end+1:end+2) = {"returnonerror", 1};
  endif

  if (ischar (fid))
    ## Read from a text string
    if (nargout == 2)
      error ("textscan: cannot provide position information for character input");
    endif
    str = fid;
  else
    ## Skip header lines if requested
    headerlines = find (strcmpi (args, "headerlines"), 1);
    if (! isempty (headerlines))
      nheader = args{headerlines + 1};
      ## Beware of zero valued headerline, fskipl would skip to EOF
      if (nheader > 0)
        fskipl (fid, nheader);
      endif
      ## The option has been consumed here, so it is not forwarded to strread
      args(headerlines:headerlines+1) = [];
    endif

    if (isfinite (nlines) && (nlines >= 0))
      ## Read at most NLINES lines.  Each line is fetched only when it is
      ## going to be used, so the file pointer is left right after the last
      ## line that ends up in STR.
      ## FIXME: Can this be done without slow loop?
      str = "";
      for n = 1:nlines
        tmp_str = fgets (fid);
        if (! ischar (tmp_str))
          ## fgets returned a non-string value: nothing more to read
          break;
        endif
        ## Plain concatenation keeps every character, including whitespace
        str = [str, tmp_str];
      endfor
    else
      ## Non-finite or negative repeat count: read the entire file
      str = fread (fid, "char=>char").';
    endif
  endif

  ## Check for empty result
  if (isempty (str))
    warning ("textscan: no data read");
    C = [];
    if (nargout == 2)
      ## Only reachable for file input; string input with two outputs has
      ## already raised an error above.
      position = ftell (fid);
    endif
    return;
  endif

  ## Check value of 'endofline'.  String or file doesn't seem to matter
  endofline = find (strcmpi (args, "endofline"), 1);
  if (! isempty (endofline))
    if (! ischar (args{endofline + 1}))
      error ("textscan: character value required for EndOfLine");
    endif
    eol_char = args{endofline + 1};
  else
    ## Determine EOL from file.  Search for EOL candidates in first 3000 chars
    eol_srch_len = min (length (str), 3000);
    head = str(1:eol_srch_len);
    ## strfind always looks for the pattern inside HEAD, whatever their
    ## relative lengths are.
    if (! isempty (strfind (head, "\r\n")))
      ## First try DOS (CRLF)
      eol_char = "\r\n";
    elseif (! isempty (strfind (head, "\r")))
      ## Perhaps old Macintosh? (CR)
      eol_char = "\r";
    else
      ## Otherwise, use plain UNIX (LF)
      eol_char = "\n";
    endif
    ## Set up the default endofline param value
    args(end+1:end+2) = {"endofline", eol_char};
  endif

  ## Determine the number of data fields
  num_fields = numel (strfind (format, "%")) - numel (strfind (format, "%*"));

  ## Strip trailing EOL to avoid returning stray missing values (cf. strread).
  ## The length guard avoids indexing before the start of a very short STR.
  eol_len = length (eol_char);
  if (eol_len > 0 && length (str) >= eol_len)
    if (strcmp (str(end-eol_len+1:end), eol_char))
      str(end-eol_len+1:end) = "";
    endif
  endif

  ## Call strread to make it do the real work
  C = cell (1, num_fields);
  [C{:}] = strread (str, format, args{:});

  if (nargout == 2)
    position = ftell (fid);
  endif

endfunction


%!test
%! str = "1,  2,  3,  4\n 5,  ,  ,  8\n 9, 10, 11, 12";
%! fmtstr = "%f %d %f %s";
%! c = textscan (str, fmtstr, 2, "delimiter", ",", "emptyvalue", -Inf);
%! assert (isequal (c{1}, [1;5]));
%! assert (length (c{1}), 2);
%! assert (iscellstr (c{4}));
%! assert (isequal (c{3}, [3; -Inf]));

%!test
%! b = [10:10:100];
%! b = [b; 8*b/5];
%! str = sprintf ("%g miles/hr = %g kilometers/hr\n", b);
%! fmt = "%f miles/hr = %f kilometers/hr";
%! c = textscan (str, fmt);
%! assert (b(1,:)', c{1});
%! assert (b(2,:)', c{2});

#%!test
#%! str = "13, 72, NA, str1, 25\r\n// Middle line\r\n36, na, 05, str3, 6";
#%! a = textscan(str, '%d %n %f %s %n', 'delimiter', ',','treatAsEmpty', {'NA', 'na'},'commentStyle', '//');
#%! assert (a{1}, [13; 36]);
#%! assert (a{2}, [72; NaN]);
#%! assert (a{3}, [NaN; 5]);
#%! assert (a{4}, {"str1"; "str3"});
#%! assert (a{5}, [25; 6]);

%!test
%! str = "Km:10 = hhhBjjj miles16hour\r\n";
%! str = [str "Km:15 = hhhJjjj miles241hour\r\n"];
%! str = [str "Km:2 = hhhRjjj miles3hour\r\n"];
%! str = [str "Km:25 = hhhZ\r\n"];
%! fmt = "Km:%d = hhh%1sjjj miles%dhour";
%! a = textscan (str, fmt, 'delimiter', ' ');
%! assert (a{1}', [10 15 2 25], 1e-5);
%! assert (a{2}', {'B' 'J' 'R' 'Z'});
%! assert (a{3}', [16 241 3 NaN], 1e-5);

%% Test input validation
%!error textscan ()
%!error textscan (single (4))
%!error textscan ({4})
%!error <must be a string> textscan ("Hello World", 2)
%!error <cannot provide position information> [C, pos] = textscan ("Hello World")
%!error <character value required> textscan ("Hello World", '%s', 'EndOfLine', 3)