Mercurial > hg > octave-nkf
changeset 16357:0cbe330f39a2
textscan.m, textread.m: allow reading multi-column data files with empty format + tests (bug #38317)
author | Philip Nienhuis <prnienhuis@users.sf.net> |
---|---|
date | Fri, 22 Mar 2013 17:46:04 +0100 |
parents | df643a532b61 |
children | 0db0926c2d0f |
files | scripts/io/textread.m scripts/io/textscan.m |
diffstat | 2 files changed, 255 insertions(+), 8 deletions(-) [+] |
line wrap: on
line diff
--- a/scripts/io/textread.m +++ b/scripts/io/textread.m @@ -44,6 +44,11 @@ ## The optional input @var{n} specifies the number of data lines to read; in ## this sense it differs slightly from the format repeat count in strread. ## +## If the format string is empty (not: omitted) and the file contains only +## numeric data (excluding headerlines), textread will return a rectangular +## matrix with the number of columns matching the number of numeric fields on +## the first data line of the file. Empty fields are returned as zero values. +## ## @seealso{strread, load, dlmread, fscanf, textscan} ## @end deftypefn @@ -174,9 +179,46 @@ ## Call strread to make it do the real work [varargout{1:max (nargout, 1)}] = strread (str, format, varargin {:}); + ## Hack to concatenate/reshape numeric output into 2D array (undocumented ML) + ## In ML this only works in case of an empty format string + if (isempty (format)) + ## Get number of fields per line. + ## 1. Get eol_char position + iwhsp = find (strcmpi ("whitespace", varargin)); + whsp = varargin{iwhsp + 1}; + idx = regexp (str, eol_char, "once"); + ## 2. Get first data line til EOL. Avoid corner case of just one line + if (! isempty (idx)) + str = str(1:idx-1); + endif + idelimiter = find (strcmpi (varargin, "delimiter"), 1); + if (isempty (idelimiter)) + ## Assume delimiter = whitespace + ## 3A. whitespace incl. consecutive whitespace => single space + str = regexprep (str, sprintf ("[%s]+", whsp), ' '); + ## 4A. Remove possible leading & trailing spaces + str = strtrim (str); + ## 5A. Count spaces, add one to get nr of data fields per line + ncols = numel (strfind (str, " ")) + 1; + else + ## 3B. Just count delimiters. FIXME: delimiters could occur in literals + delimiter = varargin {idelimiter+1}; + ncols = numel (regexp (str, sprintf ("[%s]", delimiter))) + 1; + endif + ## 6. Reshape; watch out, we need a transpose + nrows = ceil (numel (varargout{1}) / ncols); + pad = mod (numel (varargout{1}), ncols); + if (pad > 0) + pad = ncols - pad; + varargout{1}(end+1 : end+pad) = NaN; + endif + varargout{1} = reshape (varargout{1}, ncols, nrows)'; + ## ML replaces empty values with NaNs + varargout{1}(find (isnan (varargout{1}))) = 0; + endif + endfunction - %!test %! f = tmpnam (); %! d = rand (5, 3); @@ -195,6 +237,76 @@ %! unlink (f); %! assert (a, d(2:7, 1), 1e-2); +%% Test reading 2D matrix with empty format +%!test +%! f = tmpnam (); +%! d = rand (5, 2); +%! dlmwrite (f, d, "precision", "%5.2f"); +%! A = textread (f, "", "headerlines", 3); +%! unlink (f); +%! assert (A, d(4:5, :), 1e-2); + +%% Read multiple lines using empty format string +%!test +%! f = tmpnam (); +%! unlink (f); +%! fid = fopen (f, "w"); +%! d = rand (1, 4); +%! fprintf (fid, " %f %f %f %f ", d); +%! fclose (fid); +%! A = textread (f, ""); +%! unlink (f); +%! assert (A, d, 1e-6); + +%% Empty format, corner case = one line w/o EOL +%!test +%! f = tmpnam (); +%! unlink (f); +%! fid = fopen (f, "w"); +%! d = rand (1, 4); +%! fprintf (fid, " %f %f %f %f ", d); +%! fclose (fid); +%! A = textread (f, ""); +%! unlink (f); +%! assert (A, d, 1e-6); + +%% Read multiple lines using empty format string, missing data (should be 0) +%!test +%! f = tmpnam (); +%! unlink (f); +%! fid = fopen (f, "w"); +%! d = rand (1, 4); +%! fprintf (fid, "%f, %f, , %f, %f ", d); +%! fclose (fid); +%! A = textread (f, ""); +%! unlink (f); +%! assert (A, [ d(1:2) 0 d(3:4)], 1e-6); + +%% Test with empty positions - ML returns 0 for empty fields +%!test +%! f = tmpnam (); +%! unlink (f); +%! fid = fopen (f, "w"); +%! d = rand (1, 4); +%! fprintf (fid, ",2,,4\n5,,7,\n"); +%! fclose (fid); +%! A = textread (f, "", "delimiter", ","); +%! unlink (f); +%! assert (A, [0 2 0 4; 5 0 7 0], 1e-6); + +%% Another test with empty format + positions, now with more incomplete lower +%% row (must be appended with zeros to get rectangular matrix) +%!test +%! f = tmpnam (); +%! unlink (f); +%! fid = fopen (f, "w"); +%! d = rand (1, 4); +%! fprintf (fid, ",2,,4\n5,\n"); +%! fclose (fid); +%! A = textread (f, "", "delimiter", ","); +%! unlink (f); +%! assert (A, [0 2 0 4; 5 0 0 0], 1e-6); + %% Test input validation %!error textread () %!error textread (1)
--- a/scripts/io/textscan.m +++ b/scripts/io/textscan.m @@ -67,19 +67,26 @@ ## The second output, @var{position}, provides the position, in characters, ## from the beginning of the file. ## +## If the format string is empty (not: omitted) and the file contains only +## numeric data (excluding headerlines), textscan will return data in a number +## of columns matching the number of numeric fields on the first data line of +## the file. +## ## @seealso{dlmread, fscanf, load, strread, textread} ## @end deftypefn function [C, position] = textscan (fid, format = "%f", varargin) BUFLENGTH = 4096; ## Read buffer - + emptfmt = 0; ## Signals deliberately empty format string + ## Check input if (nargin < 1) print_usage (); endif if (isempty (format)) + emptfmt = 1; format = "%f"; endif @@ -132,6 +139,9 @@ ## Matlab says default delimiter = whitespace. ## strread() will pick this up further args(end+1:end+2) = {'delimiter', ""}; + delimiter = ""; + else + delimiter = args{find (strcmpi (args, "delimiter")) + 1}; endif collop = false; @@ -157,6 +167,15 @@ args(end+1:end+2) = {"returnonerror", 1}; endif + ## Check if a headerlines argument is specified + headerlines = find (strcmpi (args, "headerlines"), 1); + if (! isempty (headerlines)) + ## Yep. But it is stray when reading from strings... + if (ischar (fid)) + warning ("textscan: 'headerlines' ignored when reading from strings"); + endif + endif + if (ischar (fid)) ## Read from a text string if (nargout == 2) @@ -166,7 +185,6 @@ else st_pos = ftell (fid); ## Skip header lines if requested - headerlines = find (strcmpi (args, "headerlines"), 1); if (! isempty (headerlines)) ## Beware of missing or wrong headerline value if (headerlines == numel (args) @@ -268,9 +286,10 @@ endif ## Strip trailing EOL to avoid returning stray missing values (f. strread). - ## However, in case of CollectOutput request, presence of EOL is required + ## However, in case of CollectOutput request, presence of EOL is required; + ## also in case of deliberately entered empty format string eol_at_end = strcmp (str(end-length (eol_char) + 1 : end), eol_char); - if (collop) + if (collop || emptfmt) if (! eol_at_end) str(end+1 : end+length (eol_char)) = eol_char; endif @@ -284,6 +303,36 @@ C = cell (1, num_fields); [C{:}] = strread (str, format, args{:}); + ## I.c.o. empty format, match nr. of cols to nr. of fields on first read line + if (emptfmt) + ## Find end of first line + eoi = index (str, eol_char); + if (eoi) + ## str contains an EOL, proceed with assessing nr. of columns + ncols = countcols (C, str(1 : eoi-1), delimiter, whitespace); + ## See if lowermost data row must be completed + pad = mod (numel (C{1}), ncols); + if (pad) + ## Textscan returns NaNs for empty fields + C(1) = [C{1}; NaN(ncols - pad, 1)]; + endif + ## Replace NaNs with EmptyValue, if any + ipos = find (strcmpi (args, "emptyvalue")); + if (ipos) + C{1}(find (isnan (C{1}))) = args{ipos+1}; + endif + ## Compute nr. of rows + nrows = floor (numel (C{1}) / ncols); + ## Reshape C; watch out, transpose needed + C(1) = reshape (C{1}, ncols, numel (C{1}) / ncols)'; + ## Distribute columns over C and wipe cols 2:end of C{1} + for ii=2:ncols + C(ii) = C{1}(:, ii); + endfor + C{1} = C{1}(:, 1); + endif + endif + ## If requested, collect output columns of same class if (collop) C = colloutp (C); @@ -297,6 +346,21 @@ endfunction +## Assess nr of data fields on first line of data +function ncols = countcols (C, str, dlm, wsp) + + if (isempty (dlm)) + ## Field separator = whitespace. Fold multiple whitespace into one + str = regexprep (str, sprintf ("[%s]", wsp), " "); + str = strtrim (str); + ncols = numel (strfind (str, " ")) + 1; + else + ncols = numel (regexp (str, sprintf ("[%s]", dlm))) + 1; + endif + +endfunction + + ## Collect consecutive columns of same class into one cell column function C = colloutp (C) @@ -520,6 +584,77 @@ %! assert (A{1}, [d(1); d(3)], 1e-6); %! assert (A{2}, [d(2); d(4)], 1e-6); -%!error <missing or illegal value for> textread (file_in_loadpath ("textscan.m"), "", "headerlines") -%!error <missing or illegal value for> textread (file_in_loadpath ("textscan.m"), "", "headerlines", 'hh') -%!error <character value required for> textread (file_in_loadpath ("textscan.m"), "", "endofline", true) +%% Tests reading with empty format, should return proper nr of columns +%!test +%! f = tmpnam (); +%! fid = fopen (f, "w+"); +%! fprintf (fid, " 1 2 3 4\n5 6 7 8"); +%! fseek (fid, 0, "bof"); +%! A = textscan (fid, ""); +%! fclose (fid); +%! unlink (f); +%! assert (A{1}, [1 ; 5], 1e-6); +%! assert (A{2}, [2 ; 6], 1e-6); +%! assert (A{3}, [3 ; 7], 1e-6); +%! assert (A{4}, [4 ; 8], 1e-6); + +%% Tests reading with empty format; empty fields & incomplete lower row +%!test +%! f = tmpnam (); +%! fid = fopen (f, "w+"); +%! fprintf (fid, " ,2,,4\n5,6"); +%! fseek (fid, 0, "bof"); +%! A = textscan (fid, "", "delimiter", ",", "EmptyValue", 999, "CollectOutput" , 1); +%! fclose (fid); +%! unlink (f); +%! assert (A{1}, [999, 2, 999, 4; 5, 6, 999, 999], 1e-6); + +%% Error message tests + +%!test +%! f = tmpnam (); +%! fid = fopen (f, "w+"); +%! msg1 = "Missing or illegal value for 'headerlines'"; +%! try +%! A = textscan (fid, "", "headerlines"); +%! end_try_catch; +%! fclose (fid); +%! unlink (f); +%! assert (msg1, lasterr); + +%!test +%! f = tmpnam (); +%! fid = fopen (f, "w+"); +%! msg1 = "Missing or illegal value for 'headerlines'"; +%! try +%! A = textscan (fid, "", "headerlines", "hh"); +%! end_try_catch; +%! fclose (fid); +%! unlink (f); +%! assert (msg1, lasterr); + +%!test +%! f = tmpnam (); +%! fid = fopen (f, "w+"); +%! fprintf (fid,"some_string"); +%! fseek (fid, 0, "bof"); +%! msg1 = "textscan: illegal EndOfLine character value specified"; +%! try +%! A = textscan (fid, "%f", "EndOfLine", "\n\r"); +%! end_try_catch; +%! fclose (fid); +%! unlink (f); +%! assert (msg1, lasterr); + +%!test +%! f = tmpnam (); +%! fid = fopen (f, "w+"); +%! fprintf (fid,"some_string"); +%! fseek (fid, 0, "bof"); +%! msg1 = "textscan: character value required for EndOfLine"; +%! try +%! A = textscan (fid, "%f", "EndOfLine", 33); +%! end_try_catch; +%! fclose (fid); +%! unlink (f); +%! assert (msg1, lasterr);