# HG changeset patch # User Philip Nienhuis # Date 1312409128 25200 # Node ID ac00ae83ea2eba7d0a1ee65670334ed6e157233f # Parent 34e5434563df2f1fc0e370996ac55bf33c494615 More compatibility improvements for textscan and strread * strread.m: Implement %u format to int32 type. Fix bug with Mac '\r' end-of-line character. * textscan.m: Implement CollectOutput option. diff --git a/scripts/io/strread.m b/scripts/io/strread.m --- a/scripts/io/strread.m +++ b/scripts/io/strread.m @@ -37,18 +37,20 @@ ## @item %s ## The word is parsed as a string. ## +## @itemx %f +## @itemx %n +## The word is parsed as a number and converted to double. +## ## @item %d -## @itemx %f ## @itemx %u -## @itemx %n -## The word is parsed as a number (and converted to double). +## The word is parsed as a number and converted to int32. ## ## @item %*', '%*f', '%*s ## The word is skipped. ## ## For %s and %d, %f, %n, %u and the associated %*s @dots{} specifiers an ## optional width can be specified as %Ns, etc. where N is an integer > 1. -## For %f, formats like %N.Mf are allowed. +## For %f, format specifiers like %N.Mf are allowed. ## ## @item literals ## In addition the format may contain literal character strings; these will be @@ -354,10 +356,17 @@ str = cell2mat (str); ## Remove leading & trailing space, but preserve delimiters. str = strtrim (str); + ## FIXME: Double strrep on str is enormously expensive of CPU time. + ## Can this be eliminated + ## Wipe leading and trailing whitespace on each line (it may be delimiter too) + if (! isempty (eol_char)) + str = strrep (str, [eol_char " "], eol_char); + str = strrep (str, [" " eol_char], eol_char); + endif endif ## Split 'str' into words - words = split_by (str, delimiter_str, mult_dlms_s1); + words = split_by (str, delimiter_str, mult_dlms_s1, eol_char); if (! isempty (white_spaces)) ## Trim leading and trailing white_spaces ## FIXME: Is this correct? strtrim clears what matches isspace(), not @@ -398,7 +407,8 @@ ## 1. Assess "period" in the split-up words array ( < num_words_per_line). ## Could be done using EndOfLine but that prohibits EndOfLine = "" option. - ## Alternative below goes by simply parsing the first "line" of words: + ## Alternative below goes by simply parsing a first grab of words + ## and counting words until the fmt_words array is exhausted: iwrd = 1; iwrdp = 0; iwrdl = length (words{iwrd}); for ii = 1:numel (fmt_words) @@ -480,7 +490,7 @@ if ((idf(ii) || idg(ii)) && (rows(words) < num_words_per_line)) if (idf(ii)) s = strfind (words(icol, 1), fmt_words{ii}); - if (isempty (s)) + if (isempty (s{:})) error ("strread: Literal '%s' not found in column %d", fmt_words{ii}, icol); endif s = s{:}(1); @@ -585,6 +595,11 @@ n = cellfun ("isempty", data); ### FIXME - erroneously formatted data lead to NaN, not an error data = str2double (data); + if (! isempty (regexp (fmt_words{m}, "%[du]"))) + ## Cast to integer + ## FIXME: NaNs will be transformed into zeros + data = int32 (data); + end data(n) = numeric_fill_value; if (pad_out) data(end+1:num_lines) = numeric_fill_value; @@ -607,7 +622,11 @@ if (numel (nfmt) > 1) sprec = str2double (nfmt{2}); data = 10^-sprec * round (10^sprec * data); - endif + elseif (! isempty (regexp (fmt_words{m}, "[du]"))) + ## Cast to integer + ## FIXME: NaNs will be transformed into zeros + data = int32 (data); + end varargout{k} = data.'; k++; case "s" @@ -643,17 +662,16 @@ endfunction -function out = split_by (text, sep, mult_dlms_s1) +function out = split_by (text, sep, mult_dlms_s1, eol_char) ## Check & if needed, process MultipleDelimsAsOne parameter if (mult_dlms_s1) mult_dlms_s1 = true; + ## FIXME: Should re-implement strsplit() function here in order + ## to avoid strrep on megabytes of data. ## If \n is in sep collection we need to enclose it in spaces in text ## to avoid it being included in consecutive delim series - ## FIXME: This only works if eol is LF or CRLF. Won't work on Mac - ## Should probably use eol_char in this case. - ## Also unlikely to work if is not in white_space - text = strrep (text, "\n", " \n "); + text = strrep (text, eol_char, [" " eol_char " "]); else mult_dlms_s1 = false; endif @@ -661,7 +679,7 @@ ## Split text string along delimiters out = strsplit (text, sep, mult_dlms_s1); ## In case of trailing delimiter, strip stray last empty word - if (any (sep == text(end))) + if (!isempty (out) && any (sep == text(end))) out(end) = []; endif @@ -673,13 +691,8 @@ %!test %! [a, b] = strread ("1 2", "%f%f"); -%! assert (a == 1 && b == 2); - -%!test -%! str = "# comment\n# comment\n1 2 3"; -%! [a, b] = strread (str, '%d %s', 'commentstyle', 'shell', 'endofline', "\n"); -%! assert (a, [1; 3]); -%! assert (b, {"2"}); +%! assert (a, 1); +%! assert (b, 2); %!test %! str = ''; @@ -689,7 +702,7 @@ %! str = sprintf ('%s %.6f %s\n', str, a(k), b(k)); %! endfor %! [aa, bb] = strread (str, '%f %s'); -%! assert (a, aa, 1e-5); +%! assert (a, aa, 1e-6); %! assert (cellstr (b), bb); %!test @@ -700,7 +713,7 @@ %! str = sprintf ('%s %.6f %s\n', str, a(k), b(k)); %! endfor %! aa = strread (str, '%f %*s'); -%! assert (a, aa, 1e-5); +%! assert (a, aa, 1e-6); %!test %! str = sprintf ('/* this is\nacomment*/ 1 2 3'); @@ -708,6 +721,12 @@ %! assert (a, [1; 2; 3]); %!test +%! str = "# comment\n# comment\n1 2 3"; +%! [a, b] = strread (str, '%n %s', 'commentstyle', 'shell', 'endofline', "\n"); +%! assert (a, [1; 3]); +%! assert (b, {"2"}); + +%!test %! str = sprintf ("Tom 100 miles/hr\nDick 90 miles/hr\nHarry 80 miles/hr"); %! fmt = "%s %f miles/hr"; %! c = cell (1, 2); @@ -738,14 +757,14 @@ %! assert (double (a{3}(end-5:end)), [32 110 97 109 101 62]); %!test -%! [a, b, c, d] = strread ("1,2,3,,5,6", "%d%d%d%d", 'delimiter', ','); -%! assert (c, 3); +%! [a, b, c, d] = strread ("1,2,3,,5,6", "%d%f%d%f", 'delimiter', ','); +%! assert (c, int32 (3)); %! assert (d, NaN); %!test -%! [a, b, c, d] = strread ("1,2,3,,5,6\n", "%d%d%d%d", 'delimiter', ','); +%! [a, b, c, d] = strread ("1,2,3,,5,6\n", "%d%d%f%d", 'delimiter', ','); %! assert (c, [3; NaN]); -%! assert (d, [NaN; NaN]); +%! assert (d, int32 ([0; 0])); %!test %! # Default format (= %f) @@ -760,24 +779,18 @@ %!test %! # TreatAsEmpty -%! [a, b, c, d] = strread ("1,2,3,NN,5,6\n", "%d%d%d%d", 'delimiter', ',', 'TreatAsEmpty', 'NN'); -%! assert (c, [3; NaN]); +%! [a, b, c, d] = strread ("1,2,3,NN,5,6\n", "%d%d%d%f", 'delimiter', ',', 'TreatAsEmpty', 'NN'); +%! assert (c, int32 ([3; 0])); %! assert (d, [NaN; NaN]); %!test %! # No delimiters at all besides EOL. Plain reading numbers & strings %! str = "Text1Text2Text\nText398Text4Text\nText57Text"; %! c = textscan (str, "Text%dText%1sText"); -%! assert (c{1}, [1; 398; 57]); +%! assert (c{1}, int32 ([1; 398; 57])); %! assert (c{2}(1:2), {'2'; '4'}); %! assert (isempty (c{2}{3}), true); -%!test -%! # No delimiters at all besides EOL. Skip fields, even empty fields -%! str = "Text1Text2Text\nTextText4Text\nText57Text"; -%! c = textscan (str, "Text%*dText%dText"); -%! assert (c{1}, [2; 4; NaN]); - %% MultipleDelimsAsOne %!test %! str = "11, 12, 13,, 15\n21,, 23, 24, 25\n,, 33, 34, 35"; diff --git a/scripts/io/textscan.m b/scripts/io/textscan.m --- a/scripts/io/textscan.m +++ b/scripts/io/textscan.m @@ -34,13 +34,18 @@ ## @code{strread}, this function supports a few more: ## ## @itemize -## @item "headerlines": -## The first @var{value} number of lines of @var{fid} are skipped. +## @item "collectoutput": +## A value of 1 or true instructs textscan to concatenate consecutive columns +## of the same class in the output cell array. A value of 0 or false (default) +## leaves output in distinct columns. ## ## @item "endofline": -## Specify a single character or "\r\n". If no value is given, it will be -## inferred from the file. If set to "" (empty string) EOLs are ignored as -## delimiters. +## Specify "\r", "\n" or "\r\n" (for CR, LF, or CRLF). If no value is given, +## it will be inferred from the file. If set to "" (empty string) EOLs are +## ignored as delimiters and added to whitespace. +## +## @item "headerlines": +## The first @var{value} number of lines of @var{fid} are skipped. ## ## @item "returnonerror": ## If set to numerical 1 or true (default), return normally when read errors @@ -116,6 +121,20 @@ args(end+1:end+2) = {'delimiter', ""}; endif + collop = false; + ipos = find (strcmpi (args, "collectoutput")); + if (! isempty (ipos)) + ## Search & concatenate consecutive columns of same class requested + if (isscalar (args{ipos+1}) + && (islogical (args{ipos+1}) || isnumeric (args{ipos+1}))) + collop = args{ipos+1}; + else + warning ("textscan: illegal value for CollectOutput parameter - ignored"); + endif + ## Remove argument before call to strread() below + args(ipos:ipos+1) = []; + endif + if (any (strcmpi (args, "returnonerror"))) ## Because of the way strread() reads data (columnwise) this parameter ## can't be neatly implemented. strread() will pick it up anyway @@ -164,6 +183,9 @@ if (! isempty (endofline)) if (ischar (args{endofline + 1})) eol_char = args{endofline + 1}; + if (isempty (strmatch (eol_char, {"", "\n", "\r", "\r\n"}, 'exact'))) + error ("textscan: illegal EndOfLine character value specified"); + endif else error ("textscan: character value required for EndOfLine"); endif @@ -195,6 +217,11 @@ ## Call strread to make it do the real work C = cell (1, num_fields); [C{:}] = strread (str, format, args{:}); + + ## If requested, collect output columns of same class + if (collop) + C = colloutp (C); + endif if (nargout == 2) position = ftell (fid); @@ -203,6 +230,32 @@ endfunction +## Collect consecutive columns of same class into one cell column +function C = colloutp (C) + + ## Start at rightmost column and work backwards to avoid ptr mixup + ii = numel (C); + while ii > 1 + clss1 = class (C{ii}); + jj = ii; + while (jj > 1 && strcmp (clss1, class (C{jj - 1}))) + ## Column to the left is still same class; check next column to the left + --jj; + endwhile + if (jj < ii) + ## Concatenate columns into current column + C{jj} = [C{jj : ii}]; + ## Wipe concatenated columns to the right, resume search to the left + C(jj+1 : ii) = []; + ii = jj - 1; + else + ## No similar class in column to the left, search from there + --ii; + endif + endwhile + +endfunction + %!test %! str = "1, 2, 3, 4\n 5, , , 8\n 9, 10, 11, 12"; %! fmtstr = "%f %d %f %s"; @@ -218,13 +271,13 @@ %! str = sprintf ("%g miles/hr = %g kilometers/hr\n", b); %! fmt = "%f miles/hr = %f kilometers/hr"; %! c = textscan (str, fmt); -%! assert (b(1,:)', c{1}); -%! assert (b(2,:)', c{2}); +%! assert (b(1,:)', c{1}, 1e-5); +%! assert (b(2,:)', c{2}, 1e-5); #%!test #%! str = "13, 72, NA, str1, 25\r\n// Middle line\r\n36, na, 05, str3, 6"; #%! a = textscan(str, '%d %n %f %s %n', 'delimiter', ',','treatAsEmpty', {'NA', 'na'},'commentStyle', '//'); -#%! assert (a{1}, [13; 36]); +#%! assert (a{1}, int32([13; 36])); #%! assert (a{2}, [72; NaN]); #%! assert (a{3}, [NaN; 5]); #%! assert (a{4}, {"str1"; "str3"}); @@ -237,9 +290,9 @@ %! str = [str "Km:25 = hhhZ\r\n"]; %! fmt = "Km:%d = hhh%1sjjj miles%dhour"; %! a = textscan (str, fmt, 'delimiter', ' '); -%! assert (a{1}', [10 15 2 25], 1e-5); +%! assert (a{1}', int32([10 15 2 25])); %! assert (a{2}', {'B' 'J' 'R' 'Z'}); -%! assert (a{3}', [16 241 3 NaN], 1e-5); +%! assert (a{3}', int32([16 241 3 0])); %% Test with default endofline parameter %!test @@ -251,6 +304,22 @@ %! c = textscan ("L1\nL2", "%s", 'endofline', ''); %! assert (int8(c{:}{:}), int8([ 76, 49, 10, 76, 50 ])); +%!test +%! # No delimiters at all besides EOL. Skip fields, even empty fields +%! str = "Text1Text2Text\nTextText4Text\nText57Text"; +%! c = textscan (str, "Text%*dText%dText"); +%! assert (c{1}, int32 ([2; 4; 0])); + +%!test +%% CollectOutput test +%! b = [10:10:100]; +%! b = [b; 8*b/5; 8*b*1000/5]; +%! str = sprintf ("%g miles/hr = %g (%g) kilometers (meters)/hr\n", b); +%! fmt = "%f miles%s %s %f (%f) kilometers %*s"; +%! c = textscan (str, fmt, 'collectoutput', 1); +%! assert (size(c{3}), [10, 2]); +%! assert (size(c{2}), [10, 2]); + %% Test input validation %!error textscan () %!error textscan (single (4)) @@ -258,3 +327,4 @@ %!error textscan ("Hello World", 2) %!error [C, pos] = textscan ("Hello World") %!error textscan ("Hello World", '%s', 'EndOfLine', 3) +