changeset 12914:ac00ae83ea2e

More compatibility improvements for textscan and strread * strread.m: Implement %u format to int32 type. Fix bug with Mac '\r' end-of-line character. * textscan.m: Implement CollectOutput option.
author Philip Nienhuis <prnienhuis@users.sf.net>
date Wed, 03 Aug 2011 15:05:28 -0700
parents 34e5434563df
children 9e1b9ca119eb
files scripts/io/strread.m scripts/io/textscan.m
diffstat 2 files changed, 129 insertions(+), 46 deletions(-) [+]
line wrap: on
line diff
--- a/scripts/io/strread.m
+++ b/scripts/io/strread.m
@@ -37,18 +37,20 @@
 ## @item %s
 ## The word is parsed as a string.
 ##
+## @itemx %f
+## @itemx %n
+## The word is parsed as a number and converted to double.
+##
 ## @item  %d
-## @itemx %f
 ## @itemx %u
-## @itemx %n
-## The word is parsed as a number (and converted to double).
+## The word is parsed as a number and converted to int32.
 ##
 ## @item %*', '%*f', '%*s
 ## The word is skipped.
 ##
 ## For %s and %d, %f, %n, %u and the associated %*s @dots{} specifiers an
 ## optional width can be specified as %Ns, etc. where N is an integer > 1.
-## For %f, formats like %N.Mf are allowed.
+## For %f, format specifiers like %N.Mf are allowed.
 ##
 ## @item literals
 ## In addition the format may contain literal character strings; these will be
@@ -354,10 +356,17 @@
     str = cell2mat (str);
     ## Remove leading & trailing space, but preserve delimiters.
     str = strtrim (str);
+    ## FIXME: Double strrep on str is enormously expensive of CPU time.
+    ## Can this be eliminated
+    ## Wipe leading and trailing whitespace on each line (it may be delimiter too)
+    if (! isempty (eol_char))
+      str = strrep (str, [eol_char " "], eol_char);
+      str = strrep (str, [" " eol_char], eol_char);
+    endif
   endif
 
   ## Split 'str' into words
-  words = split_by (str, delimiter_str, mult_dlms_s1);
+  words = split_by (str, delimiter_str, mult_dlms_s1, eol_char);
   if (! isempty (white_spaces))
     ## Trim leading and trailing white_spaces
     ## FIXME: Is this correct?  strtrim clears what matches isspace(), not
@@ -398,7 +407,8 @@
 
       ## 1. Assess "period" in the split-up words array ( < num_words_per_line).
       ## Could be done using EndOfLine but that prohibits EndOfLine = "" option.
-      ## Alternative below goes by simply parsing the first "line" of words:
+      ## Alternative below goes by simply parsing a first grab of words
+      ## and counting words until the fmt_words array is exhausted:
       iwrd = 1; iwrdp = 0; iwrdl = length (words{iwrd});
       for ii = 1:numel (fmt_words)
 
@@ -480,7 +490,7 @@
         if ((idf(ii) || idg(ii)) && (rows(words) < num_words_per_line))
           if (idf(ii))
             s = strfind (words(icol, 1), fmt_words{ii});
-            if (isempty (s))
+            if (isempty (s{:}))
               error ("strread: Literal '%s' not found in column %d", fmt_words{ii}, icol);
             endif
             s = s{:}(1);
@@ -585,6 +595,11 @@
           n = cellfun ("isempty", data);
           ### FIXME - erroneously formatted data lead to NaN, not an error
           data = str2double (data);
+          if (! isempty (regexp (fmt_words{m}, "%[du]")))
+            ## Cast to integer 
+            ## FIXME: NaNs will be transformed into zeros
+            data = int32 (data);
+          end
           data(n) = numeric_fill_value;
           if (pad_out)
             data(end+1:num_lines) = numeric_fill_value;
@@ -607,7 +622,11 @@
               if (numel (nfmt) > 1)
                 sprec = str2double (nfmt{2});
                 data = 10^-sprec * round (10^sprec * data);
-              endif
+              elseif (! isempty (regexp (fmt_words{m}, "[du]")))
+                ## Cast to integer 
+                ## FIXME: NaNs will be transformed into zeros
+                data = int32 (data);
+              end
               varargout{k} = data.';
               k++;
             case "s"
@@ -643,17 +662,16 @@
 
 endfunction
 
-function out = split_by (text, sep, mult_dlms_s1)
+function out = split_by (text, sep, mult_dlms_s1, eol_char)
 
   ## Check & if needed, process MultipleDelimsAsOne parameter
   if (mult_dlms_s1)
     mult_dlms_s1 = true;
+    ## FIXME: Should re-implement strsplit() function here in order
+    ## to avoid strrep on megabytes of data.
     ## If \n is in sep collection we need to enclose it in spaces in text
     ## to avoid it being included in consecutive delim series
-    ## FIXME: This only works if eol is LF or CRLF.  Won't work on Mac
-    ##        Should probably use eol_char in this case.
-    ##        Also unlikely to work if <space> is not in white_space
-    text = strrep (text, "\n", " \n ");
+    text = strrep (text, eol_char, [" " eol_char " "]);
   else
     mult_dlms_s1 = false;
   endif
@@ -661,7 +679,7 @@
   ## Split text string along delimiters
   out = strsplit (text, sep, mult_dlms_s1);
   ## In case of trailing delimiter, strip stray last empty word
-  if (any (sep == text(end)))
+  if (!isempty (out) && any (sep == text(end)))
     out(end) = [];
   endif
   
@@ -673,13 +691,8 @@
 
 %!test
 %! [a, b] = strread ("1 2", "%f%f");
-%! assert (a == 1 && b == 2);
-
-%!test
-%! str = "# comment\n# comment\n1 2 3";
-%! [a, b] = strread (str, '%d %s', 'commentstyle', 'shell', 'endofline', "\n");
-%! assert (a, [1; 3]);
-%! assert (b, {"2"});
+%! assert (a, 1);
+%! assert (b, 2);
 
 %!test
 %! str = '';
@@ -689,7 +702,7 @@
 %!   str = sprintf ('%s %.6f %s\n', str, a(k), b(k));
 %! endfor
 %! [aa, bb] = strread (str, '%f %s');
-%! assert (a, aa, 1e-5);
+%! assert (a, aa, 1e-6);
 %! assert (cellstr (b), bb);
 
 %!test
@@ -700,7 +713,7 @@
 %!   str = sprintf ('%s %.6f %s\n', str, a(k), b(k));
 %! endfor
 %! aa = strread (str, '%f %*s');
-%! assert (a, aa, 1e-5);
+%! assert (a, aa, 1e-6);
 
 %!test
 %! str = sprintf ('/* this is\nacomment*/ 1 2 3');
@@ -708,6 +721,12 @@
 %! assert (a, [1; 2; 3]);
 
 %!test
+%! str = "# comment\n# comment\n1 2 3";
+%! [a, b] = strread (str, '%n %s', 'commentstyle', 'shell', 'endofline', "\n");
+%! assert (a, [1; 3]);
+%! assert (b, {"2"});
+
+%!test
 %! str = sprintf ("Tom 100 miles/hr\nDick 90 miles/hr\nHarry 80 miles/hr");
 %! fmt = "%s %f miles/hr";
 %! c = cell (1, 2);
@@ -738,14 +757,14 @@
 %! assert (double (a{3}(end-5:end)), [32 110 97 109 101 62]);
 
 %!test
-%! [a, b, c, d] = strread ("1,2,3,,5,6", "%d%d%d%d", 'delimiter', ',');
-%! assert (c, 3);
+%! [a, b, c, d] = strread ("1,2,3,,5,6", "%d%f%d%f", 'delimiter', ',');
+%! assert (c, int32 (3));
 %! assert (d, NaN);
 
 %!test
-%! [a, b, c, d] = strread ("1,2,3,,5,6\n", "%d%d%d%d", 'delimiter', ',');
+%! [a, b, c, d] = strread ("1,2,3,,5,6\n", "%d%d%f%d", 'delimiter', ',');
 %! assert (c, [3; NaN]);
-%! assert (d, [NaN; NaN]);
+%! assert (d, int32 ([0; 0]));
 
 %!test
 %! # Default format (= %f)
@@ -760,24 +779,18 @@
 
 %!test
 %! # TreatAsEmpty
-%! [a, b, c, d] = strread ("1,2,3,NN,5,6\n", "%d%d%d%d", 'delimiter', ',', 'TreatAsEmpty', 'NN');
-%! assert (c, [3; NaN]);
+%! [a, b, c, d] = strread ("1,2,3,NN,5,6\n", "%d%d%d%f", 'delimiter', ',', 'TreatAsEmpty', 'NN');
+%! assert (c, int32 ([3; 0]));
 %! assert (d, [NaN; NaN]);
 
 %!test
 %! # No delimiters at all besides EOL.  Plain reading numbers & strings
 %! str = "Text1Text2Text\nText398Text4Text\nText57Text";
 %! c = textscan (str, "Text%dText%1sText");
-%! assert (c{1}, [1; 398; 57]);
+%! assert (c{1}, int32 ([1; 398; 57]));
 %! assert (c{2}(1:2), {'2'; '4'});
 %! assert (isempty (c{2}{3}), true);
 
-%!test
-%! # No delimiters at all besides EOL.  Skip fields, even empty fields
-%! str = "Text1Text2Text\nTextText4Text\nText57Text";
-%! c = textscan (str, "Text%*dText%dText");
-%! assert (c{1}, [2; 4; NaN]);
-
 %% MultipleDelimsAsOne
 %!test
 %! str = "11, 12, 13,, 15\n21,, 23, 24, 25\n,, 33, 34, 35";
--- a/scripts/io/textscan.m
+++ b/scripts/io/textscan.m
@@ -34,13 +34,18 @@
 ## @code{strread}, this function supports a few more:
 ##
 ## @itemize
-## @item "headerlines":
-## The first @var{value} number of lines of @var{fid} are skipped.
+## @item "collectoutput":
+## A value of 1 or true instructs textscan to concatenate consecutive columns
+## of the same class in the output cell array.  A value of 0 or false (default)
+## leaves output in distinct columns.
 ##
 ## @item "endofline":
-## Specify a single character or "\r\n".  If no value is given, it will be
-## inferred from the file.  If set to "" (empty string) EOLs are ignored as
-## delimiters.
+## Specify "\r", "\n" or "\r\n" (for CR, LF, or CRLF).  If no value is given,
+## it will be inferred from the file.  If set to "" (empty string) EOLs are
+## ignored as delimiters and added to whitespace.
+##
+## @item "headerlines":
+## The first @var{value} number of lines of @var{fid} are skipped.
 ##
 ## @item "returnonerror":
 ## If set to numerical 1 or true (default), return normally when read errors
@@ -116,6 +121,20 @@
     args(end+1:end+2) = {'delimiter', ""};
   endif
 
+  collop = false;
+  ipos = find (strcmpi (args, "collectoutput"));
+  if (! isempty (ipos))
+    ## Search & concatenate consecutive columns of same class requested
+    if (isscalar (args{ipos+1})
+        && (islogical (args{ipos+1}) || isnumeric (args{ipos+1})))
+      collop = args{ipos+1};
+    else
+      warning ("textscan: illegal value for CollectOutput parameter - ignored");
+    endif
+    ## Remove argument before call to strread() below
+    args(ipos:ipos+1) = [];
+  endif
+
   if (any (strcmpi (args, "returnonerror")))
     ## Because of the way strread() reads data (columnwise) this parameter
     ## can't be neatly implemented.  strread() will pick it up anyway
@@ -164,6 +183,9 @@
   if (! isempty (endofline))
     if (ischar (args{endofline + 1})) 
       eol_char = args{endofline + 1};
+      if (isempty (strmatch (eol_char, {"", "\n", "\r", "\r\n"}, 'exact')))
+        error ("textscan: illegal EndOfLine character value specified");
+      endif
     else
       error ("textscan: character value required for EndOfLine"); 
     endif
@@ -195,6 +217,11 @@
   ## Call strread to make it do the real work
   C = cell (1, num_fields);
   [C{:}] = strread (str, format, args{:});
+  
+  ## If requested, collect output columns of same class
+  if (collop)
+    C = colloutp (C);
+  endif
 
   if (nargout == 2)
     position = ftell (fid);
@@ -203,6 +230,32 @@
 endfunction
 
 
+## Collect consecutive columns of same class into one cell column
+function C = colloutp (C)
+
+  ## Start at rightmost column and work backwards to avoid ptr mixup
+  ii = numel (C);
+  while ii > 1
+    clss1 = class (C{ii});
+    jj = ii;
+    while  (jj > 1 && strcmp (clss1, class (C{jj - 1})))
+      ## Column to the left is still same class; check next column to the left
+      --jj;
+    endwhile
+    if (jj < ii)
+      ## Concatenate columns into current column
+      C{jj} = [C{jj : ii}];
+      ## Wipe concatenated columns to the right, resume search to the left
+      C(jj+1 : ii) = [];
+      ii = jj - 1;
+    else
+      ## No similar class in column to the left, search from there
+      --ii;
+    endif
+  endwhile
+
+endfunction
+
 %!test
 %! str = "1,  2,  3,  4\n 5,  ,  ,  8\n 9, 10, 11, 12";
 %! fmtstr = "%f %d %f %s";
@@ -218,13 +271,13 @@
 %! str = sprintf ("%g miles/hr = %g kilometers/hr\n", b);
 %! fmt = "%f miles/hr = %f kilometers/hr";
 %! c = textscan (str, fmt);
-%! assert (b(1,:)', c{1});
-%! assert (b(2,:)', c{2});
+%! assert (b(1,:)', c{1}, 1e-5);
+%! assert (b(2,:)', c{2}, 1e-5);
 
 #%!test
 #%! str = "13, 72, NA, str1, 25\r\n// Middle line\r\n36, na, 05, str3, 6";
 #%! a = textscan(str, '%d %n %f %s %n', 'delimiter', ',','treatAsEmpty', {'NA', 'na'},'commentStyle', '//');
-#%! assert (a{1}, [13; 36]);
+#%! assert (a{1}, int32([13; 36]));
 #%! assert (a{2}, [72; NaN]);
 #%! assert (a{3}, [NaN; 5]);
 #%! assert (a{4}, {"str1"; "str3"});
@@ -237,9 +290,9 @@
 %! str = [str "Km:25 = hhhZ\r\n"];
 %! fmt = "Km:%d = hhh%1sjjj miles%dhour";
 %! a = textscan (str, fmt, 'delimiter', ' ');
-%! assert (a{1}', [10 15 2 25], 1e-5);
+%! assert (a{1}', int32([10 15 2 25]));
 %! assert (a{2}', {'B' 'J' 'R' 'Z'});
-%! assert (a{3}', [16 241 3 NaN], 1e-5);
+%! assert (a{3}', int32([16 241 3 0]));
 
 %% Test with default endofline parameter
 %!test
@@ -251,6 +304,22 @@
 %! c = textscan ("L1\nL2", "%s", 'endofline', '');
 %! assert (int8(c{:}{:}), int8([ 76,  49,  10,  76,  50 ]));
 
+%!test
+%! # No delimiters at all besides EOL.  Skip fields, even empty fields
+%! str = "Text1Text2Text\nTextText4Text\nText57Text";
+%! c = textscan (str, "Text%*dText%dText");
+%! assert (c{1}, int32 ([2; 4; 0]));
+
+%!test
+%% CollectOutput test
+%! b = [10:10:100];
+%! b = [b; 8*b/5; 8*b*1000/5];
+%! str = sprintf ("%g miles/hr = %g (%g) kilometers (meters)/hr\n", b);
+%! fmt = "%f miles%s %s %f (%f) kilometers %*s";
+%! c = textscan (str, fmt, 'collectoutput', 1);
+%! assert (size(c{3}), [10, 2]);
+%! assert (size(c{2}), [10, 2]);
+
 %% Test input validation
 %!error textscan ()
 %!error textscan (single (4))
@@ -258,3 +327,4 @@
 %!error <must be a string> textscan ("Hello World", 2)
 %!error <cannot provide position information> [C, pos] = textscan ("Hello World")
 %!error <character value required> textscan ("Hello World", '%s', 'EndOfLine', 3)
+