comparison libinterp/parse-tree/lex.ll @ 16228:e19b1632d7c1

revamp most comment handling * comment-list.h (octave_comment_elt::comment_type): New value, full_line. * lex.h (lexical_feedback::comment_text): New member variable. (lexical_feedback::finish_comment): New function. (octave_lexer::grab_block_comment, octave_lexer::grab_comment_block, octave_lexer::process_comment): Delete. * lex.ll (BLOCK_COMMENT_START, LINE_COMMENT_START): New exclusive start states. (ANY_INCLUDING_NL): New pattern. (<INPUT_FILE_START>{ANY_INCLUDING_NL}): Use it instead of ".". (^{S}*{CCHAR}\{{S}*{NL}, <BLOCK_COMMENT_START>^{S}*{CCHAR}\{{S}*{NL}, <BLOCK_COMMENT_START>^{S}*{CCHAR}\}{S}*{NL}, <BLOCK_COMMENT_START>.*{NL}, {S}*{CCHAR}.*{NL}, <LINE_COMMENT_START>{S}*{CCHAR}.*{NL}, <LINE_COMMENT_START>{ANY_INCLUDING_NL}): New patterns and rules for handling comments. ({CCHAR}, ^{S}*{CCHAR}\{{S}*{NL}): Delete old rules for comments. (display_start_state): Also handle BLOCK_COMMENT_START and LINE_COMMENT_START.
author John W. Eaton <jwe@octave.org>
date Fri, 08 Mar 2013 17:13:54 -0500
parents 4a848eb52de2
children 7b7b1e4968e8
comparison
equal deleted inserted replaced
16227:054d9e8f99b6 16228:e19b1632d7c1
45 45
46 %s COMMAND_START 46 %s COMMAND_START
47 %s MATRIX_START 47 %s MATRIX_START
48 48
49 %x INPUT_FILE_START 49 %x INPUT_FILE_START
50
51 %x BLOCK_COMMENT_START
52 %x LINE_COMMENT_START
50 53
51 %{ 54 %{
52 55
53 #include <cctype> 56 #include <cctype>
54 #include <cstring> 57 #include <cstring>
250 POW ((\*\*)|(\^)) 253 POW ((\*\*)|(\^))
251 EPOW (\.{POW}) 254 EPOW (\.{POW})
252 IDENT ([_$a-zA-Z][_$a-zA-Z0-9]*) 255 IDENT ([_$a-zA-Z][_$a-zA-Z0-9]*)
253 EXPON ([DdEe][+-]?{D}+) 256 EXPON ([DdEe][+-]?{D}+)
254 NUMBER (({D}+\.?{D}*{EXPON}?)|(\.{D}+{EXPON}?)|(0[xX][0-9a-fA-F]+)) 257 NUMBER (({D}+\.?{D}*{EXPON}?)|(\.{D}+{EXPON}?)|(0[xX][0-9a-fA-F]+))
258
259 ANY_INCLUDING_NL (.|{NL})
260
255 %% 261 %%
256 262
257 %{ 263 %{
258 // Make script and function files start with a bogus token. This makes 264 // Make script and function files start with a bogus token. This makes
259 // the parser go down a special path. 265 // the parser go down a special path.
260 %} 266 %}
261 267
262 <INPUT_FILE_START>. { 268 <INPUT_FILE_START>{ANY_INCLUDING_NL} {
263 LEXER_DEBUG ("<INPUT_FILE_START>."); 269 LEXER_DEBUG ("<INPUT_FILE_START>{ANY_INCLUDING_NL}");
264 270
265 curr_lexer->xunput (yytext[0]); 271 curr_lexer->xunput (yytext[0]);
266 272
267 // May be reset later if we see "function" or "classdef" appears 273 // May be reset later if we see "function" or "classdef" appears
268 // as the first token. 274 // as the first token.
559 565
560 curr_lexer->looking_for_object_index = true; 566 curr_lexer->looking_for_object_index = true;
561 curr_lexer->at_beginning_of_statement = false; 567 curr_lexer->at_beginning_of_statement = false;
562 568
563 TOK_RETURN (']'); 569 TOK_RETURN (']');
570 }
571
572 %{
573 // Gobble comments.
574 %}
575
576 %{
577 // Start of a block comment. If the comment marker appears immediately
578 // after a block of full-line comments, finish the full line comment
579 // block.
580 %}
581
^{S}*{CCHAR}\{{S}*{NL} {
    // The backslash is doubled so that the debug text shows the pattern
    // literally; "\{" is not a valid C/C++ escape sequence.
    LEXER_DEBUG ("^{S}*{CCHAR}\\{{S}*{NL}");

    // Token to hand back if a pending full-line comment is closed here.
    int tok = 0;

    // NOTE(review): LINE_COMMENT_START is declared with %x (exclusive)
    // and this rule carries no start condition, so confirm that this
    // branch can actually be reached.
    if (curr_lexer->start_state () == LINE_COMMENT_START)
      {
        if (! curr_lexer->comment_text.empty ())
          tok = curr_lexer->finish_comment (octave_comment_elt::full_line);

        curr_lexer->pop_start_state ();
      }

    curr_lexer->push_start_state (BLOCK_COMMENT_START);

    // Return the whole match to the input; presumably it is then
    // rescanned by the <BLOCK_COMMENT_START> rule for the opening
    // marker, which does the line and nesting bookkeeping.
    yyless (0);

    if (tok > 0)
      COUNT_TOK_AND_RETURN (tok);
  }
602
<BLOCK_COMMENT_START>^{S}*{CCHAR}\{{S}*{NL} {
    // The backslash is doubled so that the debug text shows the pattern
    // literally; "\{" is not a valid C/C++ escape sequence.
    LEXER_DEBUG ("<BLOCK_COMMENT_START>^{S}*{CCHAR}\\{{S}*{NL}");

    // The match ends with a newline, so move to the start of the next
    // line.
    curr_lexer->input_line_number++;
    curr_lexer->current_input_column = 1;

    // A non-zero level means this opener is nested inside another block
    // comment.
    // NOTE(review): this assigns rather than appends, so any text
    // collected so far is replaced by a single newline -- confirm that
    // this is intended.
    if (curr_lexer->block_comment_nesting_level)
      curr_lexer->comment_text = "\n";

    curr_lexer->block_comment_nesting_level++;
  }
614
615 %{
616 // End of a block comment. If this block comment is nested inside
617 // another, wait for the outermost block comment block to be closed
618 // before storing the comment.
619 %}
620
<BLOCK_COMMENT_START>^{S}*{CCHAR}\}{S}*{NL} {
    // The backslash is doubled so that the debug text shows the pattern
    // literally; "\}" is not a valid C/C++ escape sequence.
    LEXER_DEBUG ("<BLOCK_COMMENT_START>^{S}*{CCHAR}\\}{S}*{NL}");

    // The match ends with a newline, so move to the start of the next
    // line.
    curr_lexer->input_line_number++;
    curr_lexer->current_input_column = 1;

    // Token to return once the outermost block comment is complete.
    int tok = 0;

    if (curr_lexer->block_comment_nesting_level > 1)
      {
        // Closing a nested block: the comment is not stored yet.
        // NOTE(review): this assigns rather than appends, so any text
        // collected so far is replaced by a single newline -- confirm
        // that this is intended.
        curr_lexer->comment_text = "\n";
      }
    else
      tok = curr_lexer->finish_comment (octave_comment_elt::block);

    curr_lexer->block_comment_nesting_level--;

    // BLOCK_COMMENT_START is pushed only once, by the rule that sees
    // the outermost opening marker, so it must be popped only when the
    // outermost block is closed.  Popping on every closing marker would
    // leave the block-comment state while still inside an outer block.
    if (curr_lexer->block_comment_nesting_level == 0)
      curr_lexer->pop_start_state ();

    if (tok > 0)
      COUNT_TOK_AND_RETURN (tok);
  }
640
641 %{
642 // Body of a block comment.
643 %}
644
<BLOCK_COMMENT_START>.*{NL} {
    LEXER_DEBUG ("<BLOCK_COMMENT_START>.*{NL}");

    // Any other line inside a block comment is collected verbatim,
    // including its trailing newline.
    curr_lexer->comment_text += yytext;

    // The line has been consumed; continue at column 1 of the next one.
    curr_lexer->current_input_column = 1;
    curr_lexer->input_line_number++;
  }
652
653 %{
654 // Full-line or end-of-line comment.
655 %}
656
657 {S}*{CCHAR}.*{NL} {
// First sight of a line comment (optional blanks, a comment character,
// and the rest of the line through the newline).  Nothing is consumed
// here.
658 LEXER_DEBUG ("{S}*{CCHAR}.*{NL}");
659
// Enter LINE_COMMENT_START, then return the whole match to the input
// with yyless (0); presumably the text is rescanned by the
// <LINE_COMMENT_START> rules, which do the actual collection.
660 curr_lexer->push_start_state (LINE_COMMENT_START);
661 yyless (0);
662 }
663
<LINE_COMMENT_START>{S}*{CCHAR}.*{NL} {
    LEXER_DEBUG ("<LINE_COMMENT_START>{S}*{CCHAR}.*{NL}");

    // The comment is treated as a full-line comment when the column
    // counter is still 1 at the time it is matched.
    const bool starts_in_first_column
      = (curr_lexer->current_input_column == 1);

    curr_lexer->input_line_number++;
    curr_lexer->current_input_column = 1;

    // Step over the leading run of comment characters, spaces and tabs;
    // only the remainder of the line is stored.
    const size_t match_len = yyleng;
    size_t text_start = 0;

    for (; text_start < match_len; text_start++)
      {
        const char ch = yytext[text_start];

        if (! (ch == '#' || ch == '%' || ch == ' ' || ch == '\t'))
          break;
      }

    curr_lexer->comment_text += &yytext[text_start];

    int tok = 0;

    // A full-line comment stays in this start state so that following
    // comment lines can be added to the same text.  An end-of-line
    // comment is complete as soon as it has been read.
    if (! starts_in_first_column)
      {
        tok = curr_lexer->finish_comment (octave_comment_elt::end_of_line);

        curr_lexer->pop_start_state ();

        if (curr_lexer->start_state () == COMMAND_START)
          {
            // Return no token from here; push a newline back instead
            // so that the actions for the end of a COMMAND line are
            // executed next.
            tok = 0;
            curr_lexer->xunput ('\n');
          }
      }

    if (tok > 0)
      COUNT_TOK_AND_RETURN (tok);
  }
705
706 %{
707 // End of a block of full-line comments.
708 %}
709
710 <LINE_COMMENT_START>{ANY_INCLUDING_NL} {
// Matches a single character (newline included) while in
// LINE_COMMENT_START; the comment text collected so far is finished
// here as a full_line comment.
711 LEXER_DEBUG ("<LINE_COMMENT_START>{ANY_INCLUDING_NL}");
712
// Put the character back; it is only used to detect that the run of
// comment lines has ended.
713 curr_lexer->xunput (yytext[0]);
714
715 int tok = curr_lexer->finish_comment (octave_comment_elt::full_line);
716
// Leave LINE_COMMENT_START.
717 curr_lexer->pop_start_state ();
718
// finish_comment may return 0, in which case no token is returned from
// this action.
719 if (tok > 0)
720 COUNT_TOK_AND_RETURN (tok);
564 } 721 }
565 722
566 %{ 723 %{
567 // Imaginary numbers. 724 // Imaginary numbers.
568 %} 725 %}
750 curr_lexer->current_input_column++; 907 curr_lexer->current_input_column++;
751 int tok = curr_lexer->handle_string ('"'); 908 int tok = curr_lexer->handle_string ('"');
752 909
753 COUNT_TOK_AND_RETURN (tok); 910 COUNT_TOK_AND_RETURN (tok);
754 } 911 }
755
756 %{
757 // Gobble comments.
758 %}
759
760 {CCHAR} {
// Removed rule: on a comment character, hand comment collection over to
// octave_lexer::process_comment.
761 LEXER_DEBUG ("{CCHAR}");
762
763 curr_lexer->looking_for_object_index = false;
764
// Put the comment character back before process_comment is called.
765 curr_lexer->xunput (yytext[0]);
766
// false: collection does not start inside a block comment.
767 bool eof = false;
768 int tok = curr_lexer->process_comment (false, eof);
769
// End of input reached while reading the comment takes precedence over
// returning the separator token.
770 if (eof)
771 return curr_lexer->handle_end_of_input ();
772 else if (tok > 0)
773 COUNT_TOK_AND_RETURN (tok);
774 }
775
776 %{
777 // Block comments.
778 %}
779
780 ^{S}*{CCHAR}\{{S}*{NL} {
// Removed rule: a block-comment opening marker alone on its line.  The
// marker line is consumed here and the body is read by
// octave_lexer::process_comment.
781 LEXER_DEBUG ("^{S}*{CCHAR}\\{{S}*{NL}");
782
783 curr_lexer->looking_for_object_index = false;
784
// The match ends with a newline: advance to the next line and record
// that one more block comment is open.
785 curr_lexer->input_line_number++;
786 curr_lexer->current_input_column = 1;
787 curr_lexer->block_comment_nesting_level++;
788 curr_lexer->decrement_promptflag ();
789
// true: start inside a block comment.  Both the eof flag and the value
// returned by process_comment are ignored here.
790 bool eof = false;
791 curr_lexer->process_comment (true, eof);
792 }
793 912
794 %{ 913 %{
795 // Other operators. 914 // Other operators.
796 %} 915 %}
797 916
1342 defining_func = 0; 1461 defining_func = 0;
1343 looking_at_function_handle = 0; 1462 looking_at_function_handle = 0;
1344 block_comment_nesting_level = 0; 1463 block_comment_nesting_level = 0;
1345 token_count = 0; 1464 token_count = 0;
1346 current_input_line = ""; 1465 current_input_line = "";
1466 comment_text = "";
1347 help_text = ""; 1467 help_text = "";
1348 fcn_file_name = ""; 1468 fcn_file_name = "";
1349 fcn_file_full_name = ""; 1469 fcn_file_full_name = "";
1350 looking_at_object_index.clear (); 1470 looking_at_object_index.clear ();
1351 looking_at_object_index.push_front (false); 1471 looking_at_object_index.push_front (false);
1356 pending_local_variables.clear (); 1476 pending_local_variables.clear ();
1357 1477
1358 nesting_level.reset (); 1478 nesting_level.reset ();
1359 1479
1360 reset_token_stack (); 1480 reset_token_stack ();
1481 }
1482
// Return true if the comment text S, after any leading spaces and tabs,
// begins with "Copyright" or "Author".
//
// Returns false for an empty string and for a string that contains
// nothing but spaces and tabs.
static bool
looks_like_copyright (const std::string& s)
{
  // find_first_not_of returns npos when S is empty or all blanks.  That
  // value must not be used as a position: substr and compare throw
  // std::out_of_range for a position past the end of the string.
  const size_t offset = s.find_first_not_of (" \t");

  if (offset == std::string::npos)
    return false;

  // compare (pos, len, str) clamps LEN to what is left of S, so a string
  // shorter than the keyword simply compares unequal.
  return (s.compare (offset, 9, "Copyright") == 0
          || s.compare (offset, 6, "Author") == 0);
}
1497
1498 int
1499 lexical_feedback::finish_comment (octave_comment_elt::comment_type typ)
1500 {
1501 bool copyright = looks_like_copyright (comment_text);
1502
1503 if (nesting_level.none () && help_text.empty ()
1504 && ! comment_text.empty () && ! copyright)
1505 help_text = comment_text;
1506
1507 if (copyright)
1508 typ = octave_comment_elt::copyright;
1509
1510 octave_comment_buffer::append (comment_text, typ);
1511
1512 comment_text = "";
1513
1514 quote_is_transpose = false;
1515 convert_spaces_to_comma = true;
1516 at_beginning_of_statement = true;
1517
1518 if (nesting_level.none ())
1519 return '\n';
1520 else if (nesting_level.is_bracket_or_brace ())
1521 // FIXME -- this result will be different if the comment follows a
1522 // continuation token.
1523 return ';';
1524 else
1525 return 0;
1361 } 1526 }
1362 1527
1363 void 1528 void
1364 lexical_feedback::reset_token_stack (void) 1529 lexical_feedback::reset_token_stack (void)
1365 { 1530 {
1893 octave_lexer::is_variable (const std::string& name) 2058 octave_lexer::is_variable (const std::string& name)
1894 { 2059 {
1895 return (symbol_table::is_variable (name) 2060 return (symbol_table::is_variable (name)
1896 || (pending_local_variables.find (name) 2061 || (pending_local_variables.find (name)
1897 != pending_local_variables.end ())); 2062 != pending_local_variables.end ()));
1898 }
1899
// Removed function: read the body of a block comment character by
// character from READER and return the collected text.
//
// A '%' or '#' at the beginning of a line is held back (it is not added
// to the text) to see whether '{' or '}' follows.  Such a brace followed
// only by spaces/tabs and a newline is a block marker: '{' raises and
// '}' lowers block_comment_nesting_level.  When the level drops to 0
// the text returned by grab_comment_block is appended and the function
// returns.  EOF is set to true if the reader runs out of input.
1900 std::string
1901 octave_lexer::grab_block_comment (stream_reader& reader, bool& eof)
1902 {
1903 std::string buf;
1904
// at_bol: the next character read is the first one on its line.
// look_for_marker: the previous character was a '%' or '#' at the
// beginning of a line.
1905 bool at_bol = true;
1906 bool look_for_marker = false;
1907
// Limits the Matlab-incompatibility warning for '#' to one per call.
1908 bool warned_incompatible = false;
1909
1910 int c = 0;
1911
1912 while ((c = reader.getc ()) != EOF)
1913 {
1914 current_input_column++;
1915
1916 if (look_for_marker)
1917 {
1918 at_bol = false;
1919 look_for_marker = false;
1920
1921 if (c == '{' || c == '}')
1922 {
// tmp_buf holds the brace and any blanks after it, in case this turns
// out to be ordinary comment text rather than a marker.
1923 std::string tmp_buf (1, static_cast<char> (c));
1924
1925 int type = c;
1926
1927 bool done = false;
1928
// NOTE: getc is evaluated before "! done", so one more character is
// read after DONE has been set; that character is then handled by the
// code following this inner loop.
1929 while ((c = reader.getc ()) != EOF && ! done)
1930 {
1931 current_input_column++;
1932
1933 switch (c)
1934 {
1935 case ' ':
1936 case '\t':
1937 tmp_buf += static_cast<char> (c);
1938 break;
1939
1940 case '\n':
1941 {
// Only blanks followed the brace: this line is a block marker.
1942 current_input_column = 0;
1943 at_bol = true;
1944 done = true;
1945
1946 if (type == '{')
1947 {
1948 block_comment_nesting_level++;
1949 decrement_promptflag ();
1950 }
1951 else
1952 {
1953 block_comment_nesting_level--;
1954 increment_promptflag ();
1955
// The outermost block has been closed: append whatever
// grab_comment_block returns and finish.
1956 if (block_comment_nesting_level == 0)
1957 {
1958 buf += grab_comment_block (reader, true, eof);
1959
1960 return buf;
1961 }
1962 }
1963 }
1964 break;
1965
1966 default:
// Something other than blanks followed the brace, so this is not a
// marker; keep the brace and what followed it as comment text.
1967 at_bol = false;
1968 tmp_buf += static_cast<char> (c);
1969 buf += tmp_buf;
1970 done = true;
1971 break;
1972 }
1973 }
1974 }
1975 }
1976
// A comment character at the beginning of a line may start a marker;
// it is not added to BUF.
1977 if (at_bol && (c == '%' || c == '#'))
1978 {
1979 if (c == '#' && ! warned_incompatible)
1980 {
1981 warned_incompatible = true;
1982 maybe_gripe_matlab_incompatible_comment (c);
1983 }
1984
1985 at_bol = false;
1986 look_for_marker = true;
1987 }
1988 else
1989 {
1990 buf += static_cast<char> (c);
1991
1992 if (c == '\n')
1993 {
1994 current_input_column = 0;
1995 at_bol = true;
1996 }
1997 }
1998 }
1999
2000 if (c == EOF)
2001 eof = true;
2002
2003 return buf;
2004 }
2005
// Removed function: read line comments character by character from
// READER and return the collected text.
//
// Outside a comment, spaces and tabs are skipped and '%' or '#' starts
// a comment; any other character is pushed back to the reader and ends
// the scan.  Inside a comment, characters are collected up to and
// including the newline.  Directly after the comment character, further
// '%'/'#' characters are skipped, and a '{' followed only by blanks and
// a newline (with AT_BOL true) hands over to grab_block_comment.
//
// AT_BOL is the caller's initial value for the at-beginning-of-line
// flag.  EOF is set to true if the reader runs out of input.
2006 std::string
2007 octave_lexer::grab_comment_block (stream_reader& reader, bool at_bol,
2008 bool& eof)
2009 {
2010 std::string buf;
2011
2012 // TRUE means we are at the beginning of a comment block.
2013 bool begin_comment = false;
2014
2015 // TRUE means we are currently reading a comment block.
2016 bool in_comment = false;
2017
// Limits the Matlab-incompatibility warning for '#' to one per call.
2018 bool warned_incompatible = false;
2019
2020 int c = 0;
2021
2022 while ((c = reader.getc ()) != EOF)
2023 {
2024 current_input_column++;
2025
2026 if (begin_comment)
2027 {
// Additional comment characters right after the first one are dropped.
2028 if (c == '%' || c == '#')
2029 {
2030 at_bol = false;
2031 continue;
2032 }
2033 else if (at_bol && c == '{')
2034 {
// tmp_buf holds the brace and any blanks after it, in case this turns
// out to be ordinary comment text rather than a block opener.
2035 std::string tmp_buf (1, static_cast<char> (c));
2036
2037 bool done = false;
2038
// NOTE: getc is evaluated before "! done", so one more character is
// read after DONE has been set; that character is then handled by the
// code following this inner loop.
2039 while ((c = reader.getc ()) != EOF && ! done)
2040 {
2041 current_input_column++;
2042
2043 switch (c)
2044 {
2045 case ' ':
2046 case '\t':
2047 tmp_buf += static_cast<char> (c);
2048 break;
2049
2050 case '\n':
2051 {
// Only blanks followed the brace: a block comment starts here.
2052 current_input_column = 0;
2053 at_bol = true;
2054 done = true;
2055
2056 block_comment_nesting_level++;
2057 decrement_promptflag ();
2058
2059 buf += grab_block_comment (reader, eof);
2060
2061 in_comment = false;
2062
2063 if (eof)
2064 goto done;
2065 }
2066 break;
2067
2068 default:
// Not a block opener; keep the brace and what followed it as text.
2069 at_bol = false;
2070 tmp_buf += static_cast<char> (c);
2071 buf += tmp_buf;
2072 done = true;
2073 break;
2074 }
2075 }
2076 }
2077 else
2078 {
2079 at_bol = false;
2080 begin_comment = false;
2081 }
2082 }
2083
2084 if (in_comment)
2085 {
2086 buf += static_cast<char> (c);
2087
2088 if (c == '\n')
2089 {
2090 at_bol = true;
2091 current_input_column = 0;
2092 in_comment = false;
2093
2094 // FIXME -- bailing out here prevents things like
2095 //
2096 // octave> # comment
2097 // octave> x = 1
2098 //
2099 // from failing at the command line, while still
2100 // allowing blocks of comments to be grabbed properly
2101 // for function doc strings. But only the first line of
2102 // a multi-line doc string will be picked up for
2103 // functions defined on the command line. We need a
2104 // better way of collecting these comments...
2105 if (! (reading_fcn_file || reading_script_file))
2106 goto done;
2107 }
2108 }
2109 else
2110 {
2111 switch (c)
2112 {
// Blanks before a comment character are skipped.
2113 case ' ':
2114 case '\t':
2115 break;
2116
2117 case '#':
2118 if (! warned_incompatible)
2119 {
2120 warned_incompatible = true;
2121 maybe_gripe_matlab_incompatible_comment (c);
2122 }
2123 // fall through...
2124
2125 case '%':
2126 in_comment = true;
2127 begin_comment = true;
2128 break;
2129
2130 default:
// Anything else ends the run of comments; give the character back.
2131 current_input_column--;
2132 reader.ungetc (c);
2133 goto done;
2134 }
2135 }
2136 }
2137
2138 done:
2139
2140 if (c == EOF)
2141 eof = true;
2142
2143 return buf;
2144 }
2145
// Return true if the comment text S, after any leading spaces and tabs,
// begins with "Copyright" or "Author".
//
// Returns false for an empty string and for a string that contains
// nothing but spaces and tabs.
static bool
looks_like_copyright (const std::string& s)
{
  // find_first_not_of returns npos when S is empty or all blanks.  That
  // value must not be used as a position: substr and compare throw
  // std::out_of_range for a position past the end of the string.
  const size_t offset = s.find_first_not_of (" \t");

  if (offset == std::string::npos)
    return false;

  // compare (pos, len, str) clamps LEN to what is left of S, so a string
  // shorter than the keyword simply compares unequal.
  return (s.compare (offset, 9, "Copyright") == 0
          || s.compare (offset, 6, "Author") == 0);
}
2160
// Removed function: collect a comment through a flex_stream_reader,
// record it, and return the separator token for the parser.
//
// START_IN_BLOCK selects grab_block_comment (true) or
// grab_comment_block (false).  EOF is cleared on entry and then set by
// the grab function if input ran out.
//
// Returns '\n' when nesting_level.none () holds, ';' when
// nesting_level.is_bracket_or_brace () holds, and 0 otherwise.
2161 int
2162 octave_lexer::process_comment (bool start_in_block, bool& eof)
2163 {
2164 eof = false;
2165
2166 char *yytxt = flex_yytext ();
2167 flex_stream_reader flex_reader (this, yytxt);
2168
2169 // process_comment is only supposed to be called when we are not
2170 // initially looking at a block comment.
// NOTE(review): the comment above does not match the code, which also
// handles START_IN_BLOCK == true (see the call to grab_block_comment).
2171
2172 std::string txt = start_in_block
2173 ? grab_block_comment (flex_reader, eof)
2174 : grab_comment_block (flex_reader, false, eof);
2175
2176 if (lexer_debug_flag)
2177 std::cerr << "C: " << txt << std::endl;
2178
// The comment becomes the help text only if none has been recorded yet,
// the text is non-empty and does not look like a copyright notice, and
// nesting_level.none () holds.
2179 if (nesting_level.none () && help_text.empty () && ! txt.empty ()
2180 && ! looks_like_copyright (txt))
2181 help_text = txt;
2182
2183 octave_comment_buffer::append (txt);
2184
// Reset lexer state for the start of a new statement.
2185 current_input_column = 1;
2186 quote_is_transpose = false;
2187 convert_spaces_to_comma = true;
2188 at_beginning_of_statement = true;
2189
// A comment also ends a COMMAND_START state, if one is active.
2190 if (start_state () == COMMAND_START)
2191 pop_start_state ();
2192
2193 if (nesting_level.none ())
2194 return '\n';
2195 else if (nesting_level.is_bracket_or_brace ())
2196 return ';';
2197 else
2198 return 0;
2199 } 2063 }
2200 2064
2201 // Recognize separators. If the separator is a CRLF pair, it is 2065 // Recognize separators. If the separator is a CRLF pair, it is
2202 // replaced by a single LF. 2066 // replaced by a single LF.
2203 2067
3814 3678
3815 case INPUT_FILE_START: 3679 case INPUT_FILE_START:
3816 std::cerr << "INPUT_FILE_BEGIN" << std::endl; 3680 std::cerr << "INPUT_FILE_BEGIN" << std::endl;
3817 break; 3681 break;
3818 3682
3683 case BLOCK_COMMENT_START:
3684 std::cerr << "BLOCK_COMMENT_START" << std::endl;
3685 break;
3686
3687 case LINE_COMMENT_START:
3688 std::cerr << "LINE_COMMENT_START" << std::endl;
3689 break;
3690
3819 default: 3691 default:
3820 std::cerr << "UNKNOWN START STATE!" << std::endl; 3692 std::cerr << "UNKNOWN START STATE!" << std::endl;
3821 break; 3693 break;
3822 } 3694 }
3823 } 3695 }