Mercurial > hg > octave-lyh
changeset 16228:e19b1632d7c1
revamp most comment handling
* comment-list.h (octave_comment_elt::comment_type): New value,
full_line.
* lex.h (lexical_feedback::comment_text): New member variable.
(lexical_feedback::finish_comment): New function.
(octave_lexer::grab_block_comment, octave_lexer::grab_comment_block,
octave_lexer::process_comment): Delete.
* lex.ll (BLOCK_COMMENT_START, LINE_COMMENT_START): New exclusive
start states.
(ANY_INCLUDING_NL): New pattern.
(<INPUT_FILE_START>{ANY_INCLUDING_NL}): Use it instead of ".".
(^{S}*{CCHAR}\{{S}*{NL}, <BLOCK_COMMENT_START>^{S}*{CCHAR}\{{S}*{NL},
<BLOCK_COMMENT_START>^{S}*{CCHAR}\}{S}*{NL},
<BLOCK_COMMENT_START>.*{NL}, {S}*{CCHAR}.*{NL},
<LINE_COMMENT_START>{S}*{CCHAR}.*{NL},
<LINE_COMMENT_START>{ANY_INCLUDING_NL}):
New patterns and rules for handling comments.
({CCHAR}, ^{S}*{CCHAR}\{{S}*{NL}): Delete old rules for comments.
(display_start_state): Also handle BLOCK_COMMENT_START and
LINE_COMMENT_START.
author | John W. Eaton <jwe@octave.org> |
---|---|
date | Fri, 08 Mar 2013 17:13:54 -0500 |
parents | 054d9e8f99b6 |
children | 7b7b1e4968e8 |
files | libinterp/interp-core/comment-list.h libinterp/parse-tree/lex.h libinterp/parse-tree/lex.ll |
diffstat | 3 files changed, 222 insertions(+), 350 deletions(-) [+] |
line wrap: on
line diff
--- a/libinterp/interp-core/comment-list.h +++ b/libinterp/interp-core/comment-list.h @@ -42,6 +42,7 @@ { unknown, block, + full_line, end_of_line, doc_string, copyright
--- a/libinterp/parse-tree/lex.h +++ b/libinterp/parse-tree/lex.h @@ -27,6 +27,7 @@ #include <set> #include <stack> +#include "comment-list.h" #include "input.h" extern OCTINTERP_API void cleanup_parser (void); @@ -177,8 +178,8 @@ bracketflag (0), braceflag (0), looping (0), defining_func (0), looking_at_function_handle (0), block_comment_nesting_level (0), token_count (0), - current_input_line (), help_text (), fcn_file_name (), - fcn_file_full_name (), looking_at_object_index (), + current_input_line (), comment_text (), help_text (), + fcn_file_name (), fcn_file_full_name (), looking_at_object_index (), parsed_function_name (), pending_local_variables (), nesting_level (), token_stack () { @@ -191,6 +192,8 @@ void reset (void); + int finish_comment (octave_comment_elt::comment_type typ); + // true means that we have encountered eof on the input stream. bool end_of_input; @@ -291,6 +294,9 @@ // The current line of input. std::string current_input_line; + // The current comment text. + std::string comment_text; + // The current help text. std::string help_text; @@ -421,13 +427,6 @@ bool is_variable (const std::string& name); - std::string grab_block_comment (stream_reader& reader, bool& eof); - - std::string grab_comment_block (stream_reader& reader, bool at_bol, - bool& eof); - - int process_comment (bool start_in_block, bool& eof); - bool next_token_is_sep_op (void); bool next_token_is_postfix_unary_op (bool spc_prev);
--- a/libinterp/parse-tree/lex.ll +++ b/libinterp/parse-tree/lex.ll @@ -48,6 +48,9 @@ %x INPUT_FILE_START +%x BLOCK_COMMENT_START +%x LINE_COMMENT_START + %{ #include <cctype> @@ -252,6 +255,9 @@ IDENT ([_$a-zA-Z][_$a-zA-Z0-9]*) EXPON ([DdEe][+-]?{D}+) NUMBER (({D}+\.?{D}*{EXPON}?)|(\.{D}+{EXPON}?)|(0[xX][0-9a-fA-F]+)) + +ANY_INCLUDING_NL (.|{NL}) + %% %{ @@ -259,8 +265,8 @@ // the parser go down a special path. %} -<INPUT_FILE_START>. { - LEXER_DEBUG ("<INPUT_FILE_START>."); +<INPUT_FILE_START>{ANY_INCLUDING_NL} { + LEXER_DEBUG ("<INPUT_FILE_START>{ANY_INCLUDING_NL}"); curr_lexer->xunput (yytext[0]); @@ -564,6 +570,157 @@ } %{ +// Gobble comments. +%} + +%{ +// Start of a block comment. If the comment marker appears immediately +// after a block of full-line comments, finish the full line comment +// block. +%} + +^{S}*{CCHAR}\{{S}*{NL} { + LEXER_DEBUG ("^{S}*{CCHAR}\{{S}*{NL}"); + + int tok = 0; + + if (curr_lexer->start_state () == LINE_COMMENT_START) + { + if (! curr_lexer->comment_text.empty ()) + tok = curr_lexer->finish_comment (octave_comment_elt::full_line); + + curr_lexer->pop_start_state (); + } + + curr_lexer->push_start_state (BLOCK_COMMENT_START); + + yyless (0); + + if (tok > 0) + COUNT_TOK_AND_RETURN (tok); + } + +<BLOCK_COMMENT_START>^{S}*{CCHAR}\{{S}*{NL} { + LEXER_DEBUG ("<BLOCK_COMMENT_START>^{S}*{CCHAR}\{{S}*{NL}"); + + curr_lexer->input_line_number++; + curr_lexer->current_input_column = 1; + + if (curr_lexer->block_comment_nesting_level) + curr_lexer->comment_text = "\n"; + + curr_lexer->block_comment_nesting_level++; + } + +%{ +// End of a block comment. If this block comment is nested inside +// another, wait for the outermost block comment block to be closed +// before storing the comment. +%} + +<BLOCK_COMMENT_START>^{S}*{CCHAR}\}{S}*{NL} { + LEXER_DEBUG ("<BLOCK_COMMENT_START>^{S}*{CCHAR}\}{S}*{NL}"); + + curr_lexer->input_line_number++; + curr_lexer->current_input_column = 1; + + int tok = 0; + + if (curr_lexer->block_comment_nesting_level > 1) + curr_lexer->comment_text = "\n"; + else + tok = curr_lexer->finish_comment (octave_comment_elt::block); + + curr_lexer->block_comment_nesting_level--; + curr_lexer->pop_start_state (); + + if (tok > 0) + COUNT_TOK_AND_RETURN (tok); + } + +%{ +// Body of a block comment. +%} + +<BLOCK_COMMENT_START>.*{NL} { + LEXER_DEBUG ("<BLOCK_COMMENT_START>.*{NL}"); + + curr_lexer->input_line_number++; + curr_lexer->current_input_column = 1; + curr_lexer->comment_text += yytext; + } + +%{ +// Full-line or end-of-line comment. +%} + +{S}*{CCHAR}.*{NL} { + LEXER_DEBUG ("{S}*{CCHAR}.*{NL}"); + + curr_lexer->push_start_state (LINE_COMMENT_START); + yyless (0); + } + +<LINE_COMMENT_START>{S}*{CCHAR}.*{NL} { + LEXER_DEBUG ("<LINE_COMMENT_START>{S}*{CCHAR}.*{NL}"); + + bool full_line_comment = curr_lexer->current_input_column == 1; + curr_lexer->input_line_number++; + curr_lexer->current_input_column = 1; + + size_t len = yyleng; + size_t i = 0; + while (i < len) + { + char c = yytext[i]; + if (c == '#' || c == '%' || c == ' ' || c == '\t') + i++; + else + break; + } + + curr_lexer->comment_text += &yytext[i]; + + int tok = 0; + + if (! full_line_comment) + { + tok = curr_lexer->finish_comment (octave_comment_elt::end_of_line); + + curr_lexer->pop_start_state (); + + if (curr_lexer->start_state () == COMMAND_START) + { + // Allow the actions for the end of a COMMAND line to be + // executed next. + + tok = 0; + curr_lexer->xunput ('\n'); + } + } + + if (tok > 0) + COUNT_TOK_AND_RETURN (tok); + } + +%{ +// End of a block of full-line comments. +%} + +<LINE_COMMENT_START>{ANY_INCLUDING_NL} { + LEXER_DEBUG ("<LINE_COMMENT_START>{ANY_INCLUDING_NL}"); + + curr_lexer->xunput (yytext[0]); + + int tok = curr_lexer->finish_comment (octave_comment_elt::full_line); + + curr_lexer->pop_start_state (); + + if (tok > 0) + COUNT_TOK_AND_RETURN (tok); + } + +%{ // Imaginary numbers. %} @@ -754,44 +911,6 @@ } %{ -// Gobble comments. -%} - -{CCHAR} { - LEXER_DEBUG ("{CCHAR}"); - - curr_lexer->looking_for_object_index = false; - - curr_lexer->xunput (yytext[0]); - - bool eof = false; - int tok = curr_lexer->process_comment (false, eof); - - if (eof) - return curr_lexer->handle_end_of_input (); - else if (tok > 0) - COUNT_TOK_AND_RETURN (tok); - } - -%{ -// Block comments. -%} - -^{S}*{CCHAR}\{{S}*{NL} { - LEXER_DEBUG ("^{S}*{CCHAR}\\{{S}*{NL}"); - - curr_lexer->looking_for_object_index = false; - - curr_lexer->input_line_number++; - curr_lexer->current_input_column = 1; - curr_lexer->block_comment_nesting_level++; - curr_lexer->decrement_promptflag (); - - bool eof = false; - curr_lexer->process_comment (true, eof); - } - -%{ // Other operators. %} @@ -1344,6 +1463,7 @@ block_comment_nesting_level = 0; token_count = 0; current_input_line = ""; + comment_text = ""; help_text = ""; fcn_file_name = ""; fcn_file_full_name = ""; @@ -1360,6 +1480,51 @@ reset_token_stack (); } +static bool +looks_like_copyright (const std::string& s) +{ + bool retval = false; + + if (! s.empty ()) + { + size_t offset = s.find_first_not_of (" \t"); + + retval = (s.substr (offset, 9) == "Copyright" || s.substr (offset, 6) == "Author"); + } + + return retval; +} + +int +lexical_feedback::finish_comment (octave_comment_elt::comment_type typ) +{ + bool copyright = looks_like_copyright (comment_text); + + if (nesting_level.none () && help_text.empty () + && ! comment_text.empty () && ! copyright) + help_text = comment_text; + + if (copyright) + typ = octave_comment_elt::copyright; + + octave_comment_buffer::append (comment_text, typ); + + comment_text = ""; + + quote_is_transpose = false; + convert_spaces_to_comma = true; + at_beginning_of_statement = true; + + if (nesting_level.none ()) + return '\n'; + else if (nesting_level.is_bracket_or_brace ()) + // FIXME -- this result will be different if the comment follows a + // continuation token. + return ';'; + else + return 0; +} + void lexical_feedback::reset_token_stack (void) { @@ -1897,307 +2062,6 @@ != pending_local_variables.end ())); } -std::string -octave_lexer::grab_block_comment (stream_reader& reader, bool& eof) -{ - std::string buf; - - bool at_bol = true; - bool look_for_marker = false; - - bool warned_incompatible = false; - - int c = 0; - - while ((c = reader.getc ()) != EOF) - { - current_input_column++; - - if (look_for_marker) - { - at_bol = false; - look_for_marker = false; - - if (c == '{' || c == '}') - { - std::string tmp_buf (1, static_cast<char> (c)); - - int type = c; - - bool done = false; - - while ((c = reader.getc ()) != EOF && ! done) - { - current_input_column++; - - switch (c) - { - case ' ': - case '\t': - tmp_buf += static_cast<char> (c); - break; - - case '\n': - { - current_input_column = 0; - at_bol = true; - done = true; - - if (type == '{') - { - block_comment_nesting_level++; - decrement_promptflag (); - } - else - { - block_comment_nesting_level--; - increment_promptflag (); - - if (block_comment_nesting_level == 0) - { - buf += grab_comment_block (reader, true, eof); - - return buf; - } - } - } - break; - - default: - at_bol = false; - tmp_buf += static_cast<char> (c); - buf += tmp_buf; - done = true; - break; - } - } - } - } - - if (at_bol && (c == '%' || c == '#')) - { - if (c == '#' && ! warned_incompatible) - { - warned_incompatible = true; - maybe_gripe_matlab_incompatible_comment (c); - } - - at_bol = false; - look_for_marker = true; - } - else - { - buf += static_cast<char> (c); - - if (c == '\n') - { - current_input_column = 0; - at_bol = true; - } - } - } - - if (c == EOF) - eof = true; - - return buf; -} - -std::string -octave_lexer::grab_comment_block (stream_reader& reader, bool at_bol, - bool& eof) -{ - std::string buf; - - // TRUE means we are at the beginning of a comment block. - bool begin_comment = false; - - // TRUE means we are currently reading a comment block. - bool in_comment = false; - - bool warned_incompatible = false; - - int c = 0; - - while ((c = reader.getc ()) != EOF) - { - current_input_column++; - - if (begin_comment) - { - if (c == '%' || c == '#') - { - at_bol = false; - continue; - } - else if (at_bol && c == '{') - { - std::string tmp_buf (1, static_cast<char> (c)); - - bool done = false; - - while ((c = reader.getc ()) != EOF && ! done) - { - current_input_column++; - - switch (c) - { - case ' ': - case '\t': - tmp_buf += static_cast<char> (c); - break; - - case '\n': - { - current_input_column = 0; - at_bol = true; - done = true; - - block_comment_nesting_level++; - decrement_promptflag (); - - buf += grab_block_comment (reader, eof); - - in_comment = false; - - if (eof) - goto done; - } - break; - - default: - at_bol = false; - tmp_buf += static_cast<char> (c); - buf += tmp_buf; - done = true; - break; - } - } - } - else - { - at_bol = false; - begin_comment = false; - } - } - - if (in_comment) - { - buf += static_cast<char> (c); - - if (c == '\n') - { - at_bol = true; - current_input_column = 0; - in_comment = false; - - // FIXME -- bailing out here prevents things like - // - // octave> # comment - // octave> x = 1 - // - // from failing at the command line, while still - // allowing blocks of comments to be grabbed properly - // for function doc strings. But only the first line of - // a mult-line doc string will be picked up for - // functions defined on the command line. We need a - // better way of collecting these comments... - if (! (reading_fcn_file || reading_script_file)) - goto done; - } - } - else - { - switch (c) - { - case ' ': - case '\t': - break; - - case '#': - if (! warned_incompatible) - { - warned_incompatible = true; - maybe_gripe_matlab_incompatible_comment (c); - } - // fall through... - - case '%': - in_comment = true; - begin_comment = true; - break; - - default: - current_input_column--; - reader.ungetc (c); - goto done; - } - } - } - - done: - - if (c == EOF) - eof = true; - - return buf; -} - -static bool -looks_like_copyright (const std::string& s) -{ - bool retval = false; - - if (! s.empty ()) - { - size_t offset = s.find_first_not_of (" \t"); - - retval = (s.substr (offset, 9) == "Copyright" || s.substr (offset, 6) == "Author"); - } - - return retval; -} - -int -octave_lexer::process_comment (bool start_in_block, bool& eof) -{ - eof = false; - - char *yytxt = flex_yytext (); - flex_stream_reader flex_reader (this, yytxt); - - // process_comment is only supposed to be called when we are not - // initially looking at a block comment. - - std::string txt = start_in_block - ? grab_block_comment (flex_reader, eof) - : grab_comment_block (flex_reader, false, eof); - - if (lexer_debug_flag) - std::cerr << "C: " << txt << std::endl; - - if (nesting_level.none () && help_text.empty () && ! txt.empty () - && ! looks_like_copyright (txt)) - help_text = txt; - - octave_comment_buffer::append (txt); - - current_input_column = 1; - quote_is_transpose = false; - convert_spaces_to_comma = true; - at_beginning_of_statement = true; - - if (start_state () == COMMAND_START) - pop_start_state (); - - if (nesting_level.none ()) - return '\n'; - else if (nesting_level.is_bracket_or_brace ()) - return ';'; - else - return 0; -} - // Recognize separators. If the separator is a CRLF pair, it is // replaced by a single LF. @@ -3816,6 +3680,14 @@ std::cerr << "INPUT_FILE_BEGIN" << std::endl; break; + case BLOCK_COMMENT_START: + std::cerr << "BLOCK_COMMENT_START" << std::endl; + break; + + case LINE_COMMENT_START: + std::cerr << "LINE_COMMENT_START" << std::endl; + break; + default: std::cerr << "UNKNOWN START STATE!" << std::endl; break;