Mercurial > hg > octave-lyh
diff libinterp/parse-tree/lex.ll @ 16898:531473481084
rewrite string parsing to avoid unlimited lookahead
* NEWS: Mention change.
* lex.h, lex.ll (lexical_feedback::string_text,
lexical_feedback::string_line, lexical_feedback::string_column):
New data members.
(lexical_feedback::lexical_feedback): Initialize them.
(lexical_feedback::reset): Initialize them.
(octave_base_lexer::begin_string): New function.
(\", "'", <COMMAND_START>[\"\']): Use begin_string to set start state
instead of calling handle_string to parse string.
(DQ_STRING_START, SQ_STRING_START): New exclusive start states.
(<DQ_STRING_START>\"\", <DQ_STRING_START>\", <DQ_STRING_START>{NL},
<DQ_STRING_START>\\[0-7]{1,3}, <DQ_STRING_START>"\\a",
<DQ_STRING_START>"\\b", <DQ_STRING_START>"\\f",
<DQ_STRING_START>"\\n", <DQ_STRING_START>"\\r",
<DQ_STRING_START>"\\t", <DQ_STRING_START>"\\v",
<DQ_STRING_START>\\{ANY_INCLUDING_NL}, <DQ_STRING_START>[^\\\n\"]+,
<SQ_STRING_START>[^\'\n\r]*\', <SQ_STRING_START>{NL}):
New rules for parsing character strings.
(octave_base_lexer::have_continuation,
octave_base_lexer::have_ellipsis_continuation,
octave_base_lexer::handle_string): Delete.
author | John W. Eaton <jwe@octave.org> |
---|---|
date | Thu, 04 Jul 2013 20:33:02 -0400 |
parents | f89de736eecd |
children | f21194531877 |
line wrap: on
line diff
--- a/libinterp/parse-tree/lex.ll +++ b/libinterp/parse-tree/lex.ll @@ -51,6 +51,9 @@ %x BLOCK_COMMENT_START %x LINE_COMMENT_START +%x DQ_STRING_START +%x SQ_STRING_START + %{ #include <cctype> @@ -290,9 +293,9 @@ curr_lexer->at_beginning_of_statement = false; curr_lexer->current_input_column++; - int tok = curr_lexer->handle_string (yytext[0]); - - return curr_lexer->count_token_internal (tok); + + curr_lexer->begin_string (yytext[0] == '"' + ? DQ_STRING_START : SQ_STRING_START); } <COMMAND_START>[^#% \t\r\n\;\,\"\'][^ \t\r\n\;\,]*{S}* { @@ -622,6 +625,106 @@ } %{ +// Double-quoted character strings. +%} + +<DQ_STRING_START>\"\" { + curr_lexer->current_input_column += yyleng; + curr_lexer->string_text += '"'; + } + +<DQ_STRING_START>\" { + + curr_lexer->pop_start_state (); + + curr_lexer->looking_for_object_index = true; + curr_lexer->at_beginning_of_statement = false; + + curr_lexer->push_token (new token (DQ_STRING, + curr_lexer->string_text, + curr_lexer->string_line, + curr_lexer->string_column)); + + curr_lexer->string_text = ""; + + return curr_lexer->count_token_internal (DQ_STRING); + } + +<DQ_STRING_START>{NL} { + error ("unterminated character string constant"); + return LEXICAL_ERROR; + } + +<DQ_STRING_START>\\[0-7]{1,3} { + int result; + sscanf (yytext+1, "%o", &result); + + if (result > 0xff) + error ("invalid octal escape sequence in character string"); + else + curr_lexer->string_text += static_cast<unsigned char> (result); + } + +<DQ_STRING_START>"\\a" { curr_lexer->string_text += '\a'; } +<DQ_STRING_START>"\\b" { curr_lexer->string_text += '\b'; } +<DQ_STRING_START>"\\f" { curr_lexer->string_text += '\f'; } +<DQ_STRING_START>"\\n" { curr_lexer->string_text += '\n'; } +<DQ_STRING_START>"\\r" { curr_lexer->string_text += '\r'; } +<DQ_STRING_START>"\\t" { curr_lexer->string_text += '\t'; } +<DQ_STRING_START>"\\v" { curr_lexer->string_text += '\v'; } + +<DQ_STRING_START>\\{ANY_INCLUDING_NL} { + curr_lexer->string_text += yytext[1]; + } + 
+<DQ_STRING_START>[^\\\n\"]+ { + curr_lexer->string_text += yytext; + } + +%{ +// Single-quoted character strings. +%} + +<SQ_STRING_START>[^\'\n\r]*\' { + yytext[yyleng-1] = 0; + curr_lexer->string_text += yytext; + + curr_lexer->current_input_column += yyleng; + + int c = curr_lexer->text_yyinput (); + + if (c == '\'') + { + curr_lexer->string_text += c; + + curr_lexer->current_input_column++; + } + else + { + curr_lexer->xunput (c); + + curr_lexer->pop_start_state (); + + curr_lexer->looking_for_object_index = true; + curr_lexer->at_beginning_of_statement = false; + + curr_lexer->push_token (new token (SQ_STRING, + curr_lexer->string_text, + curr_lexer->string_line, + curr_lexer->string_column)); + + curr_lexer->string_text = ""; + + return curr_lexer->count_token_internal (SQ_STRING); + } + } + +<SQ_STRING_START>{NL} { + error ("unterminated character string constant"); + return LEXICAL_ERROR; + } + +%{ // Imaginary numbers. %} @@ -867,14 +970,14 @@ if (curr_lexer->previous_token_may_be_command () && curr_lexer->space_follows_previous_token ()) { - yyless (0); + curr_lexer->current_input_column++; curr_lexer->push_start_state (COMMAND_START); + curr_lexer->begin_string (SQ_STRING_START); } else if (curr_lexer->at_beginning_of_statement) { curr_lexer->current_input_column++; - int retval = curr_lexer->handle_string ('\''); - return curr_lexer->count_token_internal (retval); + curr_lexer->begin_string (SQ_STRING_START); } else { @@ -888,8 +991,7 @@ || curr_lexer->previous_token_is_binop ()) { curr_lexer->current_input_column++; - int retval = curr_lexer->handle_string ('\''); - return curr_lexer->count_token_internal (retval); + curr_lexer->begin_string (SQ_STRING_START); } else { @@ -906,8 +1008,7 @@ || curr_lexer->previous_token_is_keyword ()) { curr_lexer->current_input_column++; - int retval = curr_lexer->handle_string ('\''); - return curr_lexer->count_token_internal (retval); + curr_lexer->begin_string (SQ_STRING_START); } else return 
curr_lexer->count_token (HERMITIAN); @@ -920,8 +1021,7 @@ || curr_lexer->previous_token_is_keyword ()) { curr_lexer->current_input_column++; - int retval = curr_lexer->handle_string ('\''); - return curr_lexer->count_token_internal (retval); + curr_lexer->begin_string (SQ_STRING_START); } else return curr_lexer->count_token (HERMITIAN); @@ -939,8 +1039,9 @@ if (curr_lexer->previous_token_may_be_command () && curr_lexer->space_follows_previous_token ()) { - yyless (0); + curr_lexer->current_input_column++; curr_lexer->push_start_state (COMMAND_START); + curr_lexer->begin_string (DQ_STRING_START); } else { @@ -954,8 +1055,7 @@ || curr_lexer->previous_token_is_binop ()) { curr_lexer->current_input_column++; - int retval = curr_lexer->handle_string ('"'); - return curr_lexer->count_token_internal (retval); + curr_lexer->begin_string (DQ_STRING_START); } else { @@ -968,15 +1068,13 @@ else { curr_lexer->current_input_column++; - int retval = curr_lexer->handle_string ('"'); - return curr_lexer->count_token_internal (retval); + curr_lexer->begin_string (DQ_STRING_START); } } else { curr_lexer->current_input_column++; - int retval = curr_lexer->handle_string ('"'); - return curr_lexer->count_token_internal (retval); + curr_lexer->begin_string (DQ_STRING_START); } } } @@ -1555,6 +1653,9 @@ current_input_line = ""; comment_text = ""; help_text = ""; + string_text = ""; + string_line = 0; + string_column = 0; fcn_file_name = ""; fcn_file_full_name = ""; looking_at_object_index.clear (); @@ -1769,6 +1870,15 @@ push_start_state (INPUT_FILE_START); } +void +octave_base_lexer::begin_string (int state) +{ + string_line = input_line_number; + string_column = current_input_column; + + push_start_state (state); +} + int octave_base_lexer::handle_end_of_input (void) { @@ -2257,218 +2367,6 @@ at_beginning_of_statement = true; } -// We have seen a backslash and need to find out if it should be -// treated as a continuation character. 
If so, this eats it, up to -// and including the new line character. -// -// Match whitespace only, followed by a comment character or newline. -// Once a comment character is found, discard all input until newline. -// If non-whitespace characters are found before comment -// characters, return 0. Otherwise, return 1. - -// FIXME -- we need to handle block comments here. - -bool -octave_base_lexer::have_continuation (bool trailing_comments_ok) -{ - std::ostringstream buf; - - std::string comment_buf; - - bool in_comment = false; - bool beginning_of_comment = false; - - int c = 0; - - while ((c = text_yyinput ()) != EOF) - { - buf << static_cast<char> (c); - - switch (c) - { - case ' ': - case '\t': - if (in_comment) - { - comment_buf += static_cast<char> (c); - beginning_of_comment = false; - } - break; - - case '%': - case '#': - if (trailing_comments_ok) - { - if (in_comment) - { - if (! beginning_of_comment) - comment_buf += static_cast<char> (c); - } - else - { - maybe_gripe_matlab_incompatible_comment (c); - in_comment = true; - beginning_of_comment = true; - } - } - else - goto cleanup; - break; - - case '\n': - if (in_comment) - { - comment_buf += static_cast<char> (c); - octave_comment_buffer::append (comment_buf); - } - current_input_column = 0; - decrement_promptflag (); - gripe_matlab_incompatible_continuation (); - return true; - - default: - if (in_comment) - { - comment_buf += static_cast<char> (c); - beginning_of_comment = false; - } - else - goto cleanup; - break; - } - } - - xunput (c); - return false; - -cleanup: - - std::string s = buf.str (); - - int len = s.length (); - while (len--) - xunput (s[len]); - - return false; -} - -// We have seen a '.' and need to see if it is the start of a -// continuation. If so, this eats it, up to and including the new -// line character. 
- -bool -octave_base_lexer::have_ellipsis_continuation (bool trailing_comments_ok) -{ - char c1 = text_yyinput (); - if (c1 == '.') - { - char c2 = text_yyinput (); - if (c2 == '.' && have_continuation (trailing_comments_ok)) - return true; - else - { - xunput (c2); - xunput (c1); - } - } - else - xunput (c1); - - return false; -} - -int -octave_base_lexer::handle_string (char delim) -{ - std::ostringstream buf; - - int bos_line = input_line_number; - int bos_col = current_input_column; - - int c; - int escape_pending = 0; - - while ((c = text_yyinput ()) != EOF) - { - current_input_column++; - - if (c == '\\') - { - if (delim == '\'' || escape_pending) - { - buf << static_cast<char> (c); - escape_pending = 0; - } - else - { - if (have_continuation (false)) - escape_pending = 0; - else - { - buf << static_cast<char> (c); - escape_pending = 1; - } - } - continue; - } - else if (c == '.') - { - if (delim == '\'' || ! have_ellipsis_continuation (false)) - buf << static_cast<char> (c); - } - else if (c == '\n') - { - error ("unterminated string constant"); - break; - } - else if (c == delim) - { - if (escape_pending) - buf << static_cast<char> (c); - else - { - c = text_yyinput (); - if (c == delim) - { - buf << static_cast<char> (c); - } - else - { - std::string s; - xunput (c); - - if (delim == '\'') - s = buf.str (); - else - s = do_string_escapes (buf.str ()); - - if (delim == '"') - gripe_matlab_incompatible ("\" used as string delimiter"); - else if (delim == '\'') - gripe_single_quote_string (); - - looking_for_object_index = true; - at_beginning_of_statement = false; - - int tok = delim == '"' ? DQ_STRING : SQ_STRING; - - push_token (new token (tok, s, bos_line, bos_col)); - - return tok; - } - } - } - else - { - buf << static_cast<char> (c); - } - - escape_pending = 0; - } - - return LEXICAL_ERROR; -} - int octave_base_lexer::handle_close_bracket (int bracket_type) {