changeset 17335:fe6518a1d87c

Move TeX symbol decoding into the lexer (bug #39831). * libinterp/Makefile.am (BUILT_SOURCES): Add corefcn/oct-tex-symbols.cc. (BUILT_DISTFILES): Add corefcn/oct-tex-lexer.ll and corefcn/oct-tex-symbols.cc. (EXTRA_DIST): Add corefcn/oct-tex-lexer.in.ll and corefcn/oct-tex-symbols.in. (ULT_DIST_SRC): Filter out corefcn/oct-tex-lexer.ll from DIST_SRC and add corefcn/oct-tex-lexer.in.ll instead. (CLEAN_FILES): Add corefcn/oct-tex-parser.output. * libinterp/corefcn/modules.mk (corefcn/oct-tex-lexer.ll, corefcn/oct-tex-symbols.cc): New rules to build the TeX lexer. (corefcn/txt-eng.cc): Add dependency on corefcn/oct-tex-symbols.cc. * libinterp/corefcn/oct-tex-lexer.in.ll: Renamed from oct-tex.lexer.ll. Remove COMMAND state. Remove ID regex. Replace rules for symbols with tag @SYMBOL_RULES@. * libinterp/corefcn/oct-tex-parser.yy (ID, CMD, identifier): Remove tokens. (SYM, sym): New token and value. (symbol_element): Build from SYM. * libinterp/corefcn/oct-tex-symbols.in: New file with supported TeX symbols and corresponding codes (unicode and MS symbols). * libinterp/corefcn/txt-eng.h (class text_element_symbol): Make it inherit from text_element. (text_element_symbol::code): Removed member. (text_element_symbol::symbol): New member. (text_element_symbol::text_element_symbol): Adapt constructor. (text_element_symbol::get_symbol): New method. (text_element_symbol::get_symbol_code): Make const. * libinterp/corefcn/txt-eng.cc (symbol_names, symbol_codes): Remove static variables, now auto-generated from oct-tex-symbols.in. (oct-tex-symbols.cc): New include. (text_element_symbol::get_symbol_code): Change implementation to simply index into auto-generated symbol_codes array. * libinterp/corefcn/txt-eng-ft.cc (ft_render::visit(text_element_symbol)): Don't use text_element_symbol::string_value(), use text_element_symbol::get_symbol() instead.
author Michael Goffioul <michael.goffioul@gmail.com>
date Sat, 24 Aug 2013 14:27:09 -0400
parents f444e4cef9b9
children 4daf633100db
files libinterp/Makefile.am libinterp/corefcn/module.mk libinterp/corefcn/oct-tex-lexer.in.ll libinterp/corefcn/oct-tex-lexer.ll libinterp/corefcn/oct-tex-parser.yy libinterp/corefcn/oct-tex-symbols.in libinterp/corefcn/txt-eng-ft.cc libinterp/corefcn/txt-eng.cc libinterp/corefcn/txt-eng.h
diffstat 8 files changed, 151 insertions(+), 267 deletions(-) [+]
line wrap: on
line diff
--- a/libinterp/Makefile.am
+++ b/libinterp/Makefile.am
@@ -53,6 +53,7 @@
   corefcn/graphics.h \
   corefcn/oct-tex-lexer.cc \
   corefcn/oct-tex-parser.cc \
+  corefcn/oct-tex-symbols.cc \
   operators/ops.cc \
   parse-tree/lex.cc \
   parse-tree/oct-gperf.h \
@@ -64,7 +65,9 @@
   builtins.cc
 
 BUILT_DISTFILES = \
+  corefcn/oct-tex-lexer.ll \
   corefcn/oct-tex-parser.h \
+  corefcn/oct-tex-symbols.cc \
   parse-tree/oct-gperf.h \
   parse-tree/oct-parse.h \
   parse-tree/oct-parse.yy
@@ -90,6 +93,8 @@
   Makefile.in \
   DOCSTRINGS \
   config-features.sh \
+  corefcn/oct-tex-lexer.in.ll \
+  corefcn/oct-tex-symbols.in \
   find-defun-files.sh \
   gendoc.pl \
   genprops.awk \
@@ -197,7 +202,7 @@
 ## Section for defining and creating DEF_FILES
 
 ULT_DIST_SRC := \
-  $(filter-out parse-tree/oct-parse.yy, $(DIST_SRC)) parse-tree/oct-parse.in.yy
+  $(filter-out corefcn/oct-tex-lexer.ll parse-tree/oct-parse.yy, $(DIST_SRC)) corefcn/oct-tex-lexer.in.ll parse-tree/oct-parse.in.yy
 
 SRC_DEF_FILES := $(shell $(srcdir)/find-defun-files.sh "$(srcdir)" $(ULT_DIST_SRC))
 
@@ -360,6 +365,7 @@
 CLEANFILES = \
   $(DLDFCN_PKG_ADD_FILE) \
   corefcn/graphics-props.cc \
+  corefcn/oct-tex-parser.output \
   parse-tree/oct-parse.output
 
 DISTCLEANFILES = \
--- a/libinterp/corefcn/module.mk
+++ b/libinterp/corefcn/module.mk
@@ -300,3 +300,13 @@
 
 corefcn/oct-tex-lexer.cc: LEX_OUTPUT_ROOT := lex.octave_tex_
 corefcn/oct-tex-parser.h: corefcn/oct-tex-parser.yy
+
+corefcn/oct-tex-lexer.ll: corefcn/oct-tex-lexer.in.ll corefcn/oct-tex-symbols.in Makefile
+	$(AWK) 'BEGIN { print "/* DO NOT EDIT. AUTOMATICALLY GENERATED FROM oct-tex-lexer.in.ll and oct-tex-symbols.in. */"; } /^@SYMBOL_RULES@$$/ { count = 0; while (getline < "$(srcdir)/corefcn/oct-tex-symbols.in") { if ($$0 !~ /^#.*/ && NF == 3) { printf("\"\\\\%s\" { yylval->sym = %d; return SYM; }\n", $$1, count); count++; } } getline } ! /^@SYMBOL_RULES@$$/ { print }' $< > $@-t
+	mv $@-t $@
+
+corefcn/oct-tex-symbols.cc: corefcn/oct-tex-symbols.in Makefile
+	$(AWK) 'BEGIN { print "// DO NOT EDIT. AUTOMATICALLY GENERATED FROM oct-tex-symbols.in."; print "static uint32_t symbol_codes[][2] = {"; count = 0; } END { print "};"; printf("static int num_symbol_codes = %d;\n", count); } /^#/ { } { if (NF == 3) { printf("  { %s, %s },\n", $$2, $$3); count++; } }' $< > $@-t
+	mv $@-t $@
+
+corefcn/txt-eng.cc: corefcn/oct-tex-symbols.cc
rename from libinterp/corefcn/oct-tex-lexer.ll
rename to libinterp/corefcn/oct-tex-lexer.in.ll
--- a/libinterp/corefcn/oct-tex-lexer.ll
+++ b/libinterp/corefcn/oct-tex-lexer.in.ll
@@ -36,10 +36,8 @@
 
 %x	NUM_MODE
 %x	MAYBE_NUM_MODE
-%x	COMMAND
 
 D       [0-9]
-ID	[a-zA-Z0-9]
 NUM	(({D}+\.?{D}*)|(\.{D}+))
 
 %%
@@ -98,9 +96,7 @@
 // Symbols
 %}
 
-"\\"		{ BEGIN(COMMAND); return CMD; }
-<COMMAND>{ID}	{ yylval->ch = yytext[0]; return ID; }
-<COMMAND>"\n"|.	{ BEGIN(INITIAL); yyless (0); }
+@SYMBOL_RULES@
 
 %{
 // Generic character
--- a/libinterp/corefcn/oct-tex-parser.yy
+++ b/libinterp/corefcn/oct-tex-parser.yy
@@ -47,6 +47,7 @@
   /* Leaf symbols produced by the scanner */
   char                       ch;
   double                     num;
+  int                        sym;
 
   /* Used for string buffering */
   std::string*               str;
@@ -56,21 +57,22 @@
   text_element_list*         e_list;
 }
 
-%token BF IT SL RM CMD
+%token BF IT SL RM
 %token FONTNAME FONTSIZE
 %token COLOR COLOR_RGB
 %token START END SUPER SUB
-%token<ch> CH ID
+%token<ch> CH
 %token<num> NUM
+%token<sym> SYM
 
-%type<str> simple_string identifier
+%type<str> simple_string
 %type<e_base> string_element symbol_element
 %type<e_base> superscript_element subscript_element combined_script_element
 %type<e_base> font_modifier_element fontname_element fontsize_element color_element
 %type<e_list> string_element_list scoped_string_element_list
 
 /* Make sure there's no memory leak on parse error. */
-%destructor { } <ch> <num>
+%destructor { } <ch> <num> <sym>
 %destructor { delete $$; } <*>
 
 %nonassoc SCRIPT
@@ -89,17 +91,8 @@
 				  { $1->append (1, $2); $$ = $1; }
 				;
 
-identifier			: ID
-				  { $$ = new std::string (1, $1); }
-				| identifier ID
-				  { $1->append (1, $2); $$ = $1; }
-				;
-
-symbol_element			: CMD identifier
-				  {
-				    $$ = new text_element_symbol (*$2);
-				    delete $2;
-				  }
+symbol_element			: SYM
+				  { $$ = new text_element_symbol ($1); }
 				;
 
 font_modifier_element		: BF
new file mode 100644
--- /dev/null
+++ b/libinterp/corefcn/oct-tex-symbols.in
@@ -0,0 +1,110 @@
+# List of supported symbols for the TeX interpreter
+# (http://www.mathworks.com/help/matlab/ref/text_props.html):
+# - symbol name
+# - Unicode code
+# - MS symbol code (http://www.kostis.net/charsets/symbol.htm)
+
+alpha           0x03B1  0xF061
+angle           0x2220  0xF0D0
+ast             0x2217  0xF02A
+beta            0x03B2  0xF062
+gamma           0x03B3  0xF067
+delta           0x03B4  0xF064
+epsilon         0x03B5  0xF065
+zeta            0x03B6  0xF07A
+eta             0x03B7  0xF068
+theta           0x03B8  0xF071
+vartheta        0x03D1  0xF04A
+iota            0x03B9  0xF069
+kappa           0x03BA  0xF06B
+lambda          0x03BB  0xF06C
+mu              0x03BC  0xF06D
+nu              0x03BD  0xF06E
+xi              0x03BE  0xF078
+pi              0x03C0  0xF070
+rho             0x03C1  0xF072
+sigma           0x03C3  0xF073
+varsigma        0x03C2  0xF056
+tau             0x03C4  0xF074
+equiv           0x2261  0xF0BA
+Im              0x2111  0xF0C1
+otimes          0x2297  0xF0C4
+cap             0x2229  0xF0C7
+supset          0x2283  0xF0C9
+int             0x222B  0xF0F2
+rfloor          0x230B  0xF0FB
+lfloor          0x230A  0xF0EB
+perp            0x22A5  0xF05E
+wedge           0x2227  0xF0D9
+rceil           0x2309  0xF0F9
+vee             0x2228  0xF0DA
+langle          0x27E8  0xF0E1
+
+upsilon         0x03C5  0xF075
+phi             0x03C6  0xF066
+chi             0x03C7  0xF063
+psi             0x03C8  0xF079
+omega           0x03C9  0xF077
+Gamma           0x0393  0xF047
+Delta           0x0394  0xF044
+Theta           0x0398  0xF051
+Lambda          0x039B  0xF04C
+Xi              0x039E  0xF058
+Pi              0x03A0  0xF050
+Sigma           0x03A3  0xF053
+Upsilon         0x03D2  0xF055
+Phi             0x03A6  0xF046
+Psi             0x03A8  0xF059
+Omega           0x03A9  0xF057
+forall          0x2200  0xF022
+exists          0x2203  0xF024
+ni              0x220B  0xF027
+cong            0x2245  0xF040
+approx          0x2248  0xF0BB
+Re              0x211C  0xF0C2
+oplus           0x2295  0xF0C5
+cup             0x222A  0xF0C8
+subseteq        0x2286  0xF0CD
+in              0x2208  0xF0CE
+lceil           0x2308  0xF0E9
+cdot            0x22C5  0xF0D7
+neg             0x00AC  0xF0D8
+times           0x00D7  0xF0B4
+surd            0x221A  0xF0D6
+varpi           0x03D6  0xF076
+rangle          0x27E9  0xF0F1
+
+sim             0x223C  0xF07E
+leq             0x2264  0xF0A3
+infty           0x221E  0xF0A5
+clubsuit        0x2663  0xF0A7
+diamondsuit     0x2666  0xF0A8
+heartsuit       0x2665  0xF0A9
+spadesuit       0x2660  0xF0AA
+leftrightarrow  0x2194  0xF0AB
+leftarrow       0x2190  0xF0AC
+Leftarrow       0x21D0  0xF0DC
+uparrow         0x2191  0xF0AD
+rightarrow      0x2192  0xF0AE
+Rightarrow      0x21D2  0xF0DE
+downarrow       0x2193  0xF0AF
+circ            0x25CB  0xF0B0
+pm              0x00B1  0xF0B1
+geq             0x2265  0xF0B3
+propto          0x221D  0xF0B5
+partial         0x2202  0xF0B6
+bullet          0x2022  0xF0B7
+div             0x00F7  0xF0B8
+neq             0x2260  0xF0B9
+aleph           0x2135  0xF0C0
+wp              0x2118  0xF0C3
+oslash          0x2298  0xF0C6
+supseteq        0x2287  0xF0CA
+subset          0x2282  0xF0CC
+o               0x03BF  0xF0B0
+nabla           0x2207  0xF0D1
+ldots           0x2026  0xF0BC
+prime           0x2032  0xF0A2
+0               0x2205  0xF0C6
+mid             0x2223  0xF0BD
+copyright       0x00A9  0xF0E3
--- a/libinterp/corefcn/txt-eng-ft.cc
+++ b/libinterp/corefcn/txt-eng-ft.cc
@@ -782,7 +782,7 @@
   if (code != text_element_symbol::invalid_code && font.is_valid ())
     process_character (code);
   else if (font.is_valid ())
-    ::warning ("ignoring unknown symbol: %s", e.string_value ().c_str ());
+    ::warning ("ignoring unknown symbol: %d", e.get_symbol ());
 }
 
 void
--- a/libinterp/corefcn/txt-eng.cc
+++ b/libinterp/corefcn/txt-eng.cc
@@ -25,248 +25,15 @@
 #endif
 
 #include "txt-eng.h"
-
-static const char* symbol_names[] = {
-  "alpha",
-  "angle",
-  "ast",
-  "beta",
-  "gamma",
-  "delta",
-  "epsilon",
-  "zeta",
-  "eta",
-  "theta",
-  "vartheta",
-  "iota",
-  "kappa",
-  "lambda",
-  "mu",
-  "nu",
-  "xi",
-  "pi",
-  "rho",
-  "sigma",
-  "varsigma",
-  "tau",
-  "equiv",
-  "Im",
-  "otimes",
-  "cap",
-  "supset",
-  "int",
-  "rfloor",
-  "lfloor",
-  "perp",
-  "wedge",
-  "rceil",
-  "vee",
-  "langle",
-
-  "upsilon",
-  "phi",
-  "chi",
-  "psi",
-  "omega",
-  "Gamma",
-  "Delta",
-  "Theta",
-  "Lambda",
-  "Xi",
-  "Pi",
-  "Sigma",
-  "Upsilon",
-  "Phi",
-  "Psi",
-  "Omega",
-  "forall",
-  "exists",
-  "ni",
-  "cong",
-  "approx",
-  "Re",
-  "oplus",
-  "cup",
-  "subseteq",
-  "in",
-  "lceil",
-  "cdot",
-  "neg",
-  "times",
-  "surd",
-  "varpi",
-  "rangle",
-
-  "sim",
-  "leq",
-  "infty",
-  "clubsuit",
-  "diamondsuit",
-  "heartsuit",
-  "spadesuit",
-  "leftrightarrow",
-  "leftarrow",
-  "Leftarrow",
-  "uparrow",
-  "rightarrow",
-  "Rightarrow",
-  "downarrow",
-  "circ",
-  "pm",
-  "geq",
-  "propto",
-  "partial",
-  "bullet",
-  "div",
-  "neq",
-  "aleph",
-  "wp",
-  "oslash",
-  "supseteq",
-  "subset",
-  "o",
-  "nabla",
-  "ldots",
-  "prime",
-  "0",
-  "mid",
-  "copyright",
-
-  0
-};
+#include "oct-tex-symbols.cc"
 
-// Maps the symbol names (using index from symbol_names array) to
-// character codes, using 2 mapping:
-// - Unicode
-// - MS symbol (using Private Use Area)
-static uint32_t symbol_codes[][2] = {
-  { 0x03B1, 0xF061 },   // alpha
-  { 0x2220, 0xF0D0 },   // angle
-  { 0x2217, 0xF02A },   // ast
-  { 0x03B2, 0xF062 },   // beta
-  { 0x03B3, 0xF067 },   // gamma
-  { 0x03B4, 0xF064 },   // delta
-  { 0x03B5, 0xF065 },   // epsilon
-  { 0x03B6, 0xF07A },   // zeta
-  { 0x03B7, 0xF068 },   // eta
-  { 0x03B8, 0xF071 },   // theta
-  { 0x03D1, 0xF04A },   // vartheta
-  { 0x03B9, 0xF069 },   // iota
-  { 0x03BA, 0xF06B },   // kappa
-  { 0x03BB, 0xF06C },   // lambda
-  { 0x03BC, 0xF06D },   // mu
-  { 0x03BD, 0xF06E },   // nu
-  { 0x03BE, 0xF078 },   // xi
-  { 0x03C0, 0xF070 },   // pi
-  { 0x03C1, 0xF072 },   // rho
-  { 0x03C3, 0xF073 },   // sigma
-  { 0x03C2, 0xF056 },   // varsigma
-  { 0x03C4, 0xF074 },   // tau
-  { 0x2261, 0xF0BA },   // equiv
-  { 0x2111, 0xF0C1 },   // Im
-  { 0x2297, 0xF0C4 },   // otimes
-  { 0x2229, 0xF0C7 },   // cap
-  { 0x2283, 0xF0C9 },   // supset
-  { 0x222B, 0xF0F2 },   // int
-  { 0x230B, 0xF0FB },   // rfloor
-  { 0x230A, 0xF0EB },   // lfloor
-  { 0x22A5, 0xF05E },   // perp
-  { 0x2227, 0xF0D9 },   // wedge
-  { 0x2309, 0xF0F9 },   // rceil
-  { 0x2228, 0xF0DA },   // vee
-  { 0x27E8, 0xF0E1 },   // langle
+uint32_t
+text_element_symbol::get_symbol_code (void) const
+{
+  uint32_t code = invalid_code;
 
-  { 0x03C5, 0xF075 },   // upsilon
-  { 0x03C6, 0xF066 },   // phi
-  { 0x03C7, 0xF063 },   // chi
-  { 0x03C8, 0xF079 },   // psi
-  { 0x03C9, 0xF077 },   // omega
-  { 0x0393, 0xF047 },   // Gamma
-  { 0x0394, 0xF044 },   // Delta
-  { 0x0398, 0xF051 },   // Theta
-  { 0x039B, 0xF04C },   // Lambda
-  { 0x039E, 0xF058 },   // Xi
-  { 0x03A0, 0xF050 },   // Pi
-  { 0x03A3, 0xF053 },   // Sigma
-  { 0x03D2, 0xF055 },   // Upsilon
-  { 0x03A6, 0xF046 },   // Phi
-  { 0x03A8, 0xF059 },   // Psi
-  { 0x03A9, 0xF057 },   // Omega
-  { 0x2200, 0xF022 },   // forall
-  { 0x2203, 0xF024 },   // exists
-  { 0x220B, 0xF027 },   // ni
-  { 0x2245, 0xF040 },   // cong
-  { 0x2248, 0xF0BB },   // approx
-  { 0x211C, 0xF0C2 },   // Re
-  { 0x2295, 0xF0C5 },   // oplus
-  { 0x222A, 0xF0C8 },   // cup
-  { 0x2286, 0xF0CD },   // subseteq
-  { 0x2208, 0xF0CE },   // in
-  { 0x2308, 0xF0E9 },   // lceil
-  { 0x22C5, 0xF0D7 },   // cdot
-  { 0x00AC, 0xF0D8 },   // neg
-  { 0x00D7, 0xF0B4 },   // times
-  { 0x221A, 0xF0D6 },   // surd
-  { 0x03D6, 0xF076 },   // varpi
-  { 0x27E9, 0xF0F1 },   // rangle
-
-  { 0x223C, 0xF07E },   // sim
-  { 0x2264, 0xF0A3 },   // leq
-  { 0x221E, 0xF0A5 },   // infty
-  { 0x2663, 0xF0A7 },   // clubsuit
-  { 0x2666, 0xF0A8 },   // diamondsuit
-  { 0x2665, 0xF0A9 },   // heartsuit
-  { 0x2660, 0xF0AA },   // spadesuit
-  { 0x2194, 0xF0AB },   // leftrightarrow
-  { 0x2190, 0xF0AC },   // leftarrow
-  { 0x21D0, 0xF0DC },   // Leftarrow
-  { 0x2191, 0xF0AD },   // uparrow
-  { 0x2192, 0xF0AE },   // rightarrow
-  { 0x21D2, 0xF0DE },   // Rightarrow
-  { 0x2193, 0xF0AF },   // downarrow
-  { 0x25CB, 0xF0B0 },   // circ
-  { 0x00B1, 0xF0B1 },   // pm
-  { 0x2265, 0xF0B3 },   // geq
-  { 0x221D, 0xF0B5 },   // propto
-  { 0x2202, 0xF0B6 },   // partial
-  { 0x2022, 0xF0B7 },   // bullet
-  { 0x00F7, 0xF0B8 },   // div
-  { 0x2260, 0xF0B9 },   // neq
-  { 0x2135, 0xF0C0 },   // aleph
-  { 0x2118, 0xF0C3 },   // wp
-  { 0x2298, 0xF0C6 },   // oslash
-  { 0x2287, 0xF0CA },   // supseteq
-  { 0x2282, 0xF0CC },   // subset
-  { 0x03BF, 0xF0B0 },   // o
-  { 0x2207, 0xF0D1 },   // nabla
-  { 0x2026, 0xF0BC },   // ldots
-  { 0x2032, 0xF0A2 },   // prime
-  { 0x2205, 0xF0C6 },   // 0 (empty set)
-  { 0x2223, 0xF0BD },   // mid
-  { 0x00A9, 0xF0E3 },   // copyright
-
-  { 0, 0 }
-};
-
-// FIXME: May want to replace lookup with a map from STL
-//        Number of symbols is ~100 which means linear search
-//        is bordering on inefficient.
-uint32_t
-text_element_symbol::get_symbol_code (void)
-{
-  if (code == invalid_code)
-    {
-      std::string sym = string_value ();
-
-      for (int i = 0; symbol_names[i]; i++)
-        {
-          if (symbol_names[i] == sym)
-            {
-              code = symbol_codes[i][0];
-              break;
-            }
-        }
-    }
+  if (0 <= symbol && symbol < num_symbol_codes)
+    code = symbol_codes[symbol][0];
 
   return code;
 }
--- a/libinterp/corefcn/txt-eng.h
+++ b/libinterp/corefcn/txt-eng.h
@@ -82,23 +82,25 @@
 
 class
 OCTINTERP_API
-text_element_symbol : public text_element_string
+text_element_symbol : public text_element
 {
 public:
   enum { invalid_code = 0xFFFFFFFFU };
 
 public:
-  text_element_symbol (const std::string& sym)
-    : text_element_string (sym), code (invalid_code) { }
+  text_element_symbol (int sym)
+    : text_element (), symbol (sym) { }
 
   ~text_element_symbol (void) { }
 
-  uint32_t get_symbol_code (void);
+  int get_symbol (void) const { return symbol; }
+
+  uint32_t get_symbol_code (void) const;
 
   void accept (text_processor& p);
 
 private:
-  uint32_t code;
+  int symbol;
 };
 
 class