changeset 3925:098b9043b95d

(PATFETCH): Remove the translating fetch. (PATFETCH_RAW): Rename to PATFETCH. (set_image_of_range): New fun. (SET_RANGE_TABLE_WORK_AREA): Use it. (regex_compile): Don't translate the pattern chars so eagerly. Only do it when inserting an `exactn' bytecode or when handling a char-range. (mutually_exclusive_p): Avoid empty statement.
author Stefan Monnier <monnier@iro.umontreal.ca>
date Fri, 23 Aug 2002 22:19:56 +0000
parents 105ea603d35b
children 06cd223546da
files regex.c
diffstat 1 files changed, 50 insertions(+), 26 deletions(-) [+]
line wrap: on
line diff
--- a/regex.c
+++ b/regex.c
@@ -19,7 +19,9 @@
    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
    USA.	 */
 
-/* TODO:
+/* BUGS:
+   - (x?)*y\1z should match both xxxxyxz and xxxyz.
+   TODO:
    - structure the opcode space into opcode+flag.
    - merge with glibc's regex.[ch].
    - replace (succeed_n + jump_n + set_number_at) with something that doesn't
@@ -1682,17 +1684,9 @@
 static int analyse_first _RE_ARGS ((re_char *p, re_char *pend,
 				    char *fastmap, const int multibyte));
 
-/* Fetch the next character in the uncompiled pattern---translating it
-   if necessary.  */
-#define PATFETCH(c)							\
-  do {									\
-    PATFETCH_RAW (c);							\
-    c = TRANSLATE (c);							\
-  } while (0)
-
 /* Fetch the next character in the uncompiled pattern, with no
    translation.  */
-#define PATFETCH_RAW(c)							\
+#define PATFETCH(c)							\
   do {									\
     int len;								\
     if (p == pend) return REG_EEND;					\
@@ -1914,12 +1908,13 @@
 #define BIT_UPPER	0x10
 #define BIT_MULTIBYTE	0x20
 
-/* Set a range (RANGE_START, RANGE_END) to WORK_AREA.  */
-#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end)	\
-  do {									\
-    EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2);			\
-    (work_area).table[(work_area).used++] = (range_start);		\
-    (work_area).table[(work_area).used++] = (range_end);		\
+/* Set a range START..END to WORK_AREA.
+   The range is passed through TRANSLATE, so START and END
+   should be untranslated.  */
+#define SET_RANGE_TABLE_WORK_AREA(work_area, start, end)	\
+  do {								\
+    EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2);		\
+    set_image_of_range (&work_area, start, end, translate);	\
   } while (0)
 
 /* Free allocated memory for WORK_AREA.	 */
@@ -2077,6 +2072,31 @@
 }
 #endif
 
+
+
+/* We need to find the image of the range start..end when passed through
+   TRANSLATE.  This is not necessarily TRANSLATE(start)..TRANSLATE(end)
+   and is not even necessarily contiguous.
+   We approximate it with the smallest contiguous range that contains
+   all the chars we need.  */
+static void
+set_image_of_range (work_area, start, end, translate)
+     RE_TRANSLATE_TYPE translate;
+     struct range_table_work_area *work_area;
+     re_wchar_t start, end;
+{
+  re_wchar_t cmin = TRANSLATE (start), cmax = TRANSLATE (end);
+  if (RE_TRANSLATE_P (translate))
+    for (; start <= end; start++)
+      {
+	re_wchar_t c = TRANSLATE (start);
+	cmin = MIN (cmin, c);
+	cmax = MAX (cmax, c);
+      }
+  work_area->table[work_area->used++] = (cmin);
+  work_area->table[work_area->used++] = (cmax);
+}
+
 /* Explicit quit checking is only used on NTemacs.  */
 #if defined WINDOWSNT && defined emacs && defined QUIT
 extern int immediate_quit;
@@ -2525,6 +2545,10 @@
 
 		if (p == pend) FREE_STACK_RETURN (REG_EBRACK);
 
+		/* Don't translate yet.  The range TRANSLATE(X..Y) cannot
+		   always be determined from TRANSLATE(X) and TRANSLATE(Y)
+		   So the translation is done later in a loop.  Example:
+		   (let ((case-fold-search t)) (string-match "[A-_]" "A"))  */
 		PATFETCH (c);
 
 		/* \ might escape characters inside [...] and [^...].  */
@@ -2584,7 +2608,7 @@
 		       them).  */
 		    if (c == ':' && *p == ']')
 		      {
-			int ch;
+			re_wchar_t ch;
 			re_wctype_t cc;
 
 			cc = re_wctype (str);
@@ -2653,8 +2677,8 @@
 			       starting at the smallest character in
 			       the charset of C1 and ending at C1.  */
 			    int charset = CHAR_CHARSET (c1);
-			    int c2 = MAKE_CHAR (charset, 0, 0);
-			    
+			    re_wchar_t c2 = MAKE_CHAR (charset, 0, 0);
+
 			    SET_RANGE_TABLE_WORK_AREA (range_table_work,
 						       c2, c1);
 			    c1 = 0377;
@@ -2672,7 +2696,7 @@
 		  /* ... into bitmap.  */
 		  {
 		    re_wchar_t this_char;
-		    int range_start = c, range_end = c1;
+		    re_wchar_t range_start = c, range_end = c1;
 
 		    /* If the start is after the end, the range is empty.  */
 		    if (range_start > range_end)
@@ -2769,7 +2793,7 @@
 	  /* Do not translate the character after the \, so that we can
 	     distinguish, e.g., \B from \b, even if we normally would
 	     translate, e.g., B to b.  */
-	  PATFETCH_RAW (c);
+	  PATFETCH (c);
 
 	  switch (c)
 	    {
@@ -3129,13 +3153,13 @@
 
 	    case 'c':
 	      laststart = b;
-	      PATFETCH_RAW (c);
+	      PATFETCH (c);
 	      BUF_PUSH_2 (categoryspec, c);
 	      break;
 
 	    case 'C':
 	      laststart = b;
-	      PATFETCH_RAW (c);
+	      PATFETCH (c);
 	      BUF_PUSH_2 (notcategoryspec, c);
 	      break;
 #endif /* emacs */
@@ -3225,7 +3249,6 @@
 	      /* You might think it would be useful for \ to mean
 		 not to translate; but if we don't translate it
 		 it will never match anything.  */
-	      c = TRANSLATE (c);
 	      goto normal_char;
 	    }
 	  break;
@@ -3234,7 +3257,7 @@
 	default:
 	/* Expects the character in `c'.  */
 	normal_char:
-	      /* If no exactn currently being built.  */
+	  /* If no exactn currently being built.  */
 	  if (!pending_exact
 
 	      /* If last exactn not at current position.  */
@@ -3265,6 +3288,7 @@
 	  {
 	    int len;
 
+	    c = TRANSLATE (c);
 	    if (multibyte)
 	      len = CHAR_STRING (c, b);
 	    else
@@ -4427,7 +4451,7 @@
 	     they don't overlap.  The union of the two sets of excluded
 	     chars should cover all possible chars, which, as a matter of
 	     fact, is virtually impossible in multibyte buffers.  */
-	  ;
+	  break;
 	}
       break;