changeset 11702:daa1eb869953

Reduce the number of uc_is_cased calls.
author Bruno Haible <bruno@clisp.org>
date Wed, 01 Jul 2009 01:49:33 +0200
parents 4b50bb02dc1b
children 89b12821209e
files ChangeLog lib/unicase.h lib/unicase/context.h lib/unicase/empty-suffix-context.c lib/unicase/u-casemap.h lib/unicase/u-ct-totitle.h lib/unicase/u-suffix-context.h modules/unicase/u16-suffix-context modules/unicase/u32-suffix-context modules/unicase/u8-suffix-context
diffstat 10 files changed, 58 insertions(+), 38 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,20 @@
+2009-06-30  Bruno Haible  <bruno@clisp.org>
+
+	Reduce the number of uc_is_cased calls.
+	* lib/unicase.h (casing_suffix_context_t): Add
+	'first_char_except_ignorable' field.
+	* lib/unicase/context.h (SCC_FINAL_SIGMA_MASK): Remove macro.
+	(SCC_MORE_ABOVE_MASK, SCC_BEFORE_DOT_MASK): Update.
+	* lib/unicase/empty-suffix-context.c (unicase_empty_suffix_context):
+	Update initializer.
+	* lib/unicase/u-casemap.h (FUNC): Don't invoke uc_is_cased on
+	case-ignorable characters.
+	* lib/unicase/u-ct-totitle.h (FUNC): Likewise.
+	* lib/unicase/u-suffix-context.h (FUNC2): Don't call uc_is_cased here.
+	* modules/unicase/u8-suffix-context (Depends-on): Remove unicase/cased.
+	* modules/unicase/u16-suffix-context (Depends-on): Likewise.
+	* modules/unicase/u32-suffix-context (Depends-on): Likewise.
+
 2009-06-30  Bruno Haible  <bruno@clisp.org>
 
 	Tests for module 'unicase/ignorable'.
--- a/lib/unicase.h
+++ b/lib/unicase.h
@@ -167,8 +167,8 @@
 typedef struct casing_suffix_context
 	{
 	  /* These fields are private, undocumented.  */
+	  uint32_t first_char_except_ignorable;
 	  uint32_t bits;
-	  uint32_t unused_bits;
 	}
 	casing_suffix_context_t;
 /* The case-mapping context of the empty suffix string.  */
--- a/lib/unicase/context.h
+++ b/lib/unicase/context.h
@@ -44,20 +44,22 @@
    casing_suffix_context_t contains the following fields:
 
      // For evaluating the FINAL_SIGMA condition:
-     // Bit 0 is set if the suffix starts with a sequence consisting of a
-     // case-ignorable sequence and then a cased letter.
-     //
+     // First character of the suffix that is not case-ignorable.
+     ucs4_t first_char_except_ignorable;
+
      // For evaluating the MORE_ABOVE condition:
-     // Bit 1 is set if the suffix contains a character of combining class
+     // Bit 0 is set if the suffix contains a character of combining class
      // 230 (Above) with no character of combining class 0 or 230 (Above)
      // before it.
      //
      // For evaluating the BEFORE_DOT condition:
-     // Bit 2 is set if the suffix contains a COMBINING DOT ABOVE (U+0307)
+     // Bit 1 is set if the suffix contains a COMBINING DOT ABOVE (U+0307)
      // with no character of combining class 0 or 230 (Above) before it.
      //
      uint32_t bits;
- */
-#define SCC_FINAL_SIGMA_MASK 1
-#define SCC_MORE_ABOVE_MASK  2
-#define SCC_BEFORE_DOT_MASK  4
+
+   Three bits would be sufficient to carry the context information, but
+   that would require invoking uc_is_cased ahead of time, more often than
+   actually needed.  */
+#define SCC_MORE_ABOVE_MASK  1
+#define SCC_BEFORE_DOT_MASK  2
--- a/lib/unicase/empty-suffix-context.c
+++ b/lib/unicase/empty-suffix-context.c
@@ -22,6 +22,6 @@
 
 const casing_suffix_context_t unicase_empty_suffix_context =
   {
-    0 /* bits */,
-    0 /* unused_bits */
+    0xFFFD /* first_char_except_ignorable */,
+    0 /* bits */
   };
--- a/lib/unicase/u-casemap.h
+++ b/lib/unicase/u-casemap.h
@@ -118,18 +118,20 @@
 				  {
 				    ucs4_t uc2;
 				    int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
-				    if (uc_is_cased (uc2))
+				    /* Our uc_is_case_ignorable function is
+				       known to return false for all cased
+				       characters.  So we can call
+				       uc_is_case_ignorable first.  */
+				    if (!uc_is_case_ignorable (uc2))
 				      {
-					applies = false;
+					applies = ! uc_is_cased (uc2);
 					break;
 				      }
-				    if (!uc_is_case_ignorable (uc2))
-				      break;
 				    s2 += count2;
 				  }
 				else
 				  {
-				    applies = ((suffix_context.bits & SCC_FINAL_SIGMA_MASK) == 0);
+				    applies = ! uc_is_cased (suffix_context.first_char_except_ignorable);
 				    break;
 				  }
 			      }
--- a/lib/unicase/u-ct-totitle.h
+++ b/lib/unicase/u-ct-totitle.h
@@ -194,18 +194,20 @@
 				  {
 				    ucs4_t uc2;
 				    int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
-				    if (uc_is_cased (uc2))
+				    /* Our uc_is_case_ignorable function is
+				       known to return false for all cased
+				       characters.  So we can call
+				       uc_is_case_ignorable first.  */
+				    if (!uc_is_case_ignorable (uc2))
 				      {
-					applies = false;
+					applies = ! uc_is_cased (uc2);
 					break;
 				      }
-				    if (!uc_is_case_ignorable (uc2))
-				      break;
 				    s2 += count2;
 				  }
 				else
 				  {
-				    applies = ((suffix_context.bits & SCC_FINAL_SIGMA_MASK) == 0);
+				    applies = ! uc_is_cased (suffix_context.first_char_except_ignorable);
 				    break;
 				  }
 			      }
--- a/lib/unicase/u-suffix-context.h
+++ b/lib/unicase/u-suffix-context.h
@@ -28,7 +28,7 @@
   /* Evaluate all three conditions in a single pass through the string S.
      The three variables are -1 as long as the value of the condition has
      not been determined.  */
-  int scc_FINAL_SIGMA = -1;
+  ucs4_t first_char_except_ignorable = (ucs4_t)(-1);
   int scc_MORE_ABOVE = -1;
   int scc_BEFORE_DOT = -1;
   const UNIT *s_end = s + n;
@@ -38,12 +38,10 @@
       ucs4_t uc;
       int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
 
-      if (scc_FINAL_SIGMA < 0)
+      if (first_char_except_ignorable == (ucs4_t)(-1))
 	{
-	  if (uc_is_cased (uc))
-	    scc_FINAL_SIGMA = SCC_FINAL_SIGMA_MASK;
-	  else if (!uc_is_case_ignorable (uc))
-	    scc_FINAL_SIGMA = 0;
+	  if (!uc_is_case_ignorable (uc))
+	    first_char_except_ignorable = uc;
 	}
 
       if (scc_MORE_ABOVE < 0)
@@ -67,7 +65,8 @@
 	    }
 	}
 
-      if ((scc_FINAL_SIGMA | scc_MORE_ABOVE | scc_BEFORE_DOT) >= 0)
+      if (first_char_except_ignorable != (ucs4_t)(-1)
+	  && (scc_MORE_ABOVE | scc_BEFORE_DOT) >= 0)
 	/* All conditions have been determined.  */
 	break;
 
@@ -76,13 +75,14 @@
 
   /* For those conditions that have not been determined so far, use the
      value from the argument context.  */
+  context.first_char_except_ignorable =
+    (first_char_except_ignorable != (ucs4_t)(-1)
+     ? first_char_except_ignorable
+     : a_context.first_char_except_ignorable);
   context.bits =
-    (scc_FINAL_SIGMA >= 0
-     ? scc_FINAL_SIGMA
-     : a_context.bits & SCC_FINAL_SIGMA_MASK)
-    | (scc_MORE_ABOVE >= 0
-       ? scc_MORE_ABOVE
-       : a_context.bits & SCC_MORE_ABOVE_MASK)
+    (scc_MORE_ABOVE >= 0
+     ? scc_MORE_ABOVE
+     : a_context.bits & SCC_MORE_ABOVE_MASK)
     | (scc_BEFORE_DOT >= 0
        ? scc_BEFORE_DOT
        : a_context.bits & SCC_BEFORE_DOT_MASK);
--- a/modules/unicase/u16-suffix-context
+++ b/modules/unicase/u16-suffix-context
@@ -9,7 +9,6 @@
 Depends-on:
 unicase/base
 unicase/empty-prefix-context
-unicase/cased
 unicase/ignorable
 unictype/combining-class
 unistr/u16-mbtouc-unsafe
--- a/modules/unicase/u32-suffix-context
+++ b/modules/unicase/u32-suffix-context
@@ -9,7 +9,6 @@
 Depends-on:
 unicase/base
 unicase/empty-prefix-context
-unicase/cased
 unicase/ignorable
 unictype/combining-class
 unistr/u32-mbtouc-unsafe
--- a/modules/unicase/u8-suffix-context
+++ b/modules/unicase/u8-suffix-context
@@ -9,7 +9,6 @@
 Depends-on:
 unicase/base
 unicase/empty-prefix-context
-unicase/cased
 unicase/ignorable
 unictype/combining-class
 unistr/u8-mbtouc-unsafe