changeset 11322:5d7ae44ac10a

New module 'unicase/u8-totitle'.
author Bruno Haible <bruno@clisp.org>
date Sun, 08 Mar 2009 15:20:10 +0100
parents e1e651a8638c
children c72293bdd051
files ChangeLog lib/unicase/u-totitle.h lib/unicase/u8-totitle.c modules/unicase/u8-totitle
diffstat 4 files changed, 670 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,10 @@
 2009-03-08  Bruno Haible  <bruno@clisp.org>
 
+	New module 'unicase/u8-totitle'.
+	* lib/unicase/u8-totitle.c: New file.
+	* lib/unicase/u-totitle.h: New file.
+	* modules/unicase/u8-totitle: New file.
+
 	Tests for module 'unicase/u32-tolower'.
 	* modules/unicase/u32-tolower-tests: New file.
 	* tests/unicase/test-u32-tolower.c: New file.
new file mode 100644
--- /dev/null
+++ b/lib/unicase/u-totitle.h
@@ -0,0 +1,502 @@
+/* Titlecase mapping for UTF-8/UTF-16/UTF-32 strings (locale dependent).
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+/* Tests whether UC is "cased" in the sense of the Unicode standard: it has
+   the Lowercase or Uppercase property, or its General_Category value is
+   Titlecase_Letter.  */
+static inline bool
+is_cased (ucs4_t uc)
+{
+  if (uc_is_property_lowercase (uc) || uc_is_property_uppercase (uc))
+    return true;
+  return uc_is_general_category (uc, UC_TITLECASE_LETTER);
+}
+
+/* Tests whether UC is "case-ignorable".
+   Per the Unicode standard, a character is case-ignorable if its
+   Word_Break property has the value MidLetter or MidNumLet, or if its
+   General_Category is one of Nonspacing_Mark (Mn), Enclosing_Mark (Me),
+   Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).
+   The MidNumLet alternative was added in Unicode 5.1.0, see
+   <http://www.unicode.org/versions/Unicode5.1.0/> section "Update of
+   Definition of case-ignorable".  */
+static inline bool
+is_case_ignorable (ucs4_t uc)
+{
+  int word_break = uc_wordbreak_property (uc);
+  if (word_break == WBP_MIDLETTER || word_break == WBP_MIDNUMLET)
+    return true;
+  return uc_is_general_category_withtable (uc, UC_CATEGORY_MASK_Mn
+					       | UC_CATEGORY_MASK_Me
+					       | UC_CATEGORY_MASK_Cf
+					       | UC_CATEGORY_MASK_Lm
+					       | UC_CATEGORY_MASK_Sk);
+}
+
+/* Quoting the Unicode standard, section "Default Case Algorithms":
+     Find the word boundaries in X according to Unicode Standard Annex #29,
+     “Text Boundaries.” For each word boundary, find the first cased character
+     F following the word boundary. If F exists, map F to Titlecase_Mapping(F);
+     then map all characters C between F and the following word boundary to
+     Lowercase_Mapping(C).
+   Returns the mapped string, or NULL on failure.  */
+UNIT *
+FUNC (const UNIT *s, size_t n, const char *iso639_language,
+      uninorm_t nf,
+      UNIT *resultbuf, size_t *lengthp)
+{
+  /* The result being accumulated.  */
+  UNIT *result;
+  size_t length;     /* number of units stored in result so far */
+  size_t allocated;  /* capacity of result, in units */
+  /* An array containing the word break positions.  */
+  char *wordbreaks;
+
+  /* Initialize the accumulator; resultbuf is used only when nf == NULL.  */
+  if (nf != NULL || resultbuf == NULL)
+    {
+      result = NULL;
+      allocated = 0;
+    }
+  else
+    {
+      result = resultbuf;
+      allocated = *lengthp;
+    }
+  length = 0;
+
+  /* Initialize the word breaks array: one char per unit of s.  */
+  if (n > 0)
+    {
+      wordbreaks = (char *) malloc (n);
+      if (wordbreaks == NULL)
+	{
+	  errno = ENOMEM;
+	  goto fail2;
+	}
+      U_WORDBREAKS (s, n, wordbreaks);
+    }
+  else
+    wordbreaks = NULL;
+
+  {
+    const UNIT *s_end = s + n;
+    const char *wp = wordbreaks;
+
+    /* When considering the string as segmented by word boundaries: For each
+       such segment:
+	- In the first part, we are searching for the first cased character.
+	  In this state, in_word_first_part = true, and no conversion takes
+	  place.
+	- In the second part, we are converting every character: the first
+	  among these characters to title case, the other ones to lower case.
+	  In this state, in_word_first_part = false.  */
+    bool in_word_first_part = true;
+
+    /* Helper for evaluating the FINAL_SIGMA condition:
+       Last character that was not case-ignorable.  */
+    ucs4_t last_char_except_ignorable = 0xFFFD;
+
+    /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions:
+       Last character that was of combining class 230 ("Above") or 0.  */
+    ucs4_t last_char_normal_or_above = 0xFFFD;
+
+    while (s < s_end)
+      {
+	/* Fetch the next character.  */
+	ucs4_t uc;
+	int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s);
+
+	ucs4_t (*single_character_map) (ucs4_t);
+	size_t offset_in_rule; /* offset in 'struct special_casing_rule' */
+
+	ucs4_t mapped_uc[3]; /* a special-casing rule yields 0 to 3 characters */
+	unsigned int mapped_count;
+
+	if (*wp)
+	  /* Crossing a word boundary.  */
+	  in_word_first_part = true;
+
+	/* Determine single_character_map, offset_in_rule.
+	   There are three possibilities:
+	     - uc should not be converted.
+	     - uc should be titlecased.
+	     - uc should be lowercased.  */
+	if (in_word_first_part)
+	  {
+	    if (is_cased (uc))
+	      {
+		/* uc is to be titlecased.  */
+		single_character_map = uc_totitle;
+		offset_in_rule = offsetof (struct special_casing_rule, title[0]);
+		in_word_first_part = false;
+	      }
+	    else
+	      {
+		/* uc is not converted.  */
+		single_character_map = NULL;
+		offset_in_rule = 0;
+	      }
+	  }
+	else
+	  {
+	    /* uc is to be lowercased.  */
+	    single_character_map = uc_tolower;
+	    offset_in_rule = offsetof (struct special_casing_rule, lower[0]);
+	  }
+
+	/* Actually map uc.  */
+	if (single_character_map == NULL)
+	  {
+	    mapped_uc[0] = uc;
+	    mapped_count = 1;
+	    goto found_mapping;
+	  }
+
+	if (uc < 0x10000) /* the lookup key below holds only 16 bits of uc */
+	  {
+	    /* Look first in the special-casing table.  */
+	    char code[3];
+
+	    code[0] = (uc >> 8) & 0xff;
+	    code[1] = uc & 0xff;
+
+	    for (code[2] = 0; ; code[2]++) /* try successive rules for uc */
+	      {
+		const struct special_casing_rule *rule =
+		  gl_unicase_special_lookup (code, 3);
+
+		if (rule == NULL)
+		  break;
+
+		/* Test if the condition applies.  */
+		/* Does the language apply?  */
+		if (rule->language[0] == '\0'
+		    || (iso639_language != NULL
+			&& iso639_language[0] == rule->language[0]
+			&& iso639_language[1] == rule->language[1]))
+		  {
+		    /* Does the context apply?  */
+		    int context = rule->context;
+		    bool applies;
+
+		    if (context < 0) /* negated condition; inverted below */
+		      context = - context;
+		    switch (context)
+		      {
+		      case SCC_ALWAYS:
+			applies = true;
+			break;
+
+		      case SCC_FINAL_SIGMA:
+			/* "Before" condition: preceded by a sequence
+			   consisting of a cased letter and a case-ignorable
+			   sequence.
+			   "After" condition: not followed by a sequence
+			   consisting of a case-ignorable sequence and then a
+			   cased letter.  */
+			/* Test the "before" condition.  */
+			applies = is_cased (last_char_except_ignorable);
+			/* Test the "after" condition.  */
+			if (applies)
+			  {
+			    const UNIT *s2 = s + count;
+			    while (s2 < s_end)
+			      {
+				ucs4_t uc2;
+				int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
+				if (is_cased (uc2))
+				  {
+				    applies = false;
+				    break;
+				  }
+				if (!is_case_ignorable (uc2))
+				  break;
+				s2 += count2;
+			      }
+			  }
+			break;
+
+		      case SCC_AFTER_SOFT_DOTTED:
+			/* "Before" condition: There is a Soft_Dotted character
+			   before it, with no intervening character of
+			   combining class 0 or 230 (Above).  */
+			/* Test the "before" condition.  */
+			applies = uc_is_property_soft_dotted (last_char_normal_or_above);
+			break;
+
+		      case SCC_MORE_ABOVE:
+			/* "After" condition: followed by a character of
+			   combining class 230 (Above) with no intervening
+			   character of combining class 0 or 230 (Above).  */
+			/* Test the "after" condition.  */
+			{
+			  const UNIT *s2 = s + count;
+			  applies = false;
+			  while (s2 < s_end)
+			    {
+			      ucs4_t uc2;
+			      int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
+			      int ccc = uc_combining_class (uc2);
+			      if (ccc == UC_CCC_A)
+				{
+				  applies = true;
+				  break;
+				}
+			      if (ccc == UC_CCC_NR)
+				break;
+			      s2 += count2;
+			    }
+			}
+			break;
+
+		      case SCC_BEFORE_DOT:
+			/* "After" condition: followed by COMBINING DOT ABOVE
+			   (U+0307). Any sequence of characters with a
+			   combining class that is neither 0 nor 230 may
+			   intervene between the current character and the
+			   combining dot above.  */
+			/* Test the "after" condition.  */
+			{
+			  const UNIT *s2 = s + count;
+			  applies = false;
+			  while (s2 < s_end)
+			    {
+			      ucs4_t uc2;
+			      int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2);
+			      if (uc2 == 0x0307) /* COMBINING DOT ABOVE */
+				{
+				  applies = true;
+				  break;
+				}
+			      {
+				int ccc = uc_combining_class (uc2);
+				if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
+				  break;
+			      }
+			      s2 += count2;
+			    }
+			}
+			break;
+
+		      case SCC_AFTER_I:
+			/* "Before" condition: There is an uppercase I before
+			   it, and there is no intervening character of
+			   combining class 0 or 230 (Above).  */
+			/* Test the "before" condition.  */
+			applies = (last_char_normal_or_above == 'I');
+			break;
+
+		      default:
+			abort ();
+		      }
+		    if (rule->context < 0)
+		      applies = !applies;
+
+		    if (applies)
+		      {
+			/* The rule applies.
+			   Look up the mapping (0 to 3 characters).  */
+			const unsigned short *mapped_in_rule =
+			  (const unsigned short *)((const char *)rule + offset_in_rule);
+
+			if (mapped_in_rule[0] == 0)
+			  mapped_count = 0;
+			else
+			  {
+			    mapped_uc[0] = mapped_in_rule[0];
+			    if (mapped_in_rule[1] == 0)
+			      mapped_count = 1;
+			    else
+			      {
+				mapped_uc[1] = mapped_in_rule[1];
+				if (mapped_in_rule[2] == 0)
+				  mapped_count = 2;
+				else
+				  {
+				    mapped_uc[2] = mapped_in_rule[2];
+				    mapped_count = 3;
+				  }
+			      }
+			  }
+			goto found_mapping;
+		      }
+		  }
+
+		/* Optimization: Save a hash table lookup in the next round.  */
+		if (!rule->has_next)
+		  break;
+	      }
+	  }
+
+	/* No special-cased mapping.  So use the locale and context independent
+	   mapping.  */
+	mapped_uc[0] = single_character_map (uc);
+	mapped_count = 1;
+
+       found_mapping:
+	/* Found the mapping: uc maps to mapped_uc[0..mapped_count-1].  */
+	{
+	  unsigned int i;
+
+	  for (i = 0; i < mapped_count; i++)
+	    {
+	      ucs4_t muc = mapped_uc[i];
+
+	      /* Append muc to the result accumulator.  */
+	      if (length < allocated)
+		{
+		  int ret = U_UCTOMB (result + length, muc, allocated - length);
+		  if (ret == -1)
+		    {
+		      errno = EINVAL;
+		      goto fail1;
+		    }
+		  if (ret >= 0)
+		    {
+		      length += ret;
+		      goto done_appending;
+		    }
+		}
+	      { /* Not appended (and no EINVAL): enlarge the buffer and retry.  */
+		size_t old_allocated = allocated;
+		size_t new_allocated = 2 * old_allocated;
+		if (new_allocated < 64)
+		  new_allocated = 64;
+		if (new_allocated < old_allocated) /* integer overflow? */
+		  abort ();
+		{
+		  UNIT *larger_result;
+		  if (result == NULL)
+		    {
+		      larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
+		      if (larger_result == NULL)
+			{
+			  errno = ENOMEM;
+			  goto fail1;
+			}
+		    }
+		  else if (result == resultbuf) /* caller's buffer: copy, never realloc */
+		    {
+		      larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT));
+		      if (larger_result == NULL)
+			{
+			  errno = ENOMEM;
+			  goto fail1;
+			}
+		      U_CPY (larger_result, resultbuf, length);
+		    }
+		  else
+		    {
+		      larger_result =
+			(UNIT *) realloc (result, new_allocated * sizeof (UNIT));
+		      if (larger_result == NULL)
+			{
+			  errno = ENOMEM;
+			  goto fail1;
+			}
+		    }
+		  result = larger_result;
+		  allocated = new_allocated;
+		  {
+		    int ret = U_UCTOMB (result + length, muc, allocated - length);
+		    if (ret == -1)
+		      {
+			errno = EINVAL;
+			goto fail1;
+		      }
+		    if (ret < 0)
+		      abort ();
+		    length += ret;
+		    goto done_appending;
+		  }
+		}
+	      }
+	     done_appending: ;
+	    }
+	}
+
+	if (!is_case_ignorable (uc))
+	  last_char_except_ignorable = uc;
+
+	{
+	  int ccc = uc_combining_class (uc);
+	  if (ccc == UC_CCC_A || ccc == UC_CCC_NR)
+	    last_char_normal_or_above = uc;
+	}
+
+	s += count;
+	wp += count;
+      }
+  }
+
+  free (wordbreaks);
+
+  if (nf != NULL)
+    {
+      /* Finally, normalize the result.  */
+      UNIT *normalized_result;
+
+      normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp);
+      if (normalized_result == NULL)
+	goto fail2;
+
+      free (result); /* never resultbuf here, since nf != NULL */
+      return normalized_result;
+    }
+
+  if (length == 0)
+    {
+      if (result == NULL)
+	{
+	  /* Return a non-NULL value.  NULL means error.  */
+	  result = (UNIT *) malloc (1);
+	  if (result == NULL)
+	    {
+	      errno = ENOMEM;
+	      goto fail2;
+	    }
+	}
+    }
+  else if (result != resultbuf && length < allocated)
+    {
+      /* Shrink the allocated memory if possible.  */
+      UNIT *memory;
+
+      memory = (UNIT *) realloc (result, length * sizeof (UNIT));
+      if (memory != NULL)
+	result = memory;
+    }
+
+  *lengthp = length;
+  return result;
+
+ fail1: /* Reached while wordbreaks is still allocated.  */
+  {
+    int saved_errno = errno;
+    free (wordbreaks);
+    errno = saved_errno;
+  }
+ fail2: /* Here wordbreaks is already freed or was never allocated.  */
+  if (result != resultbuf)
+    {
+      int saved_errno = errno;
+      free (result);
+      errno = saved_errno;
+    }
+  return NULL;
+}
new file mode 100644
--- /dev/null
+++ b/lib/unicase/u8-totitle.c
@@ -0,0 +1,122 @@
+/* Titlecase mapping for UTF-8 strings (locale dependent).
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+/* Specification.  */
+#include "unicase.h"
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "unistr.h"
+#include "unictype.h"
+#include "uniwbrk.h"
+#include "uninorm.h"
+#include "special-casing.h"
+
+#define FUNC u8_totitle
+#define UNIT uint8_t
+#define U_WORDBREAKS u8_wordbreaks
+#define U_MBTOUC_UNSAFE u8_mbtouc_unsafe
+#define U_UCTOMB u8_uctomb
+#define U_CPY u8_cpy
+#define U_NORMALIZE u8_normalize
+#include "u-totitle.h"
+
+
+#ifdef TEST
+
+#include <locale.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Read STREAM until end of file and return its contents in a malloc'd
+   buffer terminated with a NUL byte.  Exits with status 1 on failure.  */
+char *
+read_file (FILE *stream)
+{
+#define BUFSIZE 4096
+  char *buf = NULL;
+  size_t alloc = 0;	/* bytes allocated for buf */
+  size_t size = 0;	/* bytes read so far; size_t matches fread's result */
+  size_t count;
+
+  while (! feof (stream))
+    {
+      if (size + BUFSIZE > alloc)	/* keep room for one full chunk */
+	{
+	  alloc = alloc + alloc / 2;
+	  if (alloc < size + BUFSIZE)
+	    alloc = size + BUFSIZE;
+	  buf = realloc (buf, alloc);
+	  if (buf == NULL)
+	    {
+	      fprintf (stderr, "out of memory\n");
+	      exit (1);
+	    }
+	}
+      count = fread (buf + size, 1, BUFSIZE, stream);
+      if (count == 0)
+	{
+	  if (ferror (stream))
+	    {
+	      perror ("fread");
+	      exit (1);
+	    }
+	}
+      else
+	size += count;
+    }
+  buf = realloc (buf, size + 1);	/* exact size plus the terminator */
+  if (buf == NULL)
+    {
+      fprintf (stderr, "out of memory\n");
+      exit (1);
+    }
+  buf[size] = '\0';
+  return buf;
+#undef BUFSIZE
+}
+
+int
+main (int argc, char * argv[])
+{
+  /* Test driver: writes the title case of standard input to stdout.  */
+  setlocale (LC_ALL, "");
+  if (argc == 1)
+    {
+      char *input = read_file (stdin);
+      size_t output_length;
+      uint8_t *output =
+	u8_totitle ((uint8_t *) input, strlen (input), uc_locale_language (),
+		    NULL, NULL, &output_length);
+      if (output == NULL)	/* NULL means the conversion failed */
+	{
+	  perror ("u8_totitle");
+	  return 1;
+	}
+      fwrite (output, 1, output_length, stdout);
+      return 0;
+    }
+  return 1;	/* Command-line arguments are not supported.  */
+}
+
+#endif /* TEST */
new file mode 100644
--- /dev/null
+++ b/modules/unicase/u8-totitle
@@ -0,0 +1,41 @@
+Description:
+Titlecase mapping for UTF-8 strings (locale dependent).
+
+Files:
+lib/unicase/u8-totitle.c
+lib/unicase/u-totitle.h
+
+Depends-on:
+unicase/base
+unicase/special-casing
+unicase/totitle
+unicase/tolower
+uniwbrk/wordbreak-property
+uniwbrk/u8-wordbreaks
+unictype/category-of
+unictype/category-test
+unictype/category-Lt
+unictype/combining-class
+unictype/property-lowercase
+unictype/property-uppercase
+unictype/property-soft-dotted
+unistr/u8-mbtouc-unsafe
+unistr/u8-uctomb
+unistr/u8-cpy
+uninorm/u8-normalize
+stdbool
+
+configure.ac:
+
+Makefile.am:
+lib_SOURCES += unicase/u8-totitle.c
+
+Include:
+"unicase.h"
+
+License:
+LGPL
+
+Maintainer:
+Bruno Haible
+