# HG changeset patch # User Bruno Haible <bruno@clisp.org> # Date 1246312748 -7200 # Node ID aa8baf2916ba039a9c800de09562c47de7685352 # Parent e7ba784b12c46f70a71c3180b79dab8cf3c2ec8f New module 'unicase/u8-ct-totitle'. diff --git a/ChangeLog b/ChangeLog --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,10 @@ 2009-06-29 Bruno Haible <bruno@clisp.org> + New module 'unicase/u8-ct-totitle'. + * lib/unicase/u8-ct-totitle.c: New file. + * lib/unicase/u-ct-totitle.h: New file. + * modules/unicase/u8-ct-totitle: New file. + New module 'unicase/u32-ct-tolower'. * lib/unicase/u32-ct-tolower.c: New file. * modules/unicase/u32-ct-tolower: New file. diff --git a/lib/unicase/u-ct-totitle.h b/lib/unicase/u-ct-totitle.h new file mode 100644 --- /dev/null +++ b/lib/unicase/u-ct-totitle.h @@ -0,0 +1,499 @@ +/* Titlecase mapping for UTF-8/UTF-16/UTF-32 substrings (locale dependent). + Copyright (C) 2009 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2009. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. */ + +/* Quoting the Unicode standard, section "Default Case Algorithms": + Find the word boundaries in X according to Unicode Standard Annex #29, + “Text Boundaries.” For each word boundary, find the first cased character + F following the word boundary. If F exists, map F to Titlecase_Mapping(F); + then map all characters C between F and the following word boundary to + Lowercase_Mapping(C). 
*/ + +UNIT * +FUNC (const UNIT *s, size_t n, + casing_prefix_context_t prefix_context, + casing_suffix_context_t suffix_context, + const char *iso639_language, + uninorm_t nf, + UNIT *resultbuf, size_t *lengthp) +{ + /* The result being accumulated. */ + UNIT *result; + size_t length; + size_t allocated; + /* An array containing the word break positions. */ + char *wordbreaks; + + /* Initialize the accumulator. */ + if (nf != NULL || resultbuf == NULL) + { + result = NULL; + allocated = 0; + } + else + { + result = resultbuf; + allocated = *lengthp; + } + length = 0; + + /* Initialize the word breaks array. */ + if (n > 0) + { + wordbreaks = (char *) malloc (n); + if (wordbreaks == NULL) + { + errno = ENOMEM; + goto fail2; + } + U_WORDBREAKS (s, n, wordbreaks); + } + else + wordbreaks = NULL; + + { + const UNIT *s_end = s + n; + const char *wp = wordbreaks; + + /* When considering the string as segmented by word boundaries: For each + such segment: + - In the first part, we are searching for the first cased character. + In this state, in_word_first_part = true, and no conversion takes + place. + - In the second part, we are converting every character: the first + among these characters to title case, the other ones to lower case. + In this state, in_word_first_part = false. */ + bool in_word_first_part = true; + + /* Helper for evaluating the FINAL_SIGMA condition: + Last character that was not case-ignorable. */ + ucs4_t last_char_except_ignorable = + prefix_context.last_char_except_ignorable; + + /* Helper for evaluating the AFTER_SOFT_DOTTED and AFTER_I conditions: + Last character that was of combining class 230 ("Above") or 0. */ + ucs4_t last_char_normal_or_above = + prefix_context.last_char_normal_or_above; + + while (s < s_end) + { + /* Fetch the next character. 
*/ + ucs4_t uc; + int count = U_MBTOUC_UNSAFE (&uc, s, s_end - s); + + ucs4_t (*single_character_map) (ucs4_t); + size_t offset_in_rule; /* offset in 'struct special_casing_rule' */ + + ucs4_t mapped_uc[3]; + unsigned int mapped_count; + + if (*wp) + /* Crossing a word boundary. */ + in_word_first_part = true; + + /* Determine single_character_map, offset_in_rule. + There are three possibilities: + - uc should not be converted. + - uc should be titlecased. + - uc should be lowercased. */ + if (in_word_first_part) + { + if (uc_is_cased (uc)) + { + /* uc is to be titlecased. */ + single_character_map = uc_totitle; + offset_in_rule = offsetof (struct special_casing_rule, title[0]); + in_word_first_part = false; + } + else + { + /* uc is not converted. */ + single_character_map = NULL; + offset_in_rule = 0; + } + } + else + { + /* uc is to be lowercased. */ + single_character_map = uc_tolower; + offset_in_rule = offsetof (struct special_casing_rule, lower[0]); + } + + /* Actually map uc. */ + if (single_character_map == NULL) + { + mapped_uc[0] = uc; + mapped_count = 1; + goto found_mapping; + } + + if (uc < 0x10000) + { + /* Look first in the special-casing table. */ + char code[3]; + + code[0] = (uc >> 8) & 0xff; + code[1] = uc & 0xff; + + for (code[2] = 0; ; code[2]++) + { + const struct special_casing_rule *rule = + gl_unicase_special_lookup (code, 3); + + if (rule == NULL) + break; + + /* Test if the condition applies. */ + /* Does the language apply? */ + if (rule->language[0] == '\0' + || (iso639_language != NULL + && iso639_language[0] == rule->language[0] + && iso639_language[1] == rule->language[1])) + { + /* Does the context apply? */ + int context = rule->context; + bool applies; + + if (context < 0) + context = - context; + switch (context) + { + case SCC_ALWAYS: + applies = true; + break; + + case SCC_FINAL_SIGMA: + /* "Before" condition: preceded by a sequence + consisting of a cased letter and a case-ignorable + sequence. 
+ "After" condition: not followed by a sequence + consisting of a case-ignorable sequence and then a + cased letter. */ + /* Test the "before" condition. */ + applies = uc_is_cased (last_char_except_ignorable); + /* Test the "after" condition. */ + if (applies) + { + const UNIT *s2 = s + count; + for (;;) + { + if (s2 < s_end) + { + ucs4_t uc2; + int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); + if (uc_is_cased (uc2)) + { + applies = false; + break; + } + if (!uc_is_case_ignorable (uc2)) + break; + s2 += count2; + } + else + { + applies = ((suffix_context.bits & SCC_FINAL_SIGMA_MASK) == 0); + break; + } + } + } + break; + + case SCC_AFTER_SOFT_DOTTED: + /* "Before" condition: There is a Soft_Dotted character + before it, with no intervening character of + combining class 0 or 230 (Above). */ + /* Test the "before" condition. */ + applies = uc_is_property_soft_dotted (last_char_normal_or_above); + break; + + case SCC_MORE_ABOVE: + /* "After" condition: followed by a character of + combining class 230 (Above) with no intervening + character of combining class 0 or 230 (Above). */ + /* Test the "after" condition. */ + { + const UNIT *s2 = s + count; + applies = false; + for (;;) + { + if (s2 < s_end) + { + ucs4_t uc2; + int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); + int ccc = uc_combining_class (uc2); + if (ccc == UC_CCC_A) + { + applies = true; + break; + } + if (ccc == UC_CCC_NR) + break; + s2 += count2; + } + else + { + applies = ((suffix_context.bits & SCC_MORE_ABOVE_MASK) != 0); + break; + } + } + } + break; + + case SCC_BEFORE_DOT: + /* "After" condition: followed by COMBINING DOT ABOVE + (U+0307). Any sequence of characters with a + combining class that is neither 0 nor 230 may + intervene between the current character and the + combining dot above. */ + /* Test the "after" condition. 
*/ + { + const UNIT *s2 = s + count; + applies = false; + for (;;) + { + if (s2 < s_end) + { + ucs4_t uc2; + int count2 = U_MBTOUC_UNSAFE (&uc2, s2, s_end - s2); + if (uc2 == 0x0307) /* COMBINING DOT ABOVE */ + { + applies = true; + break; + } + { + int ccc = uc_combining_class (uc2); + if (ccc == UC_CCC_A || ccc == UC_CCC_NR) + break; + } + s2 += count2; + } + else + { + applies = ((suffix_context.bits & SCC_BEFORE_DOT_MASK) != 0); + break; + } + } + } + break; + + case SCC_AFTER_I: + /* "Before" condition: There is an uppercase I before + it, and there is no intervening character of + combining class 0 or 230 (Above). */ + /* Test the "before" condition. */ + applies = (last_char_normal_or_above == 'I'); + break; + + default: + abort (); + } + if (rule->context < 0) + applies = !applies; + + if (applies) + { + /* The rule applies. + Look up the mapping (0 to 3 characters). */ + const unsigned short *mapped_in_rule = + (const unsigned short *)((const char *)rule + offset_in_rule); + + if (mapped_in_rule[0] == 0) + mapped_count = 0; + else + { + mapped_uc[0] = mapped_in_rule[0]; + if (mapped_in_rule[1] == 0) + mapped_count = 1; + else + { + mapped_uc[1] = mapped_in_rule[1]; + if (mapped_in_rule[2] == 0) + mapped_count = 2; + else + { + mapped_uc[2] = mapped_in_rule[2]; + mapped_count = 3; + } + } + } + goto found_mapping; + } + } + + /* Optimization: Save a hash table lookup in the next round. */ + if (!rule->has_next) + break; + } + } + + /* No special-cased mapping. So use the locale and context independent + mapping. */ + mapped_uc[0] = single_character_map (uc); + mapped_count = 1; + + found_mapping: + /* Found the mapping: uc maps to mapped_uc[0..mapped_count-1]. */ + { + unsigned int i; + + for (i = 0; i < mapped_count; i++) + { + ucs4_t muc = mapped_uc[i]; + + /* Append muc to the result accumulator. 
*/ + if (length < allocated) + { + int ret = U_UCTOMB (result + length, muc, allocated - length); + if (ret == -1) + { + errno = EINVAL; + goto fail1; + } + if (ret >= 0) + { + length += ret; + goto done_appending; + } + } + { + size_t old_allocated = allocated; + size_t new_allocated = 2 * old_allocated; + if (new_allocated < 64) + new_allocated = 64; + if (new_allocated < old_allocated) /* integer overflow? */ + abort (); + { + UNIT *larger_result; + if (result == NULL) + { + larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT)); + if (larger_result == NULL) + { + errno = ENOMEM; + goto fail1; + } + } + else if (result == resultbuf) + { + larger_result = (UNIT *) malloc (new_allocated * sizeof (UNIT)); + if (larger_result == NULL) + { + errno = ENOMEM; + goto fail1; + } + U_CPY (larger_result, resultbuf, length); + } + else + { + larger_result = + (UNIT *) realloc (result, new_allocated * sizeof (UNIT)); + if (larger_result == NULL) + { + errno = ENOMEM; + goto fail1; + } + } + result = larger_result; + allocated = new_allocated; + { + int ret = U_UCTOMB (result + length, muc, allocated - length); + if (ret == -1) + { + errno = EINVAL; + goto fail1; + } + if (ret < 0) + abort (); + length += ret; + goto done_appending; + } + } + } + done_appending: ; + } + } + + if (!uc_is_case_ignorable (uc)) + last_char_except_ignorable = uc; + + { + int ccc = uc_combining_class (uc); + if (ccc == UC_CCC_A || ccc == UC_CCC_NR) + last_char_normal_or_above = uc; + } + + s += count; + wp += count; + } + } + + free (wordbreaks); + + if (nf != NULL) + { + /* Finally, normalize the result. */ + UNIT *normalized_result; + + normalized_result = U_NORMALIZE (nf, result, length, resultbuf, lengthp); + if (normalized_result == NULL) + goto fail2; + + free (result); + return normalized_result; + } + + if (length == 0) + { + if (result == NULL) + { + /* Return a non-NULL value. NULL means error. 
*/ + result = (UNIT *) malloc (1); + if (result == NULL) + { + errno = ENOMEM; + goto fail2; + } + } + } + else if (result != resultbuf && length < allocated) + { + /* Shrink the allocated memory if possible. */ + UNIT *memory; + + memory = (UNIT *) realloc (result, length * sizeof (UNIT)); + if (memory != NULL) + result = memory; + } + + *lengthp = length; + return result; + + fail1: + { + int saved_errno = errno; + free (wordbreaks); + errno = saved_errno; + } + fail2: + if (result != resultbuf) + { + int saved_errno = errno; + free (result); + errno = saved_errno; + } + return NULL; +} diff --git a/lib/unicase/u8-ct-totitle.c b/lib/unicase/u8-ct-totitle.c new file mode 100644 --- /dev/null +++ b/lib/unicase/u8-ct-totitle.c @@ -0,0 +1,43 @@ +/* Titlecase mapping for UTF-8 substrings (locale dependent). + Copyright (C) 2009 Free Software Foundation, Inc. + Written by Bruno Haible , 2009. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see . */ + +#include + +/* Specification. 
*/ +#include "unicase.h" + +#include <errno.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdlib.h> + +#include "unistr.h" +#include "unictype.h" +#include "uniwbrk.h" +#include "uninorm.h" +#include "caseprop.h" +#include "context.h" +#include "special-casing.h" + /* Instantiate the generic template in u-ct-totitle.h for UTF-8: FUNC names the function, UNIT is the storage unit, and the U_* macros name the u8_ helpers the template calls. */ +#define FUNC u8_ct_totitle +#define UNIT uint8_t +#define U_WORDBREAKS u8_wordbreaks +#define U_MBTOUC_UNSAFE u8_mbtouc_unsafe +#define U_UCTOMB u8_uctomb +#define U_CPY u8_cpy +#define U_NORMALIZE u8_normalize +#include "u-ct-totitle.h" diff --git a/modules/unicase/u8-ct-totitle b/modules/unicase/u8-ct-totitle new file mode 100644 --- /dev/null +++ b/modules/unicase/u8-ct-totitle @@ -0,0 +1,38 @@ +Description: +Titlecase mapping for UTF-8 substrings (locale dependent). + +Files: +lib/unicase/u8-ct-totitle.c +lib/unicase/u-ct-totitle.h +lib/unicase/context.h + +Depends-on: +unicase/base +unicase/cased +unicase/ignorable +unicase/special-casing +unicase/totitle +unicase/tolower +uniwbrk/u8-wordbreaks +unictype/combining-class +unictype/property-soft-dotted +unistr/u8-mbtouc-unsafe +unistr/u8-uctomb +unistr/u8-cpy +uninorm/u8-normalize +stdbool + +configure.ac: + +Makefile.am: +lib_SOURCES += unicase/u8-ct-totitle.c + +Include: +"unicase.h" + +License: +LGPL + +Maintainer: +Bruno Haible +