# HG changeset patch # User Bruno Haible # Date 1235216294 -3600 # Node ID 44ff96171fda56ef2b74df3a18e0485b6564d506 # Parent 9e0f28c297bf6fc5766727fee0395c4b723c4d54 New module 'uninorm/u8-normalize'. diff --git a/ChangeLog b/ChangeLog --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,11 @@ 2009-02-21 Bruno Haible + New module 'uninorm/u8-normalize'. + * lib/uninorm/u8-normalize.c: New file. + * lib/uninorm/normalize-internal.h: New file. + * lib/uninorm/u-normalize-internal.h: New file. + * modules/uninorm/u8-normalize: New file. + New module 'uninorm/decompose-internal'. * lib/uninorm/decompose-internal.c: New file. * modules/uninorm/decompose-internal: New file. diff --git a/lib/uninorm/normalize-internal.h b/lib/uninorm/normalize-internal.h new file mode 100644 --- /dev/null +++ b/lib/uninorm/normalize-internal.h @@ -0,0 +1,34 @@ +/* Normalization of Unicode strings. + Copyright (C) 2009 Free Software Foundation, Inc. + Written by Bruno Haible , 2009. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see . */ + +#include + +#include "unitypes.h" + +/* Complete definition of normalization form descriptor. */ +struct unicode_normalization_form +{ + /* Bit mask containing meta-information. */ + unsigned int description; + #define NF_IS_COMPAT_DECOMPOSING (1 << 0) + #define NF_IS_COMPOSING (1 << 1) + /* Function that decomposes a Unicode character. */ + int (*decomposer) (ucs4_t uc, ucs4_t *decomposition); + /* Function that combines two Unicode characters, a starter and another + character. */ + ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2); +}; diff --git a/lib/uninorm/u-normalize-internal.h b/lib/uninorm/u-normalize-internal.h new file mode 100644 --- /dev/null +++ b/lib/uninorm/u-normalize-internal.h @@ -0,0 +1,352 @@ +/* Decomposition and composition of Unicode strings. + Copyright (C) 2009 Free Software Foundation, Inc. + Written by Bruno Haible , 2009. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see . */ + +UNIT * +FUNC (uninorm_t nf, const UNIT *s, size_t n, + UNIT *resultbuf, size_t *lengthp) +{ + int (*decomposer) (ucs4_t uc, ucs4_t *decomposition) = nf->decomposer; + ucs4_t (*composer) (ucs4_t uc1, ucs4_t uc2) = nf->composer; + + /* The result being accumulated. */ + UNIT *result; + size_t length; + size_t allocated; + /* The buffer for sorting. */ + #define SORTBUF_PREALLOCATED 64 + struct ucs4_with_ccc sortbuf_preallocated[2 * SORTBUF_PREALLOCATED]; + struct ucs4_with_ccc *sortbuf; /* array of size 2 * sortbuf_allocated */ + size_t sortbuf_allocated; + size_t sortbuf_count; + + /* Initialize the accumulator. */ + if (resultbuf == NULL) + { + result = NULL; + allocated = 0; + } + else + { + result = resultbuf; + allocated = *lengthp; + } + length = 0; + + /* Initialize the buffer for sorting. */ + sortbuf = sortbuf_preallocated; + sortbuf_allocated = SORTBUF_PREALLOCATED; + sortbuf_count = 0; + + { + const UNIT *s_end = s + n; + + for (;;) + { + int count; + ucs4_t decomposed[UC_DECOMPOSITION_MAX_LENGTH]; + int decomposed_count; + int i; + + if (s < s_end) + { + /* Fetch the next character. */ + count = U_MBTOUC_UNSAFE (&decomposed[0], s, s_end - s); + decomposed_count = 1; + + /* Decompose it, recursively. + It would be possible to precompute the recursive decomposition + and store it in a table. But this would significantly increase + the size of the decomposition tables, because for example for + U+1FC1 the recursive canonical decomposition and the recursive + compatibility decomposition are different. */ + { + int curr; + + for (curr = 0; curr < decomposed_count; ) + { + /* Invariant: decomposed[0..curr-1] is fully decomposed, i.e. + all elements are atomic. */ + ucs4_t curr_decomposed[UC_DECOMPOSITION_MAX_LENGTH]; + int curr_decomposed_count; + + curr_decomposed_count = decomposer (decomposed[curr], curr_decomposed); + if (curr_decomposed_count >= 0) + { + /* Move curr_decomposed[0..curr_decomposed_count-1] over + decomposed[curr], making room. It's not worth using + memcpy() here, since the counts are so small. */ + int shift = curr_decomposed_count - 1; + + if (shift < 0) + abort (); + if (shift > 0) + { + int j; + + decomposed_count += shift; + if (decomposed_count > UC_DECOMPOSITION_MAX_LENGTH) + abort (); + for (j = decomposed_count - 1 - shift; j > curr; j--) + decomposed[j + shift] = decomposed[j]; + } + for (; shift >= 0; shift--) + decomposed[curr + shift] = curr_decomposed[shift]; + } + else + { + /* decomposed[curr] is atomic. */ + curr++; + } + } + } + } + else + { + count = 0; + decomposed_count = 0; + } + + i = 0; + for (;;) + { + ucs4_t uc; + int ccc; + + if (s < s_end) + { + /* Fetch the next character from the decomposition. */ + if (i == decomposed_count) + break; + uc = decomposed[i]; + ccc = uc_combining_class (uc); + } + else + { + /* End of string reached. */ + uc = 0; + ccc = 0; + } + + if (ccc == 0) + { + size_t j; + + /* Apply the canonical ordering algorithm to the accumulated + sequence of characters. */ + if (sortbuf_count > 1) + gl_uninorm_decompose_merge_sort_inplace (sortbuf, sortbuf_count, + sortbuf + sortbuf_count); + + if (composer != NULL) + { + /* Attempt to combine decomposed characters, as specified + in the Unicode Standard Annex #15 "Unicode Normalization + Forms". We need to check + 1. whether the first accumulated character is a + "starter" (i.e. has ccc = 0). This is usually the + case. But when the string starts with a + non-starter, the sortbuf also starts with a + non-starter. Btw, this check could also be + omitted, because the composition table has only + entries (code1, code2) for which code1 is a + starter; if the first accumulated character is not + a starter, no lookup will succeed. + 2. If the sortbuf has more than one character, check + for each of these characters that are not "blocked" + from the starter (i.e. have a ccc that is higher + than the ccc of the previous character) whether it + can be combined with the first character. + 3. If only one character is left in sortbuf, check + whether it can be combined with the next character + (also a starter). */ + if (sortbuf_count > 0 && sortbuf[0].ccc == 0) + { + for (j = 1; j < sortbuf_count; ) + { + if (sortbuf[j].ccc > sortbuf[j - 1].ccc) + { + ucs4_t combined = + composer (sortbuf[0].code, sortbuf[j].code); + if (combined) + { + size_t k; + + sortbuf[0].code = combined; + /* sortbuf[0].ccc = 0, still valid. */ + for (k = j + 1; k < sortbuf_count; k++) + sortbuf[k - 1] = sortbuf[k]; + sortbuf_count--; + continue; + } + } + j++; + } + if (s < s_end && sortbuf_count == 1) + { + ucs4_t combined = + composer (sortbuf[0].code, uc); + if (combined) + { + uc = combined; + ccc = 0; + /* uc could be further combined with subsequent + characters. So don't put it into sortbuf[0] in + this round, only in the next round. */ + sortbuf_count = 0; + } + } + } + } + + for (j = 0; j < sortbuf_count; j++) + { + ucs4_t muc = sortbuf[j].code; + + /* Append muc to the result accumulator. */ + if (length < allocated) + { + int ret = + U_UCTOMB (result + length, muc, allocated - length); + if (ret == -1) + { + errno = EINVAL; + goto fail; + } + if (ret >= 0) + { + length += ret; + goto done_appending; + } + } + { + size_t old_allocated = allocated; + size_t new_allocated = 2 * old_allocated; + if (new_allocated < 64) + new_allocated = 64; + if (new_allocated < old_allocated) /* integer overflow? */ + abort (); + { + UNIT *larger_result; + if (result == NULL) + { + larger_result = + (UNIT *) malloc (new_allocated * sizeof (UNIT)); + if (larger_result == NULL) + { + errno = ENOMEM; + goto fail; + } + } + else if (result == resultbuf) + { + larger_result = + (UNIT *) malloc (new_allocated * sizeof (UNIT)); + if (larger_result == NULL) + { + errno = ENOMEM; + goto fail; + } + U_CPY (larger_result, resultbuf, length); + } + else + { + larger_result = + (UNIT *) realloc (result, new_allocated * sizeof (UNIT)); + if (larger_result == NULL) + { + errno = ENOMEM; + goto fail; + } + } + result = larger_result; + allocated = new_allocated; + { + int ret = + U_UCTOMB (result + length, muc, allocated - length); + if (ret == -1) + { + errno = EINVAL; + goto fail; + } + if (ret < 0) + abort (); + length += ret; + goto done_appending; + } + } + } + done_appending: ; + } + + /* sortbuf is now empty. */ + sortbuf_count = 0; + } + + if (!(s < s_end)) + /* End of string reached. */ + break; + + /* Append (uc, ccc) to sortbuf. */ + if (sortbuf_count == sortbuf_allocated) + { + struct ucs4_with_ccc *new_sortbuf; + + sortbuf_allocated = 2 * sortbuf_allocated; + if (sortbuf_allocated < sortbuf_count) /* integer overflow? */ + abort (); + new_sortbuf = + (struct ucs4_with_ccc *) malloc (2 * sortbuf_allocated * sizeof (struct ucs4_with_ccc)); + memcpy (new_sortbuf, sortbuf, + sortbuf_count * sizeof (struct ucs4_with_ccc)); + if (sortbuf != sortbuf_preallocated) + free (sortbuf); + sortbuf = new_sortbuf; + } + sortbuf[sortbuf_count].code = uc; + sortbuf[sortbuf_count].ccc = ccc; + sortbuf_count++; + + i++; + } + + if (!(s < s_end)) + /* End of string reached. */ + break; + + s += count; + } + } + + if (sortbuf_count > 0) + abort (); + if (sortbuf != sortbuf_preallocated) + free (sortbuf); + + *lengthp = length; + return result; + + fail: + { + int saved_errno = errno; + if (sortbuf != sortbuf_preallocated) + free (sortbuf); + if (result != resultbuf) + free (result); + errno = saved_errno; + } + return NULL; +} diff --git a/lib/uninorm/u8-normalize.c b/lib/uninorm/u8-normalize.c new file mode 100644 --- /dev/null +++ b/lib/uninorm/u8-normalize.c @@ -0,0 +1,38 @@ +/* Normalization of UTF-8 strings. + Copyright (C) 2009 Free Software Foundation, Inc. + Written by Bruno Haible , 2009. + + This program is free software: you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see . */ + +#include + +/* Specification. */ +#include "uninorm.h" + +#include +#include +#include +#include + +#include "unistr.h" +#include "unictype.h" +#include "normalize-internal.h" +#include "decompose-internal.h" + +#define FUNC u8_normalize +#define UNIT uint8_t +#define U_MBTOUC_UNSAFE u8_mbtouc_unsafe +#define U_UCTOMB u8_uctomb +#define U_CPY u8_cpy +#include "u-normalize-internal.h" diff --git a/modules/uninorm/u8-normalize b/modules/uninorm/u8-normalize new file mode 100644 --- /dev/null +++ b/modules/uninorm/u8-normalize @@ -0,0 +1,31 @@ +Description: +Normalization of UTF-8 strings. + +Files: +lib/uninorm/u8-normalize.c +lib/uninorm/normalize-internal.h +lib/uninorm/u-normalize-internal.h + +Depends-on: +uninorm/base +unistr/u8-mbtouc-unsafe +unistr/u8-uctomb +unistr/u8-cpy +unictype/combining-class +uninorm/decompose-internal + +configure.ac: +gl_MODULE_INDICATOR([uninorm/u8-normalize]) + +Makefile.am: +lib_SOURCES += uninorm/u8-normalize.c + +Include: +"uninorm.h" + +License: +LGPL + +Maintainer: +Bruno Haible +