Mercurial > hg > octave-lojdl > gnulib-hg
changeset 7793:f5b4e66aa08b
New module 'utf8-ucs4-safe'.
author | Bruno Haible <bruno@clisp.org> |
---|---|
date | Mon, 08 Jan 2007 20:37:38 +0000 |
parents | 00f3d4842cf6 |
children | 3b5072f63d1b |
files | ChangeLog lib/unistr/utf8-ucs4-safe.c lib/utf8-ucs4-safe.h modules/utf8-ucs4-safe |
diffstat | 4 files changed, 232 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +2006-12-25 Bruno Haible <bruno@clisp.org> + + * modules/utf8-ucs4-safe: New file. + * lib/utf8-ucs4-safe.h: New file. + * lib/unistr/utf8-ucs4-safe.c: New file. + 2007-01-08 Bruno Haible <bruno@clisp.org> * modules/utf8-ucs4 (Files, lib_SOURCES): Add unistr/utf8-ucs4.c.
new file mode 100644 --- /dev/null +++ b/lib/unistr/utf8-ucs4-safe.c @@ -0,0 +1,156 @@ +/* Conversion UTF-8 to UCS-4. + Copyright (C) 2001-2002, 2006 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2001. + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU Library General Public License as published + by the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, + USA. */ + +#include <config.h> + +/* Specification. */ +#include "utf8-ucs4-safe.h" + +int +u8_mbtouc_safe_aux (ucs4_t *puc, const uint8_t *s, size_t n) +{ + uint8_t c = *s; + + if (c >= 0xc2) + { + if (c < 0xe0) + { + if (n >= 2) + { + if ((s[1] ^ 0x80) < 0x40) + { + *puc = ((unsigned int) (c & 0x1f) << 6) + | (unsigned int) (s[1] ^ 0x80); + return 2; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return n; + } + } + else if (c < 0xf0) + { + if (n >= 3) + { + if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 + && (c >= 0xe1 || s[1] >= 0xa0) + && (c != 0xed || s[1] < 0xa0)) + { + *puc = ((unsigned int) (c & 0x0f) << 12) + | ((unsigned int) (s[1] ^ 0x80) << 6) + | (unsigned int) (s[2] ^ 0x80); + return 3; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return n; + } + } + else if (c < 0xf8) + { + if (n >= 4) + { + if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 + && (s[3] ^ 0x80) < 0x40 + && (c >= 0xf1 || s[1] >= 0x90) +#if 1 + && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)) +#endif + ) + { + *puc = ((unsigned int) (c & 0x07) << 18) + | ((unsigned int) (s[1] ^ 0x80) << 12) + | ((unsigned int) (s[2] ^ 0x80) << 6) + | (unsigned int) (s[3] ^ 0x80); + return 4; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return n; + } + } +#if 0 + else if (c < 0xfc) + { + if (n >= 5) + { + if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 + && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 + && (c >= 0xf9 || s[1] >= 0x88)) + { + *puc = ((unsigned int) (c & 0x03) << 24) + | ((unsigned int) (s[1] ^ 0x80) << 18) + | ((unsigned int) (s[2] ^ 0x80) << 12) + | ((unsigned int) (s[3] ^ 0x80) << 6) + | (unsigned int) (s[4] ^ 0x80); + return 5; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return n; + } + } + else if (c < 0xfe) + { + if (n >= 6) + { + if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 + && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 + && (s[5] ^ 0x80) < 0x40 + && (c >= 0xfd || s[1] >= 0x84)) + { + *puc = ((unsigned int) (c & 0x01) << 30) + | ((unsigned int) (s[1] ^ 0x80) << 24) + | ((unsigned int) (s[2] ^ 0x80) << 18) + | ((unsigned int) (s[3] ^ 0x80) << 12) + | ((unsigned int) (s[4] ^ 0x80) << 6) + | (unsigned int) (s[5] ^ 0x80); + return 6; + } + /* invalid multibyte character */ + } + else + { + /* incomplete multibyte character */ + *puc = 0xfffd; + return n; + } + } +#endif + } + /* invalid multibyte character */ + *puc = 0xfffd; + return 1; +}
new file mode 100644 --- /dev/null +++ b/lib/utf8-ucs4-safe.h @@ -0,0 +1,45 @@ +/* Conversion UTF-8 to UCS-4. + Copyright (C) 2001-2002, 2005-2007 Free Software Foundation, Inc. + Written by Bruno Haible <bruno@clisp.org>, 2001. + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU Library General Public License as published + by the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, + USA. */ + +#ifndef _UTF8_UCS4_SAFE_H +#define _UTF8_UCS4_SAFE_H + +#include <stddef.h> +#include "unitypes.h" + +extern int u8_mbtouc_safe_aux (ucs4_t *puc, const uint8_t *s, size_t n); + +/* Return the length (number of units) of the first character in S, putting + its 'ucs4_t' representation in *PUC. + The number of available units, N, must be > 0. */ +static inline int +u8_mbtouc_safe (ucs4_t *puc, const uint8_t *s, size_t n) +{ + uint8_t c = *s; + + if (c < 0x80) + { + *puc = c; + return 1; + } + else + return u8_mbtouc_safe_aux (puc, s, n); +} + +#endif /* _UTF8_UCS4_SAFE_H */
new file mode 100644 --- /dev/null +++ b/modules/utf8-ucs4-safe @@ -0,0 +1,25 @@ +Description: +Conversion UTF-8 to UCS-4. + +Files: +lib/utf8-ucs4-safe.h +lib/unistr/utf8-ucs4-safe.c +m4/utf-ucs4.m4 + +Depends-on: + +configure.ac: +gl_UTF_UCS4 + +Makefile.am: +lib_SOURCES += utf8-ucs4-safe.h unistr/utf8-ucs4-safe.c + +Include: +"utf8-ucs4-safe.h" + +License: +LGPL + +Maintainer: +Bruno Haible +