# HG changeset patch # User Bruno Haible # Date 1049394907 0 # Node ID e272bc90178517f0629e7dbe641f15ed0f15d699 # Parent c7ca3dfda644aad195c288faf8064584b9ce8c84 New modules: utf8-ucs4, utf16-ucs4, ucs4-utf8, ucs4-utf16. diff --git a/ChangeLog b/ChangeLog --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2003-04-03 Bruno Haible + + * modules/utf8-ucs4: New file. + * modules/utf16-ucs4: New file. + * modules/ucs4-utf8: New file. + * modules/ucs4-utf16: New file. + * MODULES.html.sh (func_all_modules): Add them. + 2003-04-02 Bruno Haible * modules/binary-io: New file. diff --git a/MODULES.html.sh b/MODULES.html.sh --- a/MODULES.html.sh +++ b/MODULES.html.sh @@ -1824,10 +1824,10 @@ func_module unicodeio func_module rpmatch func_module yesno - #func_module ucs4-utf8 - #func_module ucs4-utf16 - #func_module utf8-ucs4 - #func_module utf16-ucs4 + func_module ucs4-utf8 + func_module ucs4-utf16 + func_module utf8-ucs4 + func_module utf16-ucs4 #func_module linebreak func_end_table diff --git a/lib/ChangeLog b/lib/ChangeLog --- a/lib/ChangeLog +++ b/lib/ChangeLog @@ -1,3 +1,10 @@ +2003-04-03 Bruno Haible + + * utf8-ucs4.h: New file, from GNU gettext. + * utf16-ucs4.h: New file, from GNU gettext. + * ucs4-utf8.h: New file, from GNU gettext. + * ucs4-utf16.h: New file, from GNU gettext. + 2003-04-02 Bruno Haible * binary-io.h: New file, from GNU gettext. diff --git a/lib/ucs4-utf16.h b/lib/ucs4-utf16.h new file mode 100644 --- /dev/null +++ b/lib/ucs4-utf16.h @@ -0,0 +1,55 @@ +/* Conversion UCS-4 to UTF-16. + Copyright (C) 2002 Free Software Foundation, Inc. + Written by Bruno Haible , 2002. + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. 
/* Slow path of u16_uctomb; it is also correct when called on its own.
   Store the UTF-16 representation of UC at S, using at most N units.
   Return the number of units stored (1 or 2).
   Return -1 if UC has no UTF-16 representation: it is a surrogate code
   point (0xD800..0xDFFF) or lies above 0x10FFFF.
   Return -2 if N is too small for the units that would be needed.
   Nothing is written to S when the result is negative.  */
static int
u16_uctomb_aux (unsigned short *s, unsigned int uc, int n)
{
  /* A surrogate value written as a single unit would be an unpaired
     surrogate, i.e. ill-formed UTF-16, so reject it instead of storing it.  */
  if (uc >= 0xd800 && uc < 0xe000)
    return -1;
  if (uc >= 0x110000)
    return -1;

  if (uc < 0x10000)
    {
      /* One unit suffices.  u16_uctomb only gets here when N <= 0 or when
         UC is in 0xE000..0xFFFF.  */
      if (n < 1)
        return -2;
      s[0] = (unsigned short) uc;
      return 1;
    }

  /* Supplementary plane: high surrogate carries the top 10 bits of
     UC - 0x10000, low surrogate the bottom 10 bits.  */
  if (n < 2)
    return -2;
  uc -= 0x10000;
  s[0] = (unsigned short) (0xd800 + (uc >> 10));
  s[1] = (unsigned short) (0xdc00 + (uc & 0x3ff));
  return 2;
}

/* Return the length (number of units) of the UTF-16 representation of UC,
   after storing it at S.  Return -1 upon failure (UC is a surrogate code
   point or is above 0x10FFFF), -2 if the number of available units, N, is
   too small.  */
static inline int
u16_uctomb (unsigned short *s, unsigned int uc, int n)
{
  /* Fast path: everything below the surrogate range is exactly one unit.  */
  if (uc < 0xd800 && n > 0)
    {
      s[0] = (unsigned short) uc;
      return 1;
    }
  return u16_uctomb_aux (s, uc, n);
}
/* Slow path of u8_uctomb; it is also correct when called on its own.
   Store the UTF-8 representation of UC at S, using at most N bytes.
   Return the number of bytes stored (1 to 4).
   Return -1 if UC has no UTF-8 representation: it is a surrogate code
   point (0xD800..0xDFFF) or lies above 0x10FFFF.
   Return -2 if N is too small for the bytes that would be needed.
   Nothing is written to S when the result is negative.  */
static int
u8_uctomb_aux (unsigned char *s, unsigned int uc, int n)
{
  /* Marker bits of the lead byte, indexed by sequence length.  */
  static const unsigned char lead_bits[] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0 };
  int count;
  int i;

  if (uc < 0x80)
    count = 1;
  else if (uc < 0x800)
    count = 2;
  else if (uc < 0x10000)
    {
      /* Encoding a surrogate code point would yield a byte sequence that
         conforming UTF-8 decoders must reject.  */
      if (uc >= 0xd800 && uc < 0xe000)
        return -1;
      count = 3;
    }
  else if (uc < 0x110000)
    count = 4;
  else
    return -1;

  if (n < count)
    return -2;

  /* Fill the continuation bytes from the end, six payload bits each; what
     remains of UC afterwards belongs in the lead byte.  */
  for (i = count - 1; i > 0; i--)
    {
      s[i] = (unsigned char) (0x80 | (uc & 0x3f));
      uc >>= 6;
    }
  s[0] = (unsigned char) (lead_bits[count] | uc);
  return count;
}

/* Return the length (number of units) of the UTF-8 representation of UC,
   after storing it at S.  Return -1 upon failure (UC is a surrogate code
   point or is above 0x10FFFF), -2 if the number of available units, N, is
   too small.  */
static inline int
u8_uctomb (unsigned char *s, unsigned int uc, int n)
{
  /* Fast path: ASCII is stored unchanged in one byte.  */
  if (uc < 0x80 && n > 0)
    {
      s[0] = (unsigned char) uc;
      return 1;
    }
  return u8_uctomb_aux (s, uc, n);
}
#include <stddef.h>

/* Slow path of u16_mbtouc, for the case that S[0] is a surrogate.
   The caller must guarantee N >= 1.
   Return the length (number of units) of the first character in S, putting
   its 'ucs4_t' representation in *PUC:
     - a high surrogate followed by a low surrogate yields the combined
       code point and length 2;
     - a high surrogate that is the last available unit (N == 1) is an
       incomplete character: *PUC is 0xfffd and the result is N;
     - anything else is invalid: *PUC is 0xfffd and the result is 1.  */
static int
u16_mbtouc_aux (unsigned int *puc, const unsigned short *s, size_t n)
{
  unsigned short c = s[0];

  if (c < 0xdc00)
    {
      /* High surrogate: it needs a low surrogate right behind it.  */
      if (n < 2)
        {
          /* incomplete multibyte character */
          *puc = 0xfffd;
          return (int) n;
        }
      if (s[1] >= 0xdc00 && s[1] < 0xe000)
        {
          *puc = 0x10000 + ((unsigned int) (c - 0xd800) << 10)
                 + (unsigned int) (s[1] - 0xdc00);
          return 2;
        }
    }

  /* invalid multibyte character */
  *puc = 0xfffd;
  return 1;
}

/* Return the length (number of units) of the first character in S, putting
   its 'ucs4_t' representation in *PUC.  At most N units of S are examined.
   Invalid or incomplete input puts 0xfffd in *PUC; see u16_mbtouc_aux for
   the lengths returned in those cases.  If N is 0, S is not read, *PUC is
   0xfffd and the result is 0.  */
static inline int
u16_mbtouc (unsigned int *puc, const unsigned short *s, size_t n)
{
  unsigned short c;

  /* With no units available there is nothing that may be read.  */
  if (n == 0)
    {
      *puc = 0xfffd;
      return 0;
    }

  c = s[0];
  if (c < 0xd800 || c >= 0xe000)
    {
      *puc = c;
      return 1;
    }
  return u16_mbtouc_aux (puc, s, n);
}
#include <stddef.h>

/* Slow path of u8_mbtouc, for the case that S[0] is not ASCII.
   The caller must guarantee N >= 1.
   Return the length (number of units) of the first character in S, putting
   its 'ucs4_t' representation in *PUC:
     - a well-formed 2, 3 or 4 byte sequence yields its code point and its
       length;
     - a lead byte that announces more bytes than the N available is an
       incomplete character: *PUC is 0xfffd and the result is N;
     - anything else is invalid: *PUC is 0xfffd and the result is 1.
   Rejected as invalid: stray continuation bytes, the lead bytes 0xc0 and
   0xc1, lead bytes 0xf8 and above, overlong forms, sequences that would
   decode to a surrogate code point (0xD800..0xDFFF), and sequences that
   would decode to a value above 0x10FFFF.  */
static int
u8_mbtouc_aux (unsigned int *puc, const unsigned char *s, size_t n)
{
  unsigned char c = s[0];

  if (c >= 0xc2 && c < 0xe0)
    {
      /* Two-byte sequence.  */
      if (n < 2)
        {
          /* incomplete multibyte character */
          *puc = 0xfffd;
          return (int) n;
        }
      if ((s[1] ^ 0x80) < 0x40)
        {
          *puc = ((unsigned int) (c & 0x1f) << 6)
                 | (unsigned int) (s[1] ^ 0x80);
          return 2;
        }
    }
  else if (c >= 0xe0 && c < 0xf0)
    {
      /* Three-byte sequence.  */
      if (n < 3)
        {
          /* incomplete multibyte character */
          *puc = 0xfffd;
          return (int) n;
        }
      if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
          /* E0 80..9F would be an overlong form.  */
          && (c >= 0xe1 || s[1] >= 0xa0)
          /* ED A0..BF would decode to a surrogate code point.  */
          && (c != 0xed || s[1] < 0xa0))
        {
          *puc = ((unsigned int) (c & 0x0f) << 12)
                 | ((unsigned int) (s[1] ^ 0x80) << 6)
                 | (unsigned int) (s[2] ^ 0x80);
          return 3;
        }
    }
  else if (c >= 0xf0 && c < 0xf8)
    {
      /* Four-byte sequence.  */
      if (n < 4)
        {
          /* incomplete multibyte character */
          *puc = 0xfffd;
          return (int) n;
        }
      if ((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40
          && (s[3] ^ 0x80) < 0x40
          /* F0 80..8F would be an overlong form.  */
          && (c >= 0xf1 || s[1] >= 0x90)
          /* Nothing above 0x10FFFF, i.e. above F4 8F BF BF.  */
          && (c < 0xf4 || (c == 0xf4 && s[1] < 0x90)))
        {
          *puc = ((unsigned int) (c & 0x07) << 18)
                 | ((unsigned int) (s[1] ^ 0x80) << 12)
                 | ((unsigned int) (s[2] ^ 0x80) << 6)
                 | (unsigned int) (s[3] ^ 0x80);
          return 4;
        }
    }

  /* invalid multibyte character */
  *puc = 0xfffd;
  return 1;
}

/* Return the length (number of units) of the first character in S, putting
   its 'ucs4_t' representation in *PUC.  At most N bytes of S are examined.
   Invalid or incomplete input puts 0xfffd in *PUC; see u8_mbtouc_aux for
   the lengths returned in those cases.  If N is 0, S is not read, *PUC is
   0xfffd and the result is 0.  */
static inline int
u8_mbtouc (unsigned int *puc, const unsigned char *s, size_t n)
{
  unsigned char c;

  /* With no bytes available there is nothing that may be read.  */
  if (n == 0)
    {
      *puc = 0xfffd;
      return 0;
    }

  c = s[0];
  if (c < 0x80)
    {
      *puc = c;
      return 1;
    }
  return u8_mbtouc_aux (puc, s, n);
}
+ AC_REQUIRE([AC_C_INLINE]) +]) diff --git a/m4/utf-ucs4.m4 b/m4/utf-ucs4.m4 new file mode 100644 --- /dev/null +++ b/m4/utf-ucs4.m4 @@ -0,0 +1,13 @@ +# utf-ucs4.m4 serial 1 +dnl Copyright (C) 2003 Free Software Foundation, Inc. +dnl This file is free software, distributed under the terms of the GNU +dnl General Public License. As a special exception to the GNU General +dnl Public License, this file may be distributed as part of a program +dnl that contains a configuration script generated by Autoconf, under +dnl the same distribution terms as the rest of that program. + +AC_DEFUN([gl_UTF_UCS4], +[ + dnl Prerequisites of lib/utf8-ucs4.h, lib/utf16-ucs4.h. + AC_REQUIRE([AC_C_INLINE]) +]) diff --git a/modules/ucs4-utf16 b/modules/ucs4-utf16 new file mode 100644 --- /dev/null +++ b/modules/ucs4-utf16 @@ -0,0 +1,20 @@ +Description: +Conversion UCS-4 to UTF-16. + +Files: +lib/ucs4-utf16.h +m4/ucs4-utf.m4 + +Depends-on: + +configure.ac: +gl_UCS4_UTF + +Makefile.am: +lib_SOURCES += ucs4-utf16.h + +Include: + +Maintainer: +Bruno Haible + diff --git a/modules/ucs4-utf8 b/modules/ucs4-utf8 new file mode 100644 --- /dev/null +++ b/modules/ucs4-utf8 @@ -0,0 +1,20 @@ +Description: +Conversion UCS-4 to UTF-8. + +Files: +lib/ucs4-utf8.h +m4/ucs4-utf.m4 + +Depends-on: + +configure.ac: +gl_UCS4_UTF + +Makefile.am: +lib_SOURCES += ucs4-utf8.h + +Include: + +Maintainer: +Bruno Haible + diff --git a/modules/utf16-ucs4 b/modules/utf16-ucs4 new file mode 100644 --- /dev/null +++ b/modules/utf16-ucs4 @@ -0,0 +1,20 @@ +Description: +Conversion UTF-16 to UCS-4. + +Files: +lib/utf16-ucs4.h +m4/utf-ucs4.m4 + +Depends-on: + +configure.ac: +gl_UTF_UCS4 + +Makefile.am: +lib_SOURCES += utf16-ucs4.h + +Include: + +Maintainer: +Bruno Haible + diff --git a/modules/utf8-ucs4 b/modules/utf8-ucs4 new file mode 100644 --- /dev/null +++ b/modules/utf8-ucs4 @@ -0,0 +1,20 @@ +Description: +Conversion UTF-8 to UCS-4. 
+ +Files: +lib/utf8-ucs4.h +m4/utf-ucs4.m4 + +Depends-on: + +configure.ac: +gl_UTF_UCS4 + +Makefile.am: +lib_SOURCES += utf8-ucs4.h + +Include: + +Maintainer: +Bruno Haible +