1
0
mirror of https://git.FreeBSD.org/src.git synced 2024-12-16 10:20:30 +00:00

Add libiconv based versions of *c16*() and *c32*().

I initially thought wchar_t was locale independent, but this seems to be
only the case on Linux. This means that we cannot depend on the *wc*()
routines to implement *c16*() and *c32*(). Instead, use the Citrus
libiconv that is part of libc.

I'll see if there is anything I can do to make the existing functions
somewhat useful in case the system is built without libiconv in the
near future. If not, I'll simply remove the broken implementations.

Reviewed by:	jilles, gabor
This commit is contained in:
Ed Schouten 2013-06-03 17:17:56 +00:00
parent f8ca2db1f8
commit 49111f0092
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=251314
9 changed files with 389 additions and 3 deletions

View File

@ -4,11 +4,11 @@
# locale sources
.PATH: ${.CURDIR}/${LIBC_ARCH}/locale ${.CURDIR}/locale
SRCS+= ascii.c big5.c btowc.c c16rtomb.c c32rtomb.c collate.c collcmp.c euc.c \
fix_grouping.c gb18030.c gb2312.c gbk.c ctype.c isctype.c iswctype.c \
SRCS+= ascii.c big5.c btowc.c collate.c collcmp.c euc.c fix_grouping.c \
gb18030.c gb2312.c gbk.c ctype.c isctype.c iswctype.c \
ldpart.c lmessages.c lmonetary.c lnumeric.c localeconv.c mblen.c \
mbrlen.c \
mbrtoc16.c mbrtoc32.c mbrtowc.c mbsinit.c mbsnrtowcs.c \
mbrtowc.c mbsinit.c mbsnrtowcs.c \
mbsrtowcs.c mbtowc.c mbstowcs.c \
mskanji.c nextwctype.c nl_langinfo.c nomacros.c none.c rpmatch.c \
rune.c \
@ -23,6 +23,12 @@ SRCS+= ascii.c big5.c btowc.c c16rtomb.c c32rtomb.c collate.c collcmp.c euc.c \
wcwidth.c\
xlocale.c
.if ${MK_ICONV} != "no"
SRCS+= c16rtomb_iconv.c c32rtomb_iconv.c mbrtoc16_iconv.c mbrtoc32_iconv.c
.else
SRCS+= c16rtomb.c c32rtomb.c mbrtoc16.c mbrtoc32.c
.endif
SYM_MAPS+=${.CURDIR}/locale/Symbol.map
MAN+= btowc.3 \

View File

@ -0,0 +1,8 @@
/* $FreeBSD$ */
/*
 * char16_t parameters for the generic code in cXXrtomb_iconv.h: the code
 * unit type, the names of the two functions to generate, the number of
 * code units the source buffer holds, and the iconv name of the source
 * encoding.
 */
#define charXX_t char16_t
#define cXXrtomb c16rtomb
#define cXXrtomb_l c16rtomb_l
#define SRCBUF_LEN 2
#define UTF_XX_INTERNAL "UTF-16-INTERNAL"
#include "cXXrtomb_iconv.h"

View File

@ -0,0 +1,8 @@
/* $FreeBSD$ */
/*
 * char32_t parameters for the generic code in cXXrtomb_iconv.h: the code
 * unit type, the names of the two functions to generate, the number of
 * code units the source buffer holds, and the iconv name of the source
 * encoding.
 */
#define charXX_t char32_t
#define cXXrtomb c32rtomb
#define cXXrtomb_l c32rtomb_l
#define SRCBUF_LEN 1
#define UTF_XX_INTERNAL "UTF-32-INTERNAL"
#include "cXXrtomb_iconv.h"

View File

@ -0,0 +1,115 @@
/*-
* Copyright (c) 2013 Ed Schouten <ed@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/queue.h>
#include <assert.h>
#include <errno.h>
#include <langinfo.h>
#include <uchar.h>
#include "../iconv/citrus_hash.h"
#include "../iconv/citrus_module.h"
#include "../iconv/citrus_iconv.h"
#include "xlocale_private.h"
/*
 * Conversion state for cXXrtomb_l().  The function casts its mbstate_t
 * pointer to this type, so the whole structure lives inside the caller's
 * (or the locale's) mbstate_t object.
 */
typedef struct {
/* Set to true once _citrus_iconv_open() has succeeded on `iconv`. */
bool initialized;
/* Embedded iconv handle used for open and convert calls. */
struct _citrus_iconv iconv;
/*
 * Code units collected so far; `bytes` is the same storage viewed as the
 * char buffer that is handed to the converter.
 */
union {
charXX_t widechar[SRCBUF_LEN];
char bytes[sizeof(charXX_t) * SRCBUF_LEN];
} srcbuf;
/* Number of code units currently stored in srcbuf.widechar. */
size_t srcbuf_len;
} _ConversionState;
/* The state is stored in place of an mbstate_t, so it has to fit in one. */
_Static_assert(sizeof(_ConversionState) <= sizeof(mbstate_t),
"Size of _ConversionState must not exceed mbstate_t's size.");
/*
 * Convert one charXX_t code unit to the multibyte encoding of `locale`,
 * using an iconv handle that is kept inside the mbstate_t object.
 *
 * s      - output buffer; the converter is given MB_CUR_MAX_L(locale)
 *          bytes of room in it.  If NULL, the conversion state is
 *          reinitialized and 1 is returned without converting `c`.
 * c      - code unit to convert.
 * ps     - conversion state; if NULL, locale->cXXrtomb is used instead.
 * locale - locale whose CODESET names the target encoding.
 *
 * Returns the number of bytes written to `s`; 0 when the code unit was
 * only buffered because the converter needs more input; or (size_t)-1
 * on failure, with errno set to EINVAL when the converter could not be
 * opened and to EILSEQ when the conversion produced no output.
 */
size_t
cXXrtomb_l(char * __restrict s, charXX_t c, mbstate_t * __restrict ps,
    locale_t locale)
{
	_ConversionState *cs;
	struct _citrus_iconv *handle;
	char *src, *dst;
	size_t srcleft, dstleft, invlen;
	int err;

	FIX_LOCALE(locale);
	if (ps == NULL)
		ps = &locale->cXXrtomb;
	cs = (_ConversionState *)ps;
	handle = &cs->iconv;

	/* Reinitialize mbstate_t. */
	if (s == NULL || !cs->initialized) {
		if (_citrus_iconv_open(&handle, UTF_XX_INTERNAL,
		    nl_langinfo_l(CODESET, locale)) != 0) {
			cs->initialized = false;
			errno = EINVAL;
			/* Same error sentinel as the EILSEQ path below. */
			return ((size_t)-1);
		}
		handle->cv_shared->ci_discard_ilseq = true;
		handle->cv_shared->ci_hooks = NULL;
		cs->srcbuf_len = 0;
		cs->initialized = true;
		if (s == NULL)
			return (1);
	}

	/* Append the new code unit to whatever is already pending. */
	assert(cs->srcbuf_len < sizeof(cs->srcbuf.widechar) / sizeof(charXX_t));
	cs->srcbuf.widechar[cs->srcbuf_len++] = c;

	/* Perform conversion. */
	src = cs->srcbuf.bytes;
	srcleft = cs->srcbuf_len * sizeof(charXX_t);
	dst = s;
	dstleft = MB_CUR_MAX_L(locale);
	err = _citrus_iconv_convert(handle, &src, &srcleft, &dst, &dstleft,
	    0, &invlen);

	/* Character is part of a surrogate pair. We need more input. */
	if (err == EINVAL)
		return (0);

	/* Whatever happened, the pending code units are now used up. */
	cs->srcbuf_len = 0;

	/* Illegal sequence. */
	if (dst == s) {
		errno = EILSEQ;
		return ((size_t)-1);
	}
	return (dst - s);
}
/*
 * Same conversion as cXXrtomb_l(), performed for the locale returned by
 * __get_locale().
 */
size_t
cXXrtomb(char * __restrict s, charXX_t c, mbstate_t * __restrict ps)
{
	size_t nbytes;

	nbytes = cXXrtomb_l(s, c, ps, __get_locale());
	return (nbytes);
}

View File

@ -0,0 +1,8 @@
/* $FreeBSD$ */
/*
 * char16_t parameters for the generic code in mbrtocXX_iconv.h: the code
 * unit type, the names of the two functions to generate, the number of
 * code units the destination buffer holds, and the iconv name of the
 * destination encoding.
 */
#define charXX_t char16_t
#define mbrtocXX mbrtoc16
#define mbrtocXX_l mbrtoc16_l
#define DSTBUF_LEN 2
#define UTF_XX_INTERNAL "UTF-16-INTERNAL"
#include "mbrtocXX_iconv.h"

View File

@ -0,0 +1,8 @@
/* $FreeBSD$ */
/*
 * char32_t parameters for the generic code in mbrtocXX_iconv.h: the code
 * unit type, the names of the two functions to generate, the number of
 * code units the destination buffer holds, and the iconv name of the
 * destination encoding.
 */
#define charXX_t char32_t
#define mbrtocXX mbrtoc32
#define mbrtocXX_l mbrtoc32_l
#define DSTBUF_LEN 1
#define UTF_XX_INTERNAL "UTF-32-INTERNAL"
#include "mbrtocXX_iconv.h"

View File

@ -0,0 +1,158 @@
/*-
* Copyright (c) 2013 Ed Schouten <ed@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/queue.h>
#include <assert.h>
#include <errno.h>
#include <langinfo.h>
#include <limits.h>
#include <string.h>
#include <uchar.h>
#include "../iconv/citrus_hash.h"
#include "../iconv/citrus_module.h"
#include "../iconv/citrus_iconv.h"
#include "xlocale_private.h"
/*
 * Conversion state for mbrtocXX_l().  The function casts its mbstate_t
 * pointer to this type, so the whole structure lives inside the caller's
 * (or the locale's) mbstate_t object.
 */
typedef struct {
/* Set to true once _citrus_iconv_open() has succeeded on `iconv`. */
bool initialized;
/* Embedded iconv handle used for open and convert calls. */
struct _citrus_iconv iconv;
/* Multibyte input: bytes kept from earlier calls plus the newly copied ones. */
char srcbuf[MB_LEN_MAX];
/* Number of bytes in srcbuf carried over from earlier calls. */
size_t srcbuf_len;
/*
 * Converted output; `bytes` is the same storage viewed as the char buffer
 * that is handed to the converter.
 */
union {
charXX_t widechar[DSTBUF_LEN];
char bytes[sizeof(charXX_t) * DSTBUF_LEN];
} dstbuf;
/* Number of code units in dstbuf.widechar not yet returned to the caller. */
size_t dstbuf_len;
} _ConversionState;
/* The state is stored in place of an mbstate_t, so it has to fit in one. */
_Static_assert(sizeof(_ConversionState) <= sizeof(mbstate_t),
"Size of _ConversionState must not exceed mbstate_t's size.");
/*
 * Convert multibyte input in the encoding of `locale` to charXX_t code
 * units, returning one code unit per call.  The iconv handle and all
 * buffering are kept inside the mbstate_t object.
 *
 * pc     - where to store the resulting code unit; may be NULL, in which
 *          case the code unit is consumed but not stored.
 * s      - input bytes.  If NULL, the conversion state is reinitialized
 *          and 0 is returned.
 * n      - number of bytes available at `s`; at most the free space left
 *          in the internal source buffer is examined.
 * ps     - conversion state; if NULL, locale->mbrtocXX is used instead.
 * locale - locale whose CODESET names the source encoding.
 *
 * Return values:
 *   0            `s` was NULL, or the code unit produced is zero.
 *   (size_t)-1   failure; errno is EINVAL when the converter could not
 *                be opened and EILSEQ for an invalid input sequence.
 *   (size_t)-2   the input was saved in the state but does not yet form
 *                a complete character.
 *   (size_t)-3   a code unit left over from a previous call was
 *                returned; no input was consumed.
 *   otherwise    number of bytes of `s` consumed to produce the code
 *                unit.
 */
size_t
mbrtocXX_l(charXX_t * __restrict pc, const char * __restrict s, size_t n,
    mbstate_t * __restrict ps, locale_t locale)
{
	_ConversionState *cs;
	struct _citrus_iconv *handle;
	size_t i, retval;
	charXX_t retchar;

	FIX_LOCALE(locale);
	if (ps == NULL)
		ps = &locale->mbrtocXX;
	cs = (_ConversionState *)ps;
	handle = &cs->iconv;

	/* Reinitialize mbstate_t. */
	if (s == NULL || !cs->initialized) {
		if (_citrus_iconv_open(&handle,
		    nl_langinfo_l(CODESET, locale), UTF_XX_INTERNAL) != 0) {
			cs->initialized = false;
			errno = EINVAL;
			/* Same error sentinel as the EILSEQ path below. */
			return ((size_t)-1);
		}
		handle->cv_shared->ci_discard_ilseq = true;
		handle->cv_shared->ci_hooks = NULL;
		cs->srcbuf_len = cs->dstbuf_len = 0;
		cs->initialized = true;
		if (s == NULL)
			return (0);
	}

	/* See if we still have characters left from the previous invocation. */
	if (cs->dstbuf_len > 0) {
		retval = (size_t)-3;
		goto return_char;
	}

	/* Fill up the read buffer as far as possible. */
	if (n > sizeof(cs->srcbuf) - cs->srcbuf_len)
		n = sizeof(cs->srcbuf) - cs->srcbuf_len;
	memcpy(cs->srcbuf + cs->srcbuf_len, s, n);

	/*
	 * Convert as few characters to the dst buffer as possible: start
	 * with no output space and add room for one code unit per pass.
	 */
	for (i = 0; ; i++) {
		char *src, *dst;
		size_t srcleft, dstleft, invlen;
		int err;

		src = cs->srcbuf;
		srcleft = cs->srcbuf_len + n;
		dst = cs->dstbuf.bytes;
		dstleft = i * sizeof(charXX_t);
		assert(srcleft <= sizeof(cs->srcbuf) &&
		    dstleft <= sizeof(cs->dstbuf.bytes));
		err = _citrus_iconv_convert(handle, &src, &srcleft,
		    &dst, &dstleft, 0, &invlen);
		cs->dstbuf_len = (dst - cs->dstbuf.bytes) / sizeof(charXX_t);

		/* Got new character(s). Return the first. */
		if (cs->dstbuf_len > 0) {
			assert(src - cs->srcbuf > cs->srcbuf_len);
			/* Do not count bytes carried over from earlier calls. */
			retval = src - cs->srcbuf - cs->srcbuf_len;
			cs->srcbuf_len = 0;
			goto return_char;
		}

		/* Increase dst buffer size, to obtain the surrogate pair. */
		if (err == E2BIG)
			continue;

		/* Illegal sequence. */
		if (invlen > 0) {
			cs->srcbuf_len = 0;
			errno = EILSEQ;
			return ((size_t)-1);
		}

		/* Save unprocessed remainder for the next invocation. */
		memmove(cs->srcbuf, src, srcleft);
		cs->srcbuf_len = srcleft;
		return ((size_t)-2);
	}

return_char:
	/* Pop the first buffered code unit and shift the rest down. */
	retchar = cs->dstbuf.widechar[0];
	memmove(&cs->dstbuf.widechar[0], &cs->dstbuf.widechar[1],
	    --cs->dstbuf_len * sizeof(charXX_t));
	if (pc != NULL)
		*pc = retchar;
	if (retchar == 0)
		return (0);
	return (retval);
}
/*
 * Same conversion as mbrtocXX_l(), performed for the locale returned by
 * __get_locale().
 */
size_t
mbrtocXX(charXX_t * __restrict pc, const char * __restrict s, size_t n,
    mbstate_t * __restrict ps)
{
	size_t result;

	result = mbrtocXX_l(pc, s, n, ps, __get_locale());
	return (result);
}

View File

@ -82,6 +82,34 @@ main(int argc, char *argv[])
assert(c16rtomb(buf, 0xd83d, &s) == 0);
assert(c16rtomb(buf, 0xdca9, &s) == (size_t)-1);
assert(errno == EILSEQ);
assert((unsigned char)buf[0] == 0xcc);
/*
* ISO8859-1.
*/
assert(strcmp(setlocale(LC_CTYPE, "en_US.ISO8859-1"),
"en_US.ISO8859-1") == 0);
/* Unicode character 'Euro sign'. */
memset(&s, 0, sizeof(s));
memset(buf, 0xcc, sizeof(buf));
assert(c16rtomb(buf, 0x20ac, &s) == (size_t)-1);
assert(errno == EILSEQ);
assert((unsigned char)buf[0] == 0xcc);
/*
* ISO8859-15.
*/
assert(strcmp(setlocale(LC_CTYPE, "en_US.ISO8859-15"),
"en_US.ISO8859-15") == 0);
/* Unicode character 'Euro sign'. */
memset(&s, 0, sizeof(s));
memset(buf, 0xcc, sizeof(buf));
assert(c16rtomb(buf, 0x20ac, &s) == 1);
assert((unsigned char)buf[0] == 0xa4 && (unsigned char)buf[1] == 0xcc);
/*
* UTF-8.
@ -104,12 +132,14 @@ main(int argc, char *argv[])
assert(c16rtomb(buf, 0xd83d, &s) == 0);
assert(c16rtomb(buf, L'A', &s) == (size_t)-1);
assert(errno == EILSEQ);
assert((unsigned char)buf[0] == 0xcc);
/* Invalid code; 'Pile of poo' without the lead surrogate. */
memset(&s, 0, sizeof(s));
memset(buf, 0xcc, sizeof(buf));
assert(c16rtomb(buf, 0xdca9, &s) == (size_t)-1);
assert(errno == EILSEQ);
assert((unsigned char)buf[0] == 0xcc);
printf("ok 1 - c16rtomb()\n");
}

View File

@ -85,6 +85,37 @@ main(int argc, char *argv[])
assert(mbrtoc16(&c16, "", 0, &s) == (size_t)-2);
assert(c16 == L'z');
/* Check that mbrtoc16() doesn't read ahead too aggressively. */
memset(&s, 0, sizeof(s));
assert(mbrtoc16(&c16, "AB", 2, &s) == 1);
assert(c16 == L'A');
assert(mbrtoc16(&c16, "C", 1, &s) == 1);
assert(c16 == L'C');
/*
* ISO-8859-1.
*/
assert(strcmp(setlocale(LC_CTYPE, "en_US.ISO8859-1"),
"en_US.ISO8859-1") == 0);
/* Currency sign. */
memset(&s, 0, sizeof(s));
assert(mbrtoc16(&c16, "\xa4", 1, &s) == 1);
assert(c16 == 0xa4);
/*
* ISO-8859-15.
*/
assert(strcmp(setlocale(LC_CTYPE, "en_US.ISO8859-15"),
"en_US.ISO8859-15") == 0);
/* Euro sign. */
memset(&s, 0, sizeof(s));
assert(mbrtoc16(&c16, "\xa4", 1, &s) == 1);
assert(c16 == 0x20ac);
/*
* UTF-8.
*/
@ -144,6 +175,20 @@ main(int argc, char *argv[])
assert(mbrtoc16(&c16, "", 0, &s) == (size_t)-3);
assert(c16 == 0xdca9);
/* Letter e with acute, precomposed. */
memset(&s, 0, sizeof(s));
c16 = 0;
assert(mbrtoc16(&c16, "\xc3\xa9", 2, &s) == 2);
assert(c16 == 0xe9);
/* Letter e with acute, combined. */
memset(&s, 0, sizeof(s));
c16 = 0;
assert(mbrtoc16(&c16, "\x65\xcc\x81", 3, &s) == 1);
assert(c16 == 0x65);
assert(mbrtoc16(&c16, "\xcc\x81", 2, &s) == 2);
assert(c16 == 0x301);
printf("ok 1 - mbrtoc16()\n");
return (0);