mirror of
https://git.FreeBSD.org/src.git
synced 2024-12-15 10:17:20 +00:00
Add UTF-8-specific implementations of mbsnrtowcs() and wcsnrtombs().
These convert plain ASCII characters in-line, making them only slightly slower than the single-byte ("NONE" encoding) version when processing ASCII strings.
This commit is contained in:
parent
47df0a78eb
commit
ea9a9a377b
Notes:
svn2git
2020-12-20 02:59:44 +00:00
svn path=/head/; revision=132687
@ -28,6 +28,7 @@
|
||||
__FBSDID("$FreeBSD$");
|
||||
|
||||
#include <errno.h>
|
||||
#include <limits.h>
|
||||
#include <runetype.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
@ -37,7 +38,11 @@ __FBSDID("$FreeBSD$");
|
||||
size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t,
|
||||
mbstate_t * __restrict);
|
||||
int _UTF8_mbsinit(const mbstate_t *);
|
||||
size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, const char ** __restrict,
|
||||
size_t, size_t, mbstate_t * __restrict);
|
||||
size_t _UTF8_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict);
|
||||
size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict,
|
||||
size_t, size_t, mbstate_t * __restrict);
|
||||
|
||||
typedef struct {
|
||||
wchar_t ch;
|
||||
@ -52,6 +57,8 @@ _UTF8_init(_RuneLocale *rl)
|
||||
__mbrtowc = _UTF8_mbrtowc;
|
||||
__wcrtomb = _UTF8_wcrtomb;
|
||||
__mbsinit = _UTF8_mbsinit;
|
||||
__mbsnrtowcs = _UTF8_mbsnrtowcs;
|
||||
__wcsnrtombs = _UTF8_wcsnrtombs;
|
||||
_CurrentRuneLocale = rl;
|
||||
__mb_cur_max = 6;
|
||||
|
||||
@ -187,6 +194,88 @@ _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
|
||||
return (wch == L'\0' ? 0 : want);
|
||||
}
|
||||
|
||||
size_t
|
||||
_UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,
|
||||
size_t nms, size_t len, mbstate_t * __restrict ps)
|
||||
{
|
||||
_UTF8State *us;
|
||||
const char *s;
|
||||
size_t nchr;
|
||||
wchar_t wc;
|
||||
size_t nb;
|
||||
|
||||
us = (_UTF8State *)ps;
|
||||
|
||||
s = *src;
|
||||
nchr = 0;
|
||||
|
||||
if (dst == NULL) {
|
||||
/*
|
||||
* The fast path in the loop below is not safe if an ASCII
|
||||
* character appears as anything but the first byte of a
|
||||
* multibyte sequence. Check now to avoid doing it in the loop.
|
||||
*/
|
||||
if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
|
||||
errno = EILSEQ;
|
||||
return ((size_t)-1);
|
||||
}
|
||||
for (;;) {
|
||||
if (nms > 0 && (signed char)*s > 0)
|
||||
/*
|
||||
* Fast path for plain ASCII characters
|
||||
* excluding NUL.
|
||||
*/
|
||||
nb = 1;
|
||||
else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
|
||||
(size_t)-1)
|
||||
/* Invalid sequence - mbrtowc() sets errno. */
|
||||
return ((size_t)-1);
|
||||
else if (nb == 0 || nb == (size_t)-2)
|
||||
return (nchr);
|
||||
s += nb;
|
||||
nms -= nb;
|
||||
nchr++;
|
||||
}
|
||||
/*NOTREACHED*/
|
||||
}
|
||||
|
||||
/*
|
||||
* The fast path in the loop below is not safe if an ASCII
|
||||
* character appears as anything but the first byte of a
|
||||
* multibyte sequence. Check now to avoid doing it in the loop.
|
||||
*/
|
||||
if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
|
||||
errno = EILSEQ;
|
||||
return ((size_t)-1);
|
||||
}
|
||||
while (len-- > 0) {
|
||||
if (nms > 0 && (signed char)*s > 0) {
|
||||
/*
|
||||
* Fast path for plain ASCII characters
|
||||
* excluding NUL.
|
||||
*/
|
||||
*dst = (wchar_t)*s;
|
||||
nb = 1;
|
||||
} else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
|
||||
(size_t)-1) {
|
||||
*src = s;
|
||||
return ((size_t)-1);
|
||||
} else if (nb == (size_t)-2) {
|
||||
*src = s + nms;
|
||||
return (nchr);
|
||||
} else if (nb == 0) {
|
||||
*src = NULL;
|
||||
return (nchr);
|
||||
}
|
||||
s += nb;
|
||||
nms -= nb;
|
||||
nchr++;
|
||||
dst++;
|
||||
}
|
||||
*src = s;
|
||||
return (nchr);
|
||||
}
|
||||
|
||||
size_t
|
||||
_UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
|
||||
{
|
||||
@ -254,3 +343,77 @@ _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
|
||||
|
||||
return (len);
|
||||
}
|
||||
|
||||
size_t
|
||||
_UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
|
||||
size_t nwc, size_t len, mbstate_t * __restrict ps)
|
||||
{
|
||||
_UTF8State *us;
|
||||
char buf[MB_LEN_MAX];
|
||||
const wchar_t *s;
|
||||
size_t nbytes;
|
||||
size_t nb;
|
||||
|
||||
us = (_UTF8State *)ps;
|
||||
|
||||
if (us->want != 0) {
|
||||
errno = EINVAL;
|
||||
return ((size_t)-1);
|
||||
}
|
||||
|
||||
s = *src;
|
||||
nbytes = 0;
|
||||
|
||||
if (dst == NULL) {
|
||||
while (nwc-- > 0) {
|
||||
if (0 <= *s && *s < 0x80)
|
||||
/* Fast path for plain ASCII characters. */
|
||||
nb = 1;
|
||||
else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
|
||||
(size_t)-1)
|
||||
/* Invalid character - wcrtomb() sets errno. */
|
||||
return ((size_t)-1);
|
||||
if (*s == L'\0')
|
||||
return (nbytes + nb - 1);
|
||||
s++;
|
||||
nbytes += nb;
|
||||
}
|
||||
return (nbytes);
|
||||
}
|
||||
|
||||
while (len > 0 && nwc-- > 0) {
|
||||
if (0 <= *s && *s < 0x80) {
|
||||
/* Fast path for plain ASCII characters. */
|
||||
nb = 1;
|
||||
*dst = *s;
|
||||
} else if (len > (size_t)MB_CUR_MAX) {
|
||||
/* Enough space to translate in-place. */
|
||||
if ((nb = (int)_UTF8_wcrtomb(dst, *s, ps)) < 0) {
|
||||
*src = s;
|
||||
return ((size_t)-1);
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* May not be enough space; use temp. buffer.
|
||||
*/
|
||||
if ((nb = (int)_UTF8_wcrtomb(buf, *s, ps)) < 0) {
|
||||
*src = s;
|
||||
return ((size_t)-1);
|
||||
}
|
||||
if (nb > (int)len)
|
||||
/* MB sequence for character won't fit. */
|
||||
break;
|
||||
memcpy(dst, buf, nb);
|
||||
}
|
||||
if (*s == L'\0') {
|
||||
*src = NULL;
|
||||
return (nbytes + nb - 1);
|
||||
}
|
||||
s++;
|
||||
dst += nb;
|
||||
len -= nb;
|
||||
nbytes += nb;
|
||||
}
|
||||
*src = s;
|
||||
return (nbytes);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user