libc iconv does not reject surrogates when transcoding from UTF-32LE to UTF-8
| Affects | Status | Importance | Assigned to | Milestone |
|---|---|---|---|---|
| eglibc (Ubuntu) | New | Undecided | Unassigned | |
Bug Description
Compile and run the following program:
"""
#include <stdio.h>
#include <errno.h>
#include <iconv.h>
// Reproducer: feeds two UTF-32LE code units holding the surrogate code points
// U+DCA1 and U+DCA5 to iconv and prints what it emits as UTF-8.
//
// Exit status: 0 if the conversion succeeded, 1 if the converter could not be
// opened, 2 if iconv() reported an error, 3 if the converter could not be closed.
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    iconv_t cd = iconv_open("UTF-8", "UTF-32LE");
    if (cd == (iconv_t)-1) {
        printf("Could not open: %d\n", errno);
        return 1;
    }

    // Alternative inputs that were commented out in the original program
    // (16-bit forms of the same two values; presumably for UTF-16 runs -- confirm):
    //   { 0xA1, 0xDC, 0xA5, 0xDC }
    //   { 0xDC, 0xA1, 0xDC, 0xA5 }
    // Character constants are used so the high-bit bytes fit a plain char
    // without an out-of-range integer conversion.
    char in_buf[] = { '\xA1', '\xDC', 0x00, 0x00, '\xA5', '\xDC', 0x00, 0x00 };
    char out_buf[20];

    char *in_buf_p = in_buf;
    size_t in_buf_left = sizeof in_buf;
    char *out_buf_p = out_buf;
    size_t out_buf_left = sizeof out_buf;

    size_t conv_count = iconv(cd, &in_buf_p, &in_buf_left, &out_buf_p, &out_buf_left);
    if (conv_count == (size_t)-1) {
        // Capture errno once: the printf calls below are allowed to modify it.
        int err = errno;
        switch (err) {
        // Triggered by invalid multibyte sequence in input
        case EILSEQ: printf("Conversion error: EILSEQ\n"); break;
        // Not enough space in output buffer
        case E2BIG: printf("Conversion error: E2BIG\n"); break;
        // Incomplete multibyte sequence in input
        case EINVAL: printf("Conversion error: EINVAL\n"); break;
        // Some other unknown error
        default: printf("Conversion error: %d\n", err); break;
        }
        // Release the descriptor on the failure path as well; the exit status
        // already reports the conversion error, so the result is ignored.
        iconv_close(cd);
        return 2;
    }

    // The pointers only ever advance, so the differences are non-negative and
    // can be printed as size_t.
    printf("Consumed %zu, produced %zu, converted %zu\n",
           (size_t)(in_buf_p - in_buf), (size_t)(out_buf_p - out_buf), conv_count);
    for (char *out_buf_read = out_buf; out_buf_read < out_buf_p; out_buf_read++) {
        printf("\t%x\n", (unsigned char)*out_buf_read);
    }

    if (iconv_close(cd) != 0) {
        printf("Could not close: %d\n", errno);
        return 3;
    }
    return 0;
}
"""
Expected result:
"""
Conversion error: EILSEQ
"""
Actual result:
"""
Consumed 8, produced 6, converted 0
ed
b2
a1
ed
b2
a5
"""
This UTF-8 byte sequence is invalid according to the standard because it encodes a surrogate code point.
Note that if you take this output byte sequence and run it through iconv *again* (with both input and output encodings as UTF-8) then EILSEQ is reported as expected.
affects: | ubuntu → eglibc (Ubuntu) |
This bug appears to still be present in glibc HEAD (commit d35dce5213b357c218e252cbcb7a853cdecfcb41), so I'm 99% sure this is not an Ubuntu bug, but the glibc page (http://www.gnu.org/s/libc/#Bugs) asked me to report here first.
The error that needs to be fixed is in gconv_simple.c line 890, where a check similar to that done in utf-32.c line 204 should be done.