Ubuntu
libx11 package

Xutf8LookupString doesn't report overflows with small buffer sizes

Bug #1336588 reported by Daniel Woodworth on 2014-07-02

This bug affects 1 person

Affects		Status	Importance	Assigned to	Milestone
	libx11 (Ubuntu)	New	Undecided	Unassigned

Bug Description

Greetings!

First of all, as per the instructions below, I am using Ubuntu 14.04 with libx11-6 version 2:1.6.2-1ubuntu2 installed.

I'm trying to figure out how to develop a Unicode-based Xlib application, and, after several days of frustration with my code strangely not picking up characters in some cases where every other program seemed to handle them correctly, I realized that the problem was due to Xutf8LookupString not reporting overflows correctly in some situations when a small buffer is passed. The workaround is very simple (using a bigger buffer), but I thought I'd report the problem anyway because none of the documentation I've found has said anything about a minimum buffer size and I think that it ought to get fixed (or at least mentioned) for the benefit of other new Xlib programmers or people porting programs over from other Xlib implementations.

I've attached a small C program that can be used to demonstrate the problem. The particular key that I tested with was the ñ key on the Spanish keyboard layout, but it seems like the problem manifests itself for any non-ASCII Latin-1 character with a direct key association (and possibly other cases). As written, the attached program functions perfectly: pressing ñ results in "got 2 bytes: ñ" appearing on the terminal. However, if one changes line 116 from "const int INITIAL_BUFFER_SIZE = 4;" down to "const int INITIAL_BUFFER_SIZE = 1;" (or even down to 0!), pressing ñ instead results in the message "got 0 bytes: " because, instead of reporting the actual character length and a buffer overflow as it should, Xutf8LookupString reports that the character sequence doesn't exist. The bug also occurs if one uses XmbLookupString instead of Xutf8LookupString for roughly the same reason, but I'm focusing on Xutf8LookupString here instead since its logic is somewhat simpler and because it's the one that will be more useful for what I'm trying to do in my program.

So, how does this bug come about? To answer that question, I downloaded the source package, fired up my debugger, traced the execution path through all of the indirect function calls, and narrowed the problem down to these three functions:

modules/im/ximcp/imDefLkup.c lines 1120-1181:
int
_XimProtoUtf8LookupString(
    XIC xic,
    XKeyEvent *ev,
    char *buffer,
    int bytes,
    KeySym *keysym,
    Status *state)
{
    Xic ic = (Xic)xic;
    Xim im = (Xim)ic->core.im;
    int ret;
    Status tmp_state;
    XimCommitInfo info;

if (!IS_SERVER_CONNECTED(im))
return 0;

if (!state)
state = &tmp_state;

    if (ev->type == KeyPress && ev->keycode == 0) { /* Filter function */
if (!(info = ic->private.proto.commit_info)) {
           *state = XLookupNone;
     return 0;
}

ret = im->methods->ctstoutf8((XIM)im, info->string,
     info->string_len, buffer, bytes, state);
if (*state == XBufferOverflow)
           return ret;
if (keysym && (info->keysym && *(info->keysym))) {
     *keysym = *(info->keysym);
     if (*state == XLookupChars)
  *state = XLookupBoth;
     else
  *state = XLookupKeySym;
}
_XimUnregCommitInfo(ic);

    } else if (ev->type == KeyPress) {
ret = _XimLookupUTF8Text(ic, ev, buffer, bytes, keysym, NULL);
if (ret > 0) {
           if (ret > bytes)
               *state = XBufferOverflow;
           else if (keysym && *keysym != NoSymbol)
  *state = XLookupBoth;
     else
  *state = XLookupChars;
} else {
     if (keysym && *keysym != NoSymbol)
  *state = XLookupKeySym;
     else
  *state = XLookupNone;
}
    } else {
*state = XLookupNone;
ret = 0;
    }

return ret;
}

src/imConv.c lines 300-356:
int
_XimLookupUTF8Text(
    Xic ic,
    XKeyEvent* event,
    char* buffer,
    int nbytes,
    KeySym* keysym,
    XComposeStatus* status)
{
    int count;
    KeySym symbol;
    Status dummy;
    Xim im = (Xim)ic->core.im;
    XimCommonPrivateRec* private = &im->private.common;
    unsigned char look[BUF_SIZE];
    ucs4_t ucs4;

    /* force a latin-1 lookup for compatibility */
    count = XLOOKUPSTRING(event, (char *)buffer, nbytes, &symbol, status);
    if (keysym != NULL) *keysym = symbol;
    if ((nbytes == 0) || (symbol == NoSymbol)) return count;

    if (count > 1) {
memcpy(look, (char *)buffer,count);
look[count] = '\0';
if ((count = im->methods->ctstoutf8(ic->core.im,
    (char*) look, count,
    buffer, nbytes, &dummy)) < 0) {
     count = 0;
}
    } else if ((count == 0) ||
        (count == 1 && (symbol > 0x7f && symbol < 0xff00))) {

        XPointer from = (XPointer) &ucs4;
        int from_len = 1;
        XPointer to = (XPointer) buffer;
        int to_len = nbytes;

        ucs4 = (ucs4_t) KeySymToUcs4(symbol);
        if (!ucs4)
            return 0;

if (_XlcConvert(private->ucstoutf8_conv,
                        &from, &from_len, &to, &to_len,
                        NULL, 0) != 0) {
     count = 0;
} else {
            count = nbytes - to_len;
}
    }
    /* FIXME:
     * we should make sure that if the character is a Latin1 character
     * and it's on the right side, and we're in a non-Latin1 locale
     * that this is a valid Latin1 character for this locale.
     */
    return count;
}

src/xlibi18n/lcUTF8.c lines 1070-1111:
static int
ucstoutf8(
    XlcConv conv,
    XPointer *from,
    int *from_left,
    XPointer *to,
    int *to_left,
    XPointer *args,
    int num_args)
{
    const ucs4_t *src;
    const ucs4_t *srcend;
    unsigned char *dst;
    unsigned char *dstend;
    int unconv_num;

if (from == NULL || *from == NULL)
return 0;

    src = (const ucs4_t *) *from;
    srcend = src + *from_left;
    dst = (unsigned char *) *to;
    dstend = dst + *to_left;
    unconv_num = 0;

    while (src < srcend) {
int count = utf8_wctomb(NULL, dst, *src, dstend-dst);
if (count == RET_TOOSMALL)
     break;
if (count == RET_ILSEQ)
     unconv_num++;
src++;
dst += count;
    }

    *from = (XPointer) src;
    *from_left = srcend - src;
    *to = (XPointer) dst;
    *to_left = dstend - dst;

return unconv_num;
}

From my experiment, it seems that Xutf8LookupString was largely implemented as _XimProtoUtf8LookupString for my particular situation. The problem arises at the tail end of the function, where the results of _XimLookupUTF8Text are examined and translated into the status and return values. It appears that the return value of _XimLookupUTF8Text is pretty much forwarded directly and the buffer overflow condition occurs when that value exceeds the size passed in for the buffer. Therefore, it appears that _XimLookupUTF8Text is supposed to return the actual size of the UTF-8 text produced. However, for the case in question, _XimLookupUTF8Text returns 0 for what is supposed to be a 2-long sequence ("ñ").

Looking at the implementation of _XimLookupUTF8Text, the significant code is once again the code at the tail end of the function, where the ucs4 code obtained is converted back to UTF-8. The code in this case calls _XlcConvert and determines the aforementioned length of the character to return. The return value of _XlcConvert doesn't seem to be documented anywhwere in the code, but from a reading of the function it appears that _XlcConvert indicates conversion errors by returning a non-zero value, thus causing _XimLookupUTF8Text to report that the character sequence was invalid by returning 0. Otherwise, for "successful" conversions, it appears that to_len is modified as _XlcConvert does its work to represent the number of empty bytes in the buffer and thus _XimLookupUTF8Text determines the size of the character itself by subtracting the resulting to_len value from the buffer size. In the case of ñ with a 1-byte buffer, _XlcConvert does indeed return 0, but, since the final value of to_len after _XlcConvert runs is 1, _XimLookupUTF8Text returns 1-1=0 anyway.

_XlcConvert appears to indirectly call ucstoutf8 in order to handle the majority of this particular conversion. ucstoutf8 appears to run the conversion by calling utf8_wctomb character-by-character on the sequence passed in. The parameter naming of to_left clearly indicates that the value will be updated to reflect the amount of space "left" in the output buffer passed in. The meaning of the return value is also hinted at since it is collected throughout the function in a variable named "unconv_num," which appears to only be incremented whenever utf8_wctomb returns RET_ILSEQ. Thus, ucstoutf8 seems to return the number of ill-formed sequences identified in the course of the conversion. In my test, utf8_wctomb correctly returned RET_TOOSMALL, causing ucstoutf8 to simply break out of the loop without writing anything to the buffer and signifying that nothing was written by leaving to_left at 1.

Anyway, I apologise for posting such a long explanation, but I thought that it might help in coming up with a fix for this. Being largely unfamiliar with the code base and not seeing any obvious fixes, I am unsure how best to proceed and therefore decided not to try to come up with a patch. Also, I am not sure of the extents of the problem; I know that other non-ASCII Latin-1 characters appear to be affected and that XmbLookupString exhibits a very similar problem, but beyond that there may be other characters in other keyboard layouts that I didn't try that are affected and other locales and character sets might also be affected but with different conversion functions.

In any case, thank you for your time and good luck!