From: Joey Hess
Date: Tue, 5 May 2009 19:06:34 +0000 (-0400)
Subject: isutf8: Reject UTF-8-encoded UTF-16 surrogates. Closes: #525301 (Thanks, Jakub Wilk...
X-Git-Tag: 0.35~2
X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a250ae89f37849be1caf204a07d2e4e563503390;p=moreutils

isutf8: Reject UTF-8-encoded UTF-16 surrogates. Closes: #525301 (Thanks, Jakub Wilk and liw)
---

diff --git a/check-isutf8 b/check-isutf8
index 3abb315..83a4eed 100755
--- a/check-isutf8
+++ b/check-isutf8
@@ -39,5 +39,8 @@ check 1 '\xc2'
 check 1 '\xc2\x20'
 check 1 '\x20\xc2'
 check 1 '\300\200'
+check 1 '\xed\xa0\x88\xed\xbd\x85' # UTF-16 surrogates
+check 1 '\xef\xbf\xbe' # 0xFFFE
+check 1 '\xef\xbf\xbf' # 0xFFFF
 
 exit $failed
diff --git a/debian/changelog b/debian/changelog
index aac1f3f..7b638cb 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -3,6 +3,8 @@ moreutils (0.35) UNRELEASED; urgency=low
   * ifdata: Don't assume that all interface names are 6 characters or less,
     for instance "wmaster0" is longer. Increase the limit to 20 characters.
     Closes: #526654 (Thanks, Alan Pope)
+  * isutf8: Reject UTF-8-encoded UTF-16 surrogates. Closes: #525301
+    (Thanks, Jakub Wilk and liw)
 
  -- Joey Hess Sat, 02 May 2009 20:40:23 -0400
 
diff --git a/isutf8.c b/isutf8.c
index 4306c7d..c5f5eeb 100644
--- a/isutf8.c
+++ b/isutf8.c
@@ -127,6 +127,14 @@ static unsigned long decodeutf8(unsigned char *buf, int nbytes)
             return INVALID_CHAR;
         u = (u << 6) | (buf[j] & 0x3f);
     }
+
+    /* Conforming UTF-8 cannot contain codes 0xd800–0xdfff (UTF-16
+       surrogates) as well as 0xfffe and 0xffff. */
+    if (u >= 0xD800 && u <= 0xDFFF)
+        return INVALID_CHAR;
+    if (u == 0xFFFE || u == 0xFFFF)
+        return INVALID_CHAR;
+
     return u;
 }
 
@@ -145,7 +153,7 @@ static int is_utf8_byte_stream(FILE *file, char *filename, int quiet) {
     int nbytes, nbytes2;
     int c;
     unsigned long code;
-    unsigned long line, col, byteoff;
+    unsigned long line, col, byteoff;
 
     nbytes = 0;
     line = 1;