From 97bbfc9b4a3f995776e5229bbfd8214282308b77 Mon Sep 17 00:00:00 2001
From: joeyh
Date: Mon, 12 Nov 2007 17:04:11 +0000
Subject: [PATCH] * isutf8: Detect and reject overlong UTF-8 sequences. Closes:
 #440951 Many thanks to liw for the patch.

---
Review notes (this section is ignored when the patch is applied):

  * main(): is_utf8_byte_stream() returns 1 for a valid stream and 0 on
    error, so the overall status must be cleared when it returns 0.  The
    condition had been inverted, which made valid files fail and invalid
    files pass (contradicting check-isutf8).
  * encodeutf8() comment: the limit is 31 bits (0x7FFFFFFF), not 32.
  * The operands of the #include lines were missing from the copy under
    review and are restored from usage -- TODO confirm names and order
    against the tree before applying.  Removed/context lines are assumed
    to be tab-indented -- TODO confirm.
  * The "index" line for isutf8.c still carries the original abbreviated
    blob ids.

 check-isutf8     |   1 +
 debian/changelog |   7 ++
 isutf8.c         | 207 ++++++++++++++++++++++++++++++++++++++---------
 3 files changed, 176 insertions(+), 39 deletions(-)

diff --git a/check-isutf8 b/check-isutf8
index a4efa7b..d2858d9 100755
--- a/check-isutf8
+++ b/check-isutf8
@@ -27,5 +27,6 @@ check 0 '\xc2\xa9'
 check 1 '\xc2'
 check 1 '\xc2\x20'
 check 1 '\x20\xc2'
+check 1 '\300\200'
 
 exit $failed
diff --git a/debian/changelog b/debian/changelog
index eb0e554..5fc0d0b 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,10 @@
+moreutils (0.25) unstable; urgency=low
+
+  * isutf8: Detect and reject overlong UTF-8 sequences. Closes: #440951
+    Many thanks to liw for the patch.
+
+ -- Joey Hess Mon, 12 Nov 2007 11:58:07 -0500
+
 moreutils (0.24) unstable; urgency=low
 
   * vidir: Force numbers to normalised integers.
diff --git a/isutf8.c b/isutf8.c
index 59749c0..9af7d70 100644
--- a/isutf8.c
+++ b/isutf8.c
@@ -18,72 +18,199 @@
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <errno.h>
 #include <getopt.h>
 
-#define VERSION "1.0"
+
+#define VERSION "1.1"
+
+
+/*
+ * Code to indicate an invalid UTF8 character.
+ */
+enum { INVALID_CHAR = 0xffffffff };
+
+
+/*
+ * Produce shortest UTF8 encoding of a 31-bit value in 'u', returning it
+ * in the array 'buf'. Return the number of bytes in the encoded value.
+ * If the value is too large (more than 31 bits or would take more than
+ * 'maxbytes' bytes), return -1.
+ */
+static int encodeutf8(unsigned long u, unsigned char *buf, size_t maxbytes)
+{
+    static const struct {
+        int nbytes;
+        unsigned long max;
+    } tab[] = {
+        { 1, 0x0000007F },
+        { 2, 0x000007FF },
+        { 3, 0x0000FFFF },
+        { 4, 0x001FFFFF },
+        { 5, 0x03FFFFFF },
+        { 6, 0x7FFFFFFF },
+    };
+    static const int ntab = sizeof(tab) / sizeof(tab[0]);
+    int i, j;
+
+    if (u > tab[ntab-1].max)
+        return -1;
+
+    for (i = 0; i < ntab; ++i) {
+        if (u <= tab[i].max)
+            break;
+    }
+    assert(i < ntab);
+
+    if (tab[i].nbytes > maxbytes)
+        return -1;
+
+    if (tab[i].nbytes == 1) { /* Special case */
+        buf[0] = u;
+    } else {
+        for (j = tab[i].nbytes-1; j > 0; --j) {
+            buf[j] = 0x80 | (u & 0x3f);
+            u >>= 6;
+        }
+
+        unsigned char mask = ~(0xFF >> tab[i].nbytes);
+        buf[0] = mask | u;
+    }
+
+    return tab[i].nbytes;
+}
+
 
 /*
+ * Return number of ones at the top of a byte.
+ *
  * I'm pretty sure there is a fancy trick to do this without a loop,
  * but I'm too tired to figure it out now. --liw
  */
 static int high_ones(int c) {
-	int n;
+    int n;
+
+    for (n = 0; (c & 0x80) == 0x80; c <<= 1)
+        ++n;
+    return n;
+}
+
 
-	for (n = 0; (c & 0x80) == 0x80; c <<= 1)
-		++n;
-	return n;
+/*
+ * Decode a UTF8 character from an array of bytes. Return character code.
+ * Upon error, return INVALID_CHAR.
+ */
+static unsigned long decodeutf8(unsigned char *buf, int nbytes)
+{
+    unsigned long u;
+    int i, j;
+
+    if (nbytes <= 0)
+        return INVALID_CHAR;
+
+    if (nbytes == 1) {
+        if (buf[0] >= 0x80)
+            return INVALID_CHAR;
+        return buf[0];
+    }
+
+    i = high_ones(buf[0]);
+    if (i != nbytes)
+        return INVALID_CHAR;
+    u = buf[0] & (0xff >> i);
+    for (j = 1; j < nbytes; ++j) {
+        if ((buf[j] & 0xC0) != 0x80)
+            return INVALID_CHAR;
+        u = (u << 6) | (buf[j] & 0x3f);
+    }
+    return u;
 }
 
+
+/*
+ * Determine if the contents of an open file form a valid UTF8 byte stream.
+ * Do this by collecting bytes for a character into a buffer and then
+ * decode the bytes and re-encode them and compare that they are identical
+ * to the original bytes. If any step fails, return 0 for error. If EOF
+ * is reached, return 1 for OK.
+ */
 static int is_utf8_byte_stream(FILE *file, char *filename, int quiet) {
-	int c, n, remaining_bytes;
-	unsigned long line, col;
-
-	remaining_bytes = 0;
-	line = 1;
-	col = 1;
-	while ((c = getc(file)) != EOF) {
-		n = high_ones(c);
-		if (remaining_bytes > 0) {
-			if (n == 1) {
-				--remaining_bytes;
-				if (remaining_bytes == 0)
-					++col;
-			} else
-				goto error;
-		} else if (n == 0) {
-			/* 7-bit character, skip, but adjust position */
-			if (c == '\n') {
-				++line;
-				col = 1;
-			} else
-				++col;
-		} else if (n == 1)
-			goto error; /* wrong place for continuation byte */
-		else
-			remaining_bytes = n - 1; /* start of multi-byte sequence */
-	}
-	if (remaining_bytes > 0)
-		goto error;
+    enum { MAX_UTF8_BYTES = 6 };
+    unsigned char buf[MAX_UTF8_BYTES];
+    unsigned char buf2[MAX_UTF8_BYTES];
+    int nbytes, nbytes2;
+    int c;
+    unsigned long code;
+    unsigned long line, col, byteoff;
+
+    nbytes = 0;
+    line = 1;
+    col = 1;
+    byteoff = 0;
+
+    for (;;) {
+        c = getc(file);
+
+        if (c == EOF || c < 0x80 || (c & 0xC0) != 0x80) {
+            /* New char starts, deal with previous one. */
+            if (nbytes > 0) {
+                code = decodeutf8(buf, nbytes);
+                if (code == INVALID_CHAR)
+                    goto error;
+                nbytes2 = encodeutf8(code, buf2,
+                                     MAX_UTF8_BYTES);
+                if (nbytes != nbytes2 ||
+                    memcmp(buf, buf2, nbytes) != 0)
+                    goto error;
+                ++col;
+            }
+            nbytes = 0;
+            /* If it's UTF8, start collecting again. */
+            if (c != EOF && c >= 0x80)
+                buf[nbytes++] = c;
+        } else {
+            /* This is a continuation byte, append to buffer. */
+            if (nbytes == MAX_UTF8_BYTES)
+                goto error;
+            buf[nbytes++] = c;
+        }
+
+        if (c == EOF)
+            break;
+        else if (c == '\n') {
+            ++line;
+            byteoff = 0;
+            col = 1;
+        } else
+            ++byteoff;
+    }
+
+    if (nbytes != 0)
+        goto error;
+
 	return 1;
 
 error:
 	if (!quiet) {
-		printf("%s: line %lu, col %lu: invalid UTF-8 code\n",
-		       filename, line, col);
+		printf("%s: line %lu, char %lu, byte offset %lu: "
+		       "invalid UTF-8 code\n", filename, line, col, byteoff);
 	}
 	return 0;
 }
 
+
 static void usage(const char *program_name) {
-	printf("Usage: %s [-hq] [--help] [--quiet] [file ...]\n", program_name);
+	printf("Usage: %s [-hq] [--help] [--quiet] [file ...]\n",
+	       program_name);
 	printf("Check whether input files are valid UTF-8.\n");
 	printf("This is version %s.\n", VERSION);
 }
 
+
 int main(int argc, char **argv) {
 	int i, ok;
 	FILE *file;
@@ -127,10 +254,12 @@ int main(int argc, char **argv) {
 			file = fopen(argv[i], "r");
 			if (file == NULL) {
 				fprintf(stderr, "isutf8: %s: error %d: %s\n",
-						argv[i], errno, strerror(errno));
+						argv[i], errno,
+						strerror(errno));
 				ok = 0;
 			} else {
-				ok = is_utf8_byte_stream(file, argv[i], quiet) && ok;
+				if (!is_utf8_byte_stream(file, argv[i], quiet))
+					ok = 0;
 				(void) fclose(file);
 			}
 		}
-- 
2.39.5