* isutf8: Detect and reject overlong UTF-8 sequences. Closes: #440951

author joeyh <joeyh>

Mon, 12 Nov 2007 17:04:11 +0000 (17:04 +0000)

committer joeyh <joeyh>

Mon, 12 Nov 2007 17:04:11 +0000 (17:04 +0000)
author joeyh <joeyh>
Mon, 12 Nov 2007 17:04:11 +0000 (17:04 +0000)
committer joeyh <joeyh>
Mon, 12 Nov 2007 17:04:11 +0000 (17:04 +0000)
diff --git a/check-isutf8 b/check-isutf8

index a4efa7bfb95e5e6a28bfcee4c17745d5bc38931f..d2858d996996acf1660ea8c8a9aa0db2c5be4587 100755 (executable)
--- a/check-isutf8
+++ b/check-isutf8
@@ -27,5 +27,6 @@ check 0 '\xc2\xa9'
  check 1 '\xc2'
  check 1 '\xc2\x20'
  check 1 '\x20\xc2'
+check 1 '\300\200'
  
  exit $failed
diff --git a/debian/changelog b/debian/changelog

index eb0e554a24f6e7b67ecd1294aa29e4c40ea7cb79..5fc0d0b5ce0b6bbcdf2b66676daed4b649fcd7b9 100644 (file)
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,10 @@
+moreutils (0.25) unstable; urgency=low
+
+  * isutf8: Detect and reject overlong UTF-8 sequences. Closes: #440951
+    Many thanks to liw for the patch.
+
+ -- Joey Hess <joeyh@debian.org>  Mon, 12 Nov 2007 11:58:07 -0500
+
  moreutils (0.24) unstable; urgency=low
  
    * vidir: Force numbers to normalised integers.
diff --git a/isutf8.c b/isutf8.c

index 59749c0f49701106ac0aaa949678a8cbbf9de0bf..9af7d70bb47ffdbf1fb818887afc3a9c04802b50 100644 (file)
--- a/isutf8.c
+++ b/isutf8.c
@@ -18,72 +18,199 @@
   * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
   */
  
+#include <assert.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <errno.h>
  #include <string.h>
  #include <getopt.h>
  
-#define VERSION "1.0"
+
+#define VERSION "1.1"
+
+
+/*
+ * Code to indicate an invalid UTF8 character. This is a macro rather
+ * than an enumeration constant because the value does not fit in an
+ * int, which is all ISO C allows an enumeration constant to hold. No
+ * UTF8 sequence of up to six bytes decodes to this value (the largest
+ * is 0x7FFFFFFF), so it cannot be mistaken for a real character.
+ */
+#define INVALID_CHAR 0xffffffffUL
+
+
+/*
+ * Write the shortest UTF8 encoding of the value 'u' (at most 31 bits,
+ * i.e. no larger than 0x7FFFFFFF) into the array 'buf'. Return the
+ * number of bytes stored, which is between 1 and 6.
+ * Return -1, without writing to 'buf', if 'u' needs more than 31 bits
+ * or if its encoding would take more than 'maxbytes' bytes.
+ */
+static int encodeutf8(unsigned long u, unsigned char *buf, size_t maxbytes)
+{
+        /* limit[k] is the largest value that fits in k+1 encoded bytes. */
+        static const unsigned long limit[] = {
+            0x0000007F,
+            0x000007FF,
+            0x0000FFFF,
+            0x001FFFFF,
+            0x03FFFFFF,
+            0x7FFFFFFF,
+        };
+        static const int nlimits = sizeof(limit) / sizeof(limit[0]);
+        int len, pos;
+
+        /* Pick the smallest length whose limit covers 'u'. */
+        for (len = 1; len <= nlimits; ++len) {
+                if (u <= limit[len-1])
+                        break;
+        }
+        if (len > nlimits)
+                return -1;
+
+        if (len > maxbytes)
+                return -1;
+
+        if (len == 1) {
+                /* Plain 7-bit value: stored as is, no marker bits. */
+                buf[0] = u;
+                return 1;
+        }
+
+        /* Trailing bytes carry 6 bits each; fill them from the end. */
+        for (pos = len - 1; pos >= 1; --pos) {
+                buf[pos] = 0x80 | (u & 0x3f);
+                u >>= 6;
+        }
+
+        /* Lead byte: 'len' one bits on top, then what is left of 'u'. */
+        buf[0] = (unsigned char) ~(0xFF >> len) | u;
+        return len;
+}
+
+
  
  /* 
+ * Return number of ones at the top of a byte.
+ *
   * I'm pretty sure there is a fancy trick to do this without a loop,
   * but I'm too tired to figure it out now. --liw
   */
  static int high_ones(int c) {
-       int n;
+        int n;
+
+        for (n = 0; (c & 0x80) == 0x80; c <<= 1)
+                ++n;    
+        return n;
+}
+
  
-       for (n = 0; (c & 0x80) == 0x80; c <<= 1)
-               ++n;    
-       return n;
+/*
+ * Turn the 'nbytes' bytes at 'buf' into the character code they encode.
+ * INVALID_CHAR is returned when 'nbytes' is not positive, when a lone
+ * byte is not a 7-bit value, when the number of leading one bits in the
+ * first byte differs from 'nbytes', or when any later byte does not
+ * have the form 10xxxxxx. Overlong encodings are not detected here.
+ */
+static unsigned long decodeutf8(unsigned char *buf, int nbytes)
+{
+        unsigned long code;
+        int lead, k;
+
+        if (nbytes <= 0)
+                return INVALID_CHAR;
+
+        if (nbytes == 1)
+                return (buf[0] < 0x80) ? buf[0] : INVALID_CHAR;
+
+        /* The lead byte announces the sequence length in its top bits. */
+        lead = high_ones(buf[0]);
+        if (lead != nbytes)
+                return INVALID_CHAR;
+
+        /* Payload of the lead byte, then 6 bits from each later byte. */
+        code = buf[0] & (0xff >> lead);
+        k = 1;
+        while (k < nbytes) {
+                if ((buf[k] & 0xC0) != 0x80)
+                        return INVALID_CHAR;
+                code = (code << 6) | (buf[k] & 0x3f);
+                ++k;
+        }
+        return code;
  }
  
+
+/*
+ * Determine if the contents of an open file form a valid UTF8 byte stream.
+ * Do this by collecting bytes for a character into a buffer and then
+ * decode the bytes and re-encode them and compare that they are identical
+ * to the original bytes. If any step fails, return 0 for error. If EOF
+ * is reached, return 1 for OK.
+ *
+ * On error, unless 'quiet' is non-zero, a message is printed to standard
+ * output giving 'filename', the line number, the number of the offending
+ * character within that line (both counted from 1) and the number of
+ * bytes already read from that line.
+ */
  static int is_utf8_byte_stream(FILE *file, char *filename, int quiet) {
-       int c, n, remaining_bytes;
-       unsigned long line, col;
-       
-       remaining_bytes = 0;
-       line = 1;
-       col = 1;
-       while ((c = getc(file)) != EOF) {
-               n = high_ones(c);
-               if (remaining_bytes > 0) {
-                       if (n == 1) {
-                               --remaining_bytes;
-                               if (remaining_bytes == 0)
-                                       ++col;
-                       } else
-                               goto error;
-               } else if (n == 0) {
-                       /* 7-bit character, skip, but adjust position */
-                       if (c == '\n') {
-                               ++line;
-                               col = 1;
-                       } else
-                               ++col;
-               } else if (n == 1)
-                       goto error; /* wrong place for continuation byte */
-               else
-                       remaining_bytes = n - 1; /* start of multi-byte sequence */
-       }
-       if (remaining_bytes > 0)
-               goto error;
+        enum { MAX_UTF8_BYTES = 6 };
+        unsigned char buf[MAX_UTF8_BYTES];
+        unsigned char buf2[MAX_UTF8_BYTES];
+        int nbytes, nbytes2;
+        int c;
+        unsigned long code;
+       unsigned long line, col, byteoff;
+
+        nbytes = 0;
+        line = 1;
+        col = 1;
+        byteoff = 0;
+                
+        for (;;) {
+                c = getc(file);
+    
+                if (c == EOF || c < 0x80 || (c & 0xC0) != 0x80) {
+                        /* New char starts, deal with previous one. */
+                        if (nbytes > 0) {
+                                code = decodeutf8(buf, nbytes);
+                                if (code == INVALID_CHAR)
+                                        goto error;
+                                /* Re-encoding yields the shortest form,
+                                 * so an overlong original differs. */
+                                nbytes2 = encodeutf8(code, buf2, 
+                                                     MAX_UTF8_BYTES);
+                                if (nbytes != nbytes2 || 
+                                    memcmp(buf, buf2, nbytes) != 0)
+                                        goto error;
+                                ++col;
+                        }
+                        nbytes = 0;
+                        /* If it's UTF8, start collecting again. */
+                        if (c != EOF && c >= 0x80)
+                                buf[nbytes++] = c;
+                } else {
+                        /* This is a continuation byte, append to buffer. */
+                        if (nbytes == MAX_UTF8_BYTES)
+                                goto error;
+                        buf[nbytes++] = c;
+                }
+    
+                if (c == EOF)
+                        break;
+                else if (c == '\n') {
+                        ++line;
+                        byteoff = 0;
+                        col = 1;
+                } else {
+                        ++byteoff;
+                        /* A 7-bit character is complete as soon as it
+                         * is read; multi-byte ones are counted above
+                         * once they have been validated. */
+                        if (c < 0x80)
+                                ++col;
+                }
+        }
+        
+        if (nbytes != 0)
+                goto error;
+
         return 1;
         
  error:
         if (!quiet) {
-               printf("%s: line %lu, col %lu: invalid UTF-8 code\n", 
-                      filename, line, col);
+               printf("%s: line %lu, char %lu, byte offset %lu: "
+                      "invalid UTF-8 code\n", filename, line, col, byteoff);
         }
         return 0;
  }
  
+
  static void usage(const char *program_name) {
-       printf("Usage: %s [-hq] [--help] [--quiet] [file ...]\n", program_name);
+       printf("Usage: %s [-hq] [--help] [--quiet] [file ...]\n", 
+              program_name);
         printf("Check whether input files are valid UTF-8.\n");
         printf("This is version %s.\n", VERSION);
  }
  
+
  int main(int argc, char **argv) {
         int i, ok;
         FILE *file;
@@ -127,10 +254,12 @@ int main(int argc, char **argv) {
                         file = fopen(argv[i], "r");
                         if (file == NULL) {
                                 fprintf(stderr, "isutf8: %s: error %d: %s\n", 
-                                               argv[i], errno, strerror(errno));
+                                               argv[i], errno, 
+                                               strerror(errno));
                                 ok = 0;
                         } else {
-                               ok = is_utf8_byte_stream(file, argv[i], quiet) && ok;
+                               /* 0 from the check means invalid UTF-8. */
+                               if (!is_utf8_byte_stream(file, argv[i], quiet))
+                                   ok = 0;
                                 (void) fclose(file);
                         }
                 }
author	joeyh <joeyh>
	Mon, 12 Nov 2007 17:04:11 +0000 (17:04 +0000)
committer	joeyh <joeyh>
	Mon, 12 Nov 2007 17:04:11 +0000 (17:04 +0000)
check-isutf8		patch \| blob \| history
debian/changelog		patch \| blob \| history
isutf8.c		patch \| blob \| history