regedit: Allow importing UCS-2 and UTF-8 files created by Windows' regedit v5

Thu Oct 12 09:30:31 CDT 2006

Changelog: regedit: Allow importing UCS-2 and UTF-8 files created by Windows' 
regedit v5.

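The code tries to detect the encoding from the first bytes read from the file.
UTF-8 input is passed through unmodified apart from skipping the leading magic
bytes, while UCS-2 input is converted to 8-bit by keeping only the low byte of
each 2-byte unit.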
-------------- next part --------------

--- ./programs/regedit/regproc.c.orig	2006-08-01 19:41:14.000000000 +0300
+++ ./programs/regedit/regproc.c	2006-10-12 17:21:23.000000000 +0300
@@ -881,6 +881,8 @@
 {
     LPSTR line           = NULL;  /* line read from input stream */
     ULONG lineSize       = REG_VAL_BUF_SIZE;
+    int encoding         = -1;    /* guessed file encoding */
+    int offset           =  0;    /* offset for encoding magic */
 
     line = HeapAlloc(GetProcessHeap(), 0, lineSize);
     CHECK_ENOUGH_MEMORY(line);
@@ -896,7 +898,7 @@
             /* Do we need to expand the buffer ? */
             assert (s >= line && s <= line + lineSize);
             size_remaining = lineSize - (s-line);
-            if (size_remaining < 2) /* room for 1 character and the \0 */
+            if (size_remaining < 2*((encoding > 0) ? encoding : 1)) /* room for 1 character and the \0 */
             {
                 char *new_buffer;
                 size_t new_size = lineSize + REG_VAL_BUF_SIZE;
@@ -915,6 +917,8 @@
              * eof, error, eol or getting the maximum amount.  Abort on error.
              */
             size_to_get = (size_remaining > INT_MAX ? INT_MAX : size_remaining);
+            if ((encoding < 0) || (encoding > 1))
+                memset(s, 0, size_to_get);
             if (NULL == fgets (s, size_to_get, in)) {
                 if (ferror(in)) {
                     perror ("While reading input");
@@ -929,6 +933,49 @@
                 }
             }
 
+            /* Attempt to guess the encoding from the first several bytes */
+            if (encoding < 0) {
+                if (line [0] == '\377' && line [1] == '\376') {
+                    encoding = 2; /* FF FE    => UCS-2 little endian */
+                    offset = 2;
+                }
+                else if (line [0] == '\357' && line [1] == '\277' && line [2] == '\275') {
+                    encoding = 1; /* EF BF BD => UTF-8 */
+                    offset = 3;
+                    if (line [3] == '\357' && line [4] == '\277' && line [5] == '\275')
+                        offset += 3;
+                }
+                else
+                    encoding = 0; /* default  => ASCII (hope so...) */
+                if (encoding) {
+                    fprintf(stderr,"%s: WARNING - %s encoding (detected %d magic bytes).\n",
+                            getAppName(),((encoding == 1) ? "UTF-8" : "UCS-2"),offset);
+                }
+            }
+
+            /* Extremely crude routine to convert to 8-bit */
+            if (encoding) {
+                int i = 0;
+                char c;
+                if (encoding > 1) {
+                    /* FIXME: this is ugly - we should read binary */
+                    if (! (offset || *s))
+                        offset = encoding-1;
+                    /* FIXME: we should convert properly */
+                    while ((c = s [(encoding*i)+offset]) != 0)
+                        s [i++] = c;
+                    if (s [(encoding*i)+offset+1] == 0x0a)
+                        s [i++] = '\n';
+                }
+                else if (offset) {
+                    /* just skip over the magic */
+                    while ((c = s [i+offset]) != 0)
+                        s [i++] = c;
+                }
+                s [i] = '\0';
+                offset = 0;
+            }
+
             /* If we didn't read the eol nor the eof go around for the rest */
             s_eol = strchr (s, '\n');
             if (!feof (in) && !s_eol) {
@@ -958,8 +1005,8 @@
                 /* The following error protection could be made more self-
                  * correcting but I thought it not worth trying.
                  */
-                if ((c = fgetc (in)) == EOF || c != ' ' ||
-                        (c = fgetc (in)) == EOF || c != ' ')
+                if ((c = fgetc (in)) == EOF || (c == '\0' ? (c = fgetc(in)) : c) != ' ' ||
+                        (c = fgetc (in)) == EOF || (c == '\0' ? (c = fgetc(in)) : c) != ' ')
                     fprintf(stderr,"%s: ERROR - invalid continuation.\n",
                             getAppName());
                 continue;