[1/4] notepad: Improve encoding detection when opening files.

Alexander Scott-Johns alexander.scott.johns at googlemail.com
Mon Jun 29 19:24:50 CDT 2009


I have reorganized the patches - the Encoding drop-down box is now
added in the 3rd patch.

---
 programs/notepad/dialog.c |  102 ++++++++++++++++++++++++++++++++++++++++----
 programs/notepad/dialog.h |    2 +-
 programs/notepad/main.c   |    6 +-
 programs/notepad/main.h   |    9 ++++
 4 files changed, 105 insertions(+), 14 deletions(-)
-------------- next part --------------
From c8110d47a2e66442a342f9373ff5c6e51338ed83 Mon Sep 17 00:00:00 2001
From: Alexander Scott-Johns <alexander.scott.johns at googlemail.com>
Date: Mon, 29 Jun 2009 22:24:59 +0100
Subject: notepad: Improve encoding detection when opening files.

---
 programs/notepad/dialog.c |  102 ++++++++++++++++++++++++++++++++++++++++----
 programs/notepad/dialog.h |    2 +-
 programs/notepad/main.c   |    6 +-
 programs/notepad/main.h   |    9 ++++
 4 files changed, 105 insertions(+), 14 deletions(-)

diff --git a/programs/notepad/dialog.c b/programs/notepad/dialog.c
index d7354e7..606b777 100644
--- a/programs/notepad/dialog.c
+++ b/programs/notepad/dialog.c
@@ -37,6 +37,14 @@ static const WCHAR helpfileW[] = { 'n','o','t','e','p','a','d','.','h','l','p',0
 
 static INT_PTR WINAPI DIALOG_PAGESETUP_DlgProc(HWND hDlg, UINT msg, WPARAM wParam, LPARAM lParam);
 
+/* Reverse the byte order of each of the num WCHARs in str, in place. */
+static inline void byteswap_wide_string(LPWSTR str, UINT num)
+{
+    LPWSTR end = str + num;
+    for (; str != end; str++)
+        *str = (WCHAR) MAKEWORD(HIBYTE((WORD) *str), LOBYTE((WORD) *str));
+}
+
 VOID ShowLastError(void)
 {
     DWORD error = GetLastError();
@@ -195,14 +203,52 @@ BOOL DoCloseFile(void)
     return(TRUE);
 }
 
+/* Guess the encoding of the first `size` bytes of `buffer`: a UTF-8 BOM wins,
+ * then IsTextUnicode()'s UTF-16 BOM tests; anything else is ENCODING_ANSI. */
+static inline ENCODING detect_encoding_of_buffer(const void* buffer, int size)
+{
+    /* BYTE, not char: 0xef/0xbb/0xbf are out of range for a signed char. */
+    static const BYTE bom_utf8[] = { 0xef, 0xbb, 0xbf };
+    /* In: the tests to run.  Out: the subset of them that passed. */
+    int flags = IS_TEXT_UNICODE_SIGNATURE |
+                IS_TEXT_UNICODE_REVERSE_SIGNATURE |
+                IS_TEXT_UNICODE_ODD_LENGTH;
+    if (size >= sizeof(bom_utf8) && !memcmp(buffer, bom_utf8, sizeof(bom_utf8)))
+        return ENCODING_UTF8;
+    IsTextUnicode(buffer, size, &flags);
+    if (flags & IS_TEXT_UNICODE_SIGNATURE)
+        return ENCODING_UTF16LE;
+    if (flags & IS_TEXT_UNICODE_REVERSE_SIGNATURE)
+        return ENCODING_UTF16BE;
+    return ENCODING_ANSI;
+}
 
-void DoOpenFile(LPCWSTR szFileName)
+/* Like SetWindowTextA, but the input is CP_UTF8 encoded instead of CP_ACP.
+ * lpTextInUtf8 must be NUL-terminated and must not include the BOM.
+ * Returns TRUE on success, FALSE if conversion, allocation or
+ * SetWindowTextW fails. */
+static BOOL SetWindowTextUtf8(HWND hwnd, LPCSTR lpTextInUtf8)
+{
+    BOOL ret = FALSE;
+    /* Length -1 makes the count include the NUL, so 0 can only mean failure. */
+    int lenW = MultiByteToWideChar(CP_UTF8, 0, lpTextInUtf8, -1, NULL, 0);
+    LPWSTR textW = lenW > 0 ? HeapAlloc(GetProcessHeap(), 0, lenW * sizeof(WCHAR)) : NULL;
+    if (!textW)
+        return FALSE;
+    if (MultiByteToWideChar(CP_UTF8, 0, lpTextInUtf8, -1, textW, lenW))
+        ret = SetWindowTextW(hwnd, textW);
+    HeapFree(GetProcessHeap(), 0, textW);
+    return ret;
+}
+
+void DoOpenFile(LPCWSTR szFileName, ENCODING enc)
 {
     static const WCHAR dotlog[] = { '.','L','O','G',0 };
     HANDLE hFile;
     LPSTR pTemp;
     DWORD size;
     DWORD dwNumRead;
+    BOOL succeeded;
     WCHAR log[5];
 
     /* Close any files and prompt to save changes */
@@ -224,9 +270,9 @@ void DoOpenFile(LPCWSTR szFileName)
 	ShowLastError();
 	return;
     }
-    size++;
 
-    pTemp = HeapAlloc(GetProcessHeap(), 0, size);
+    /* Extra memory for (WCHAR)'\0'-termination. */
+    pTemp = HeapAlloc(GetProcessHeap(), 0, size+2);
     if (!pTemp)
     {
 	CloseHandle(hFile);
@@ -234,7 +280,7 @@ void DoOpenFile(LPCWSTR szFileName)
 	return;
     }
 
-    if (!ReadFile(hFile, pTemp, size, &dwNumRead, NULL))
+    if (!ReadFile(hFile, pTemp, size+2, &dwNumRead, NULL))
     {
 	CloseHandle(hFile);
 	HeapFree(GetProcessHeap(), 0, pTemp);
@@ -243,12 +289,48 @@ void DoOpenFile(LPCWSTR szFileName)
     }
 
     CloseHandle(hFile);
-    pTemp[dwNumRead] = 0;
 
-    if((size -1) >= 2 && (BYTE)pTemp[0] == 0xff && (BYTE)pTemp[1] == 0xfe)
-	SetWindowTextW(Globals.hEdit, (LPWSTR)pTemp + 1);
-    else
-	SetWindowTextA(Globals.hEdit, pTemp);
+    pTemp[size] = 0;    /* make sure it's  (char)'\0'-terminated */
+    pTemp[size+1] = 0;  /* make sure it's (WCHAR)'\0'-terminated */
+
+    if (enc == ENCODING_AUTO)
+        enc = detect_encoding_of_buffer(pTemp, size);
+
+    /* SetWindowTextUtf8 and SetWindowTextA try to allocate memory, so we
+     * check if they succeed.
+     */
+    switch (enc)
+    {
+    case ENCODING_UTF16BE:
+        byteswap_wide_string((WCHAR*) pTemp, size/sizeof(WCHAR));
+        /* fall through */
+
+    case ENCODING_UTF16LE:
+        if (size >= 2 && (BYTE)pTemp[0] == 0xff && (BYTE)pTemp[1] == 0xfe)
+            succeeded = SetWindowTextW(Globals.hEdit, (LPWSTR)(pTemp+2));
+        else
+            succeeded = SetWindowTextW(Globals.hEdit, (LPWSTR)pTemp);
+        break;
+
+    case ENCODING_UTF8:
+        if (size >= 3 && (BYTE)pTemp[0] == 0xef && (BYTE)pTemp[1] == 0xbb &&
+                                                   (BYTE)pTemp[2] == 0xbf)
+            succeeded = SetWindowTextUtf8(Globals.hEdit, pTemp+3);
+        else
+            succeeded = SetWindowTextUtf8(Globals.hEdit, pTemp);
+        break;
+
+    default:
+        succeeded = SetWindowTextA(Globals.hEdit, pTemp);
+        break;
+    }
+
+    if (!succeeded)
+    {
+        ShowLastError();
+        HeapFree(GetProcessHeap(), 0, pTemp);
+        return;
+    }
 
     HeapFree(GetProcessHeap(), 0, pTemp);
 
@@ -308,7 +390,7 @@ VOID DIALOG_FileOpen(VOID)
 
 
     if (GetOpenFileNameW(&openfilename))
-        DoOpenFile(openfilename.lpstrFile);
+        DoOpenFile(openfilename.lpstrFile, ENCODING_AUTO);
 }
 
 
diff --git a/programs/notepad/dialog.h b/programs/notepad/dialog.h
index d927143..7aabb6e 100644
--- a/programs/notepad/dialog.h
+++ b/programs/notepad/dialog.h
@@ -54,4 +54,4 @@ int DIALOG_StringMsgBox(HWND hParent, int formatId, LPCWSTR szString, DWORD dwFl
 VOID ShowLastError(void);
 BOOL FileExists(LPCWSTR szFilename);
 BOOL DoCloseFile(void);
-void DoOpenFile(LPCWSTR szFileName);
+void DoOpenFile(LPCWSTR szFileName, ENCODING enc);
diff --git a/programs/notepad/main.c b/programs/notepad/main.c
index c195668..2156d6b 100644
--- a/programs/notepad/main.c
+++ b/programs/notepad/main.c
@@ -578,7 +578,7 @@ static LRESULT WINAPI NOTEPAD_WndProc(HWND hWnd, UINT msg, WPARAM wParam,
 
         DragQueryFileW(hDrop, 0, szFileName, ARRAY_SIZE(szFileName));
         DragFinish(hDrop);
-        DoOpenFile(szFileName);
+        DoOpenFile(szFileName, ENCODING_AUTO);
         break;
     }
     
@@ -689,7 +689,7 @@ static void HandleCommandLine(LPWSTR cmdline)
 
         if (file_exists)
         {
-            DoOpenFile(file_name);
+            DoOpenFile(file_name, ENCODING_AUTO);
             InvalidateRect(Globals.hMainWnd, NULL, FALSE);
             if (opt_print)
                 DIALOG_FilePrint();
@@ -698,7 +698,7 @@ static void HandleCommandLine(LPWSTR cmdline)
         {
             switch (AlertFileDoesNotExist(file_name)) {
             case IDYES:
-                DoOpenFile(file_name);
+                DoOpenFile(file_name, ENCODING_ANSI);
                 break;
 
             case IDNO:
diff --git a/programs/notepad/main.h b/programs/notepad/main.h
index f81c437..465897b 100644
--- a/programs/notepad/main.h
+++ b/programs/notepad/main.h
@@ -25,6 +25,15 @@
 
 #define MAX_STRING_LEN      255
 
+typedef enum /* Text encodings DoOpenFile() can be asked to use. */
+{
+    ENCODING_AUTO,      /* detect the encoding from the file contents */
+    ENCODING_ANSI,      /* text is handed to SetWindowTextA */
+    ENCODING_UTF16LE,   /* text is handed to SetWindowTextW, minus any BOM */
+    ENCODING_UTF16BE,   /* byte-swapped first, then handled as UTF-16LE */
+    ENCODING_UTF8       /* converted with CP_UTF8, minus any BOM */
+} ENCODING;
+
 typedef struct
 {
   HANDLE   hInstance;
-- 
1.5.6.3


More information about the wine-patches mailing list