[2/5] notepad: Improve encoding detection when opening files. (try 2)

Alexander Scott-Johns alexander.scott.johns at googlemail.com
Wed Jul 1 13:55:28 CDT 2009


(Based on patch [1/4] of previous attempt.)

Try 2: Simplified the patch a bit by removing the enc argument to
DoOpenFile. The argument is added back in patch [4/5].

---
 programs/notepad/dialog.c |   98 +++++++++++++++++++++++++++++++++++++++++----
 programs/notepad/main.h   |    8 ++++
 2 files changed, 98 insertions(+), 8 deletions(-)
-------------- next part --------------
From 6198ce38a92cf9b75d19133642b7d62c11e3a911 Mon Sep 17 00:00:00 2001
From: Alexander Scott-Johns <alexander.scott.johns at googlemail.com>
Date: Mon, 29 Jun 2009 22:24:59 +0100
Subject: notepad: Improve encoding detection when opening files.

---
 programs/notepad/dialog.c |   98 +++++++++++++++++++++++++++++++++++++++++----
 programs/notepad/main.h   |    8 ++++
 2 files changed, 98 insertions(+), 8 deletions(-)

diff --git a/programs/notepad/dialog.c b/programs/notepad/dialog.c
index 026fe25..59fc61b 100644
--- a/programs/notepad/dialog.c
+++ b/programs/notepad/dialog.c
@@ -37,6 +37,14 @@ static const WCHAR helpfileW[] = { 'n','o','t','e','p','a','d','.','h','l','p',0
 
 static INT_PTR WINAPI DIALOG_PAGESETUP_DlgProc(HWND hDlg, UINT msg, WPARAM wParam, LPARAM lParam);
 
+/* Exchange the high and low byte of each of the first num WCHARs of str. */
+static inline void byteswap_wide_string(LPWSTR str, UINT num)
+{
+    LPWSTR end = str + num;
+    for (; str < end; str++)
+        *str = (WCHAR) MAKEWORD(HIBYTE((WORD) *str), LOBYTE((WORD) *str));
+}
+
 VOID ShowLastError(void)
 {
     DWORD error = GetLastError();
@@ -195,6 +203,43 @@ BOOL DoCloseFile(void)
     return(TRUE);
 }
 
+/* Guess the encoding of buffer from its byte-order mark; ENCODING_ANSI if none. */
+static inline ENCODING detect_encoding_of_buffer(const void* buffer, int size)
+{
+    static const unsigned char bom_utf8[] = { 0xef, 0xbb, 0xbf };
+    int flags = IS_TEXT_UNICODE_SIGNATURE |
+                IS_TEXT_UNICODE_REVERSE_SIGNATURE |
+                IS_TEXT_UNICODE_ODD_LENGTH;
+
+    /* Compare as int: a negative size must not be promoted to a huge size_t. */
+    if (size >= (int) sizeof(bom_utf8) && !memcmp(buffer, bom_utf8, sizeof(bom_utf8)))
+        return ENCODING_UTF8;
+    /* flags is in/out: tests to run on input, tests that passed on output. */
+    IsTextUnicode(buffer, size, &flags);
+    if (flags & IS_TEXT_UNICODE_SIGNATURE)
+        return ENCODING_UTF16LE;
+    if (flags & IS_TEXT_UNICODE_REVERSE_SIGNATURE)
+        return ENCODING_UTF16BE;
+    return ENCODING_ANSI;
+}
+
+/* Like SetWindowTextA, but lpTextInUtf8 is CP_UTF8 encoded instead of CP_ACP.
+ * lpTextInUtf8 must be NUL-terminated and must not include the BOM.
+ * Returns FALSE if conversion, allocation or SetWindowTextW fails, else TRUE.
+ */
+static BOOL SetWindowTextUtf8(HWND hwnd, LPCSTR lpTextInUtf8)
+{
+    BOOL ret = FALSE;
+    int lenW = MultiByteToWideChar(CP_UTF8, 0, lpTextInUtf8, -1, NULL, 0);
+    /* A zero length means the conversion failed; there is nothing to set. */
+    LPWSTR textW = lenW > 0 ? HeapAlloc(GetProcessHeap(), 0, lenW * sizeof(WCHAR)) : NULL;
+    if (!textW)
+        return FALSE;
+    if (MultiByteToWideChar(CP_UTF8, 0, lpTextInUtf8, -1, textW, lenW))
+        ret = SetWindowTextW(hwnd, textW);
+    HeapFree(GetProcessHeap(), 0, textW);
+    return ret;
+}
 
 void DoOpenFile(LPCWSTR szFileName)
 {
@@ -203,6 +248,8 @@ void DoOpenFile(LPCWSTR szFileName)
     LPSTR pTemp;
     DWORD size;
     DWORD dwNumRead;
+    ENCODING enc;
+    BOOL succeeded;
     WCHAR log[5];
 
     /* Close any files and prompt to save changes */
@@ -224,9 +271,9 @@ void DoOpenFile(LPCWSTR szFileName)
 	ShowLastError();
 	return;
     }
-    size++;
 
-    pTemp = HeapAlloc(GetProcessHeap(), 0, size);
+    /* Extra memory for (WCHAR)'\0'-termination. */
+    pTemp = HeapAlloc(GetProcessHeap(), 0, size+2);
     if (!pTemp)
     {
 	CloseHandle(hFile);
@@ -234,7 +281,7 @@ void DoOpenFile(LPCWSTR szFileName)
 	return;
     }
 
-    if (!ReadFile(hFile, pTemp, size, &dwNumRead, NULL))
+    if (!ReadFile(hFile, pTemp, size+2, &dwNumRead, NULL))
     {
 	CloseHandle(hFile);
 	HeapFree(GetProcessHeap(), 0, pTemp);
@@ -243,12 +290,47 @@ void DoOpenFile(LPCWSTR szFileName)
     }
 
     CloseHandle(hFile);
-    pTemp[dwNumRead] = 0;
 
-    if((size -1) >= 2 && (BYTE)pTemp[0] == 0xff && (BYTE)pTemp[1] == 0xfe)
-	SetWindowTextW(Globals.hEdit, (LPWSTR)pTemp + 1);
-    else
-	SetWindowTextA(Globals.hEdit, pTemp);
+    pTemp[size] = 0;    /* make sure it's  (char)'\0'-terminated */
+    pTemp[size+1] = 0;  /* make sure it's (WCHAR)'\0'-terminated */
+
+    enc = detect_encoding_of_buffer(pTemp, size);
+
+    /* SetWindowTextUtf8 and SetWindowTextA try to allocate memory, so we
+     * check if they succeed.
+     */
+    switch (enc)
+    {
+    case ENCODING_UTF16BE:
+        byteswap_wide_string((WCHAR*) pTemp, size/sizeof(WCHAR));
+        /* fall through */
+
+    case ENCODING_UTF16LE:
+        if (size >= 2 && (BYTE)pTemp[0] == 0xff && (BYTE)pTemp[1] == 0xfe)
+            succeeded = SetWindowTextW(Globals.hEdit, (LPWSTR)(pTemp+2));
+        else
+            succeeded = SetWindowTextW(Globals.hEdit, (LPWSTR)pTemp);
+        break;
+
+    case ENCODING_UTF8:
+        if (size >= 3 && (BYTE)pTemp[0] == 0xef && (BYTE)pTemp[1] == 0xbb &&
+                                                   (BYTE)pTemp[2] == 0xbf)
+            succeeded = SetWindowTextUtf8(Globals.hEdit, pTemp+3);
+        else
+            succeeded = SetWindowTextUtf8(Globals.hEdit, pTemp);
+        break;
+
+    default:
+        succeeded = SetWindowTextA(Globals.hEdit, pTemp);
+        break;
+    }
+
+    if (!succeeded)
+    {
+        ShowLastError();
+        HeapFree(GetProcessHeap(), 0, pTemp);
+        return;
+    }
 
     HeapFree(GetProcessHeap(), 0, pTemp);
 
diff --git a/programs/notepad/main.h b/programs/notepad/main.h
index f81c437..bb9b7cc 100644
--- a/programs/notepad/main.h
+++ b/programs/notepad/main.h
@@ -25,6 +25,14 @@
 
 #define MAX_STRING_LEN      255
 
+typedef enum                /* text encodings notepad distinguishes when opening a file */
+{
+    ENCODING_ANSI,          /* fallback when no byte-order mark is detected */
+    ENCODING_UTF16LE,       /* UTF-16, little-endian (BOM bytes ff fe) */
+    ENCODING_UTF16BE,       /* UTF-16, big-endian; byte-swapped before display */
+    ENCODING_UTF8           /* UTF-8 (BOM bytes ef bb bf) */
+} ENCODING;
+
 typedef struct
 {
   HANDLE   hInstance;
-- 
1.5.6.3


More information about the wine-patches mailing list