[PATCH 1/5] xmllite/reader: Improve input stream encoding detection

Mon Mar 6 03:35:55 CST 2017

Signed-off-by: Nikolay Sivov <nsivov at codeweavers.com>
---
 dlls/xmllite/reader.c       | 20 +++++++++-----
 dlls/xmllite/tests/reader.c | 67 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 79 insertions(+), 8 deletions(-)

diff --git a/dlls/xmllite/reader.c b/dlls/xmllite/reader.c
index a880bee256..699af4b2a1 100644
--- a/dlls/xmllite/reader.c
+++ b/dlls/xmllite/reader.c
@@ -1,7 +1,7 @@
 /*
  * IXmlReader implementation
  *
- * Copyright 2010, 2012-2013, 2016 Nikolay Sivov
+ * Copyright 2010, 2012-2013, 2016-2017 Nikolay Sivov
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
@@ -93,6 +93,8 @@ static const WCHAR gtW[] = {'>',0};
 static const WCHAR commentW[] = {'<','!','-','-',0};
 static const WCHAR piW[] = {'<','?',0};
 
+static BOOL is_namestartchar(WCHAR ch);
+
 static const char *debugstr_nodetype(XmlNodeType nodetype)
 {
     static const char * const type_names[] =
@@ -840,10 +842,9 @@ static inline BOOL readerinput_is_utf8(xmlreaderinput *readerinput)
 static HRESULT readerinput_detectencoding(xmlreaderinput *readerinput, xml_encoding *enc)
 {
     encoded_buffer *buffer = &readerinput->buffer->encoded;
-    static const WCHAR startW[] = {'<','?'};
-    static const WCHAR commentW[] = {'<','!'};
     static const char utf8bom[] = {0xef,0xbb,0xbf};
     static const char utf16lebom[] = {0xff,0xfe};
+    WCHAR *ptrW;
 
     *enc = XmlEncoding_Unknown;
 
@@ -854,13 +855,17 @@ static HRESULT readerinput_detectencoding(xmlreaderinput *readerinput, xml_encod
         if (buffer->written <= 3) return MX_E_INPUTEND;
     }
 
+    ptrW = (WCHAR *)buffer->data;
     /* try start symbols if we have enough data to do that, input buffer should contain
        first chunk already */
     if (readerinput_is_utf8(readerinput))
         *enc = XmlEncoding_UTF8;
-    else if (!memcmp(buffer->data, startW, sizeof(startW)) ||
-             !memcmp(buffer->data, commentW, sizeof(commentW)))
-        *enc = XmlEncoding_UTF16;
+    else if (*ptrW == '<')
+    {
+        ptrW++;
+        if (*ptrW == '?' || *ptrW == '!' || is_namestartchar(*ptrW))
+            *enc = XmlEncoding_UTF16;
+    }
     /* try with BOM now */
     else if (!memcmp(buffer->data, utf8bom, sizeof(utf8bom)))
     {
@@ -2492,7 +2497,8 @@ static HRESULT reader_parse_nextnode(xmlreader *reader)
 
                 /* try to detect encoding by BOM or data and set input code page */
                 hr = readerinput_detectencoding(reader->input, &enc);
-                TRACE("detected encoding %s, 0x%08x\n", debugstr_w(xml_encoding_map[enc].name), hr);
+                TRACE("detected encoding %s, 0x%08x\n", enc == XmlEncoding_Unknown ? "(unknown)" :
+                        debugstr_w(xml_encoding_map[enc].name), hr);
                 if (FAILED(hr)) return hr;
 
                 /* always switch first time cause we have to put something in */
diff --git a/dlls/xmllite/tests/reader.c b/dlls/xmllite/tests/reader.c
index f7f738e682..fba73c9e09 100644
--- a/dlls/xmllite/tests/reader.c
+++ b/dlls/xmllite/tests/reader.c
@@ -49,7 +49,7 @@ static void free_str(WCHAR *str)
 static const char xmldecl_full[] = "\xef\xbb\xbf<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n";
 static const char xmldecl_short[] = "<?xml version=\"1.0\"?><RegistrationInfo/>";
 
-static IStream *create_stream_on_data(const char *data, int size)
+static IStream *create_stream_on_data(const void *data, unsigned int size)
 {
     IStream *stream = NULL;
     HGLOBAL hglobal;
@@ -2086,6 +2086,70 @@ static void test_read_charref(void)
     IStream_Release(stream);
 }
 
+/* Feed short documents to the reader as raw 8-bit and WCHAR byte streams, with and
+ * without a leading BOM, and check that the first node can be read from each. */
+static void test_encoding_detection(void)
+{
+    static const struct encoding_testW
+    {
+        WCHAR text[16];
+    }
+    encoding_testsW[] =
+    {
+        { { '<','?','p','i',' ','?','>',0 } },
+        { { '<','!','-','-',' ','c','-','-','>',0 } },
+        { { 0xfeff,'<','a','/','>',0 } }, /* UTF-16 BOM */
+        { { '<','a','/','>',0 } },
+    };
+    static const char *encoding_testsA[] =
+    {
+        "<?pi ?>",
+        "<!-- comment -->",
+        "\xef\xbb\xbf<a/>", /* UTF-8 BOM */
+        "<a/>",
+    };
+    IXmlReader *reader;
+    XmlNodeType type;
+    IStream *stream;
+    unsigned int i;
+    HRESULT hr;
+
+    hr = CreateXmlReader(&IID_IXmlReader, (void **)&reader, NULL);
+    ok(hr == S_OK, "S_OK, got %08x\n", hr);
+
+    /* there's no way to query the detected encoding back, so just verify that the document is browsable */
+    for (i = 0; i < sizeof(encoding_testsA)/sizeof(encoding_testsA[0]); i++)
+    {
+        stream = create_stream_on_data(encoding_testsA[i], strlen(encoding_testsA[i]));
+        hr = IXmlReader_SetInput(reader, (IUnknown *)stream);
+        ok(hr == S_OK, "%u: got %08x\n", i, hr);
+
+        type = XmlNodeType_None;
+        hr = IXmlReader_Read(reader, &type);
+        ok(hr == S_OK, "%u: got %08x\n", i, hr);
+        ok(type != XmlNodeType_None, "%u: unexpected node type %d\n", i, type);
+
+        IStream_Release(stream);
+    }
+
+    for (i = 0; i < sizeof(encoding_testsW)/sizeof(encoding_testsW[0]); i++)
+    {
+        /* stream size is passed in bytes, string terminator excluded */
+        stream = create_stream_on_data(encoding_testsW[i].text, lstrlenW(encoding_testsW[i].text) * sizeof(WCHAR));
+        hr = IXmlReader_SetInput(reader, (IUnknown *)stream);
+        ok(hr == S_OK, "%u: got %08x\n", i, hr);
+
+        type = XmlNodeType_None;
+        hr = IXmlReader_Read(reader, &type);
+        ok(hr == S_OK, "%u: got %08x\n", i, hr);
+        ok(type != XmlNodeType_None, "%u: unexpected node type %d\n", i, type);
+
+        IStream_Release(stream);
+    }
+
+    IXmlReader_Release(reader);
+}
+
 START_TEST(reader)
 {
     test_reader_create();
@@ -2108,4 +2172,5 @@ START_TEST(reader)
     test_prefix();
     test_namespaceuri();
     test_read_charref();
+    test_encoding_detection();
 }
-- 
2.11.0