Nikolay Sivov : xmllite: Initial support for reader input encoding detection.

Alexandre Julliard julliard at winehq.org
Tue Nov 20 13:52:22 CST 2012


Module: wine
Branch: master
Commit: d693790175ac8c8d1b2ac88ac2026e9a10f3102d
URL:    http://source.winehq.org/git/wine.git/?a=commit;h=d693790175ac8c8d1b2ac88ac2026e9a10f3102d

Author: Nikolay Sivov <nsivov at codeweavers.com>
Date:   Sun Nov 18 18:01:46 2012 -0500

xmllite: Initial support for reader input encoding detection.

---

 dlls/xmllite/reader.c          |  123 ++++++++++++++++++++++++++++++++--------
 dlls/xmllite/tests/reader.c    |   11 +++-
 dlls/xmllite/xmllite_private.h |    5 ++
 3 files changed, 115 insertions(+), 24 deletions(-)

diff --git a/dlls/xmllite/reader.c b/dlls/xmllite/reader.c
index ae05057..188b419 100644
--- a/dlls/xmllite/reader.c
+++ b/dlls/xmllite/reader.c
@@ -117,6 +117,14 @@ static inline void *m_alloc(IMalloc *imalloc, size_t len)
         return heap_alloc(len);
 }
 
+static inline void *m_realloc(IMalloc *imalloc, void *mem, size_t len)
+{
+    if (imalloc)
+        return IMalloc_Realloc(imalloc, mem, len);
+    else
+        return heap_realloc(mem, len);
+}
+
 static inline void m_free(IMalloc *imalloc, void *mem)
 {
     if (imalloc)
@@ -142,6 +150,11 @@ static inline void *readerinput_alloc(xmlreaderinput *input, size_t len)
     return m_alloc(input->imalloc, len);
 }
 
+static inline void *readerinput_realloc(xmlreaderinput *input, void *mem, size_t len)
+{
+    return m_realloc(input->imalloc, mem, len);
+}
+
 static inline void readerinput_free(xmlreaderinput *input, void *mem)
 {
     return m_free(input->imalloc, mem);
@@ -165,7 +178,7 @@ static void free_encoded_buffer(xmlreaderinput *input, encoded_buffer *buffer)
     readerinput_free(input, buffer->data);
 }
 
-static HRESULT get_code_page(xml_encoding encoding, UINT *cp)
+static HRESULT get_code_page(xml_encoding encoding, xmlreaderinput *input)
 {
     const struct xml_encoding_data *data;
 
@@ -176,12 +189,12 @@ static HRESULT get_code_page(xml_encoding encoding, UINT *cp)
     }
 
     data = &xml_encoding_map[encoding];
-    *cp = data->cp;
+    input->buffer->code_page = data->cp;
 
     return S_OK;
 }
 
-static HRESULT alloc_input_buffer(xmlreaderinput *input, xml_encoding encoding)
+static HRESULT alloc_input_buffer(xmlreaderinput *input)
 {
     input_buffer *buffer;
     HRESULT hr;
@@ -192,29 +205,20 @@ static HRESULT alloc_input_buffer(xmlreaderinput *input, xml_encoding encoding)
     if (!buffer) return E_OUTOFMEMORY;
 
     buffer->input = input;
-    hr = get_code_page(encoding, &buffer->code_page);
+    buffer->code_page = ~0; /* code page is unknown at this point */
+    hr = init_encoded_buffer(input, &buffer->utf16);
     if (hr != S_OK) {
         readerinput_free(input, buffer);
         return hr;
     }
 
-    hr = init_encoded_buffer(input, &buffer->utf16);
+    hr = init_encoded_buffer(input, &buffer->encoded);
     if (hr != S_OK) {
+        free_encoded_buffer(input, &buffer->utf16);
         readerinput_free(input, buffer);
         return hr;
     }
 
-    if (encoding != XmlEncoding_UTF16) {
-        hr = init_encoded_buffer(input, &buffer->encoded);
-        if (hr != S_OK) {
-            free_encoded_buffer(input, &buffer->utf16);
-            readerinput_free(input, buffer);
-            return hr;
-        }
-    }
-    else
-        memset(&buffer->encoded, 0, sizeof(buffer->encoded));
-
     input->buffer = buffer;
     return S_OK;
 }
@@ -226,7 +230,7 @@ static void free_input_buffer(input_buffer *buffer)
     readerinput_free(buffer->input, buffer);
 }
 
-static void xmlreaderinput_release_stream(xmlreaderinput *readerinput)
+static void readerinput_release_stream(xmlreaderinput *readerinput)
 {
     if (readerinput->stream) {
         ISequentialStream_Release(readerinput->stream);
@@ -236,11 +240,11 @@ static void xmlreaderinput_release_stream(xmlreaderinput *readerinput)
 
 /* Queries already stored interface for IStream/ISequentialStream.
    Interface supplied on creation will be overwritten */
-static HRESULT xmlreaderinput_query_for_stream(xmlreaderinput *readerinput)
+static HRESULT readerinput_query_for_stream(xmlreaderinput *readerinput)
 {
     HRESULT hr;
 
-    xmlreaderinput_release_stream(readerinput);
+    readerinput_release_stream(readerinput);
     hr = IUnknown_QueryInterface(readerinput->input, &IID_IStream, (void**)&readerinput->stream);
     if (hr != S_OK)
         hr = IUnknown_QueryInterface(readerinput->input, &IID_ISequentialStream, (void**)&readerinput->stream);
@@ -248,6 +252,59 @@ static HRESULT xmlreaderinput_query_for_stream(xmlreaderinput *readerinput)
     return hr;
 }
 
+/* reads a chunk to raw buffer */
+static HRESULT readerinput_growraw(xmlreaderinput *readerinput)
+{
+    encoded_buffer *buffer = &readerinput->buffer->encoded;
+    ULONG len = buffer->allocated - buffer->written, read;
+    HRESULT hr;
+
+    /* always try to get aligned to 4 bytes, so the only case we can get partially read characters is
+       variable width encodings like UTF-8 */
+    len = (len + 3) & ~3;
+    /* try to use allocated space or grow */
+    if (buffer->allocated - buffer->written < len)
+    {
+        buffer->allocated *= 2;
+        buffer->data = readerinput_realloc(readerinput, buffer->data, buffer->allocated);
+        len = buffer->allocated - buffer->written;
+    }
+
+    hr = ISequentialStream_Read(readerinput->stream, buffer->data + buffer->written, len, &read);
+    if (FAILED(hr)) return hr;
+    TRACE("requested %d, read %d, ret 0x%08x\n", len, read, hr);
+    buffer->written += read;
+
+    return hr;
+}
+
+static xml_encoding readerinput_detectencoding(xmlreaderinput *readerinput)
+{
+    encoded_buffer *buffer = &readerinput->buffer->encoded;
+
+    /* try start symbols if we have enough data to do that, input buffer should contain
+       first chunk already */
+    if (buffer->written >= 4)
+    {
+        static char startA[] = {'<','?','x','m'};
+        static WCHAR startW[] = {'<','?'};
+
+        if (!memcmp(buffer->data, startA, sizeof(startA))) return XmlEncoding_UTF8;
+        if (!memcmp(buffer->data, startW, sizeof(startW))) return XmlEncoding_UTF16;
+    }
+
+    /* try with BOM now */
+    if (buffer->written >= 3)
+    {
+        static char utf8bom[] = {0xef,0xbb,0xbf};
+        static char utf16lebom[] = {0xff,0xfe};
+        if (!memcmp(buffer->data, utf8bom, sizeof(utf8bom))) return XmlEncoding_UTF8;
+        if (!memcmp(buffer->data, utf16lebom, sizeof(utf16lebom))) return XmlEncoding_UTF16;
+    }
+
+    return XmlEncoding_Unknown;
+}
+
 static HRESULT WINAPI xmlreader_QueryInterface(IXmlReader *iface, REFIID riid, void** ppvObject)
 {
     xmlreader *This = impl_from_IXmlReader(iface);
@@ -305,7 +362,7 @@ static HRESULT WINAPI xmlreader_SetInput(IXmlReader* iface, IUnknown *input)
 
     if (This->input)
     {
-        xmlreaderinput_release_stream(This->input);
+        readerinput_release_stream(This->input);
         IUnknown_Release(&This->input->IXmlReaderInput_iface);
         This->input = NULL;
     }
@@ -333,7 +390,7 @@ static HRESULT WINAPI xmlreader_SetInput(IXmlReader* iface, IUnknown *input)
     }
 
     /* set stream for supplied IXmlReaderInput */
-    hr = xmlreaderinput_query_for_stream(This->input);
+    hr = readerinput_query_for_stream(This->input);
     if (hr == S_OK)
         This->state = XmlReadState_Initial;
 
@@ -386,7 +443,27 @@ static HRESULT WINAPI xmlreader_SetProperty(IXmlReader* iface, UINT property, LO
 
 static HRESULT WINAPI xmlreader_Read(IXmlReader* iface, XmlNodeType *node_type)
 {
-    FIXME("(%p %p): stub\n", iface, node_type);
+    xmlreader *This = impl_from_IXmlReader(iface);
+
+    FIXME("(%p)->(%p): stub\n", This, node_type);
+
+    if (This->state == XmlReadState_Closed) return S_FALSE;
+
+    /* if it's the first call for a new input we need to detect stream encoding */
+    if (This->state == XmlReadState_Initial)
+    {
+        xml_encoding enc;
+        HRESULT hr;
+
+        hr = readerinput_growraw(This->input);
+        if (FAILED(hr)) return hr;
+
+        /* try to detect encoding by BOM or data and set input code page */
+        enc = readerinput_detectencoding(This->input);
+        TRACE("detected encoding %d\n", enc);
+        get_code_page(enc, This->input);
+    }
+
     return E_NOTIMPL;
 }
 
@@ -683,7 +760,7 @@ HRESULT WINAPI CreateXmlReaderInputWithEncodingName(IUnknown *stream,
     readerinput->stream = NULL;
     if (imalloc) IMalloc_AddRef(imalloc);
 
-    hr = alloc_input_buffer(readerinput, XmlEncoding_UTF16);
+    hr = alloc_input_buffer(readerinput);
     if (hr != S_OK)
     {
         readerinput_free(readerinput, readerinput);
diff --git a/dlls/xmllite/tests/reader.c b/dlls/xmllite/tests/reader.c
index 582ad48..f5b7680 100644
--- a/dlls/xmllite/tests/reader.c
+++ b/dlls/xmllite/tests/reader.c
@@ -551,15 +551,24 @@ static void test_readerinput(void)
 static void test_reader_state(void)
 {
     IXmlReader *reader;
+    XmlNodeType nodetype;
     HRESULT hr;
 
-    hr = pCreateXmlReader(&IID_IXmlReader, (LPVOID*)&reader, NULL);
+    hr = pCreateXmlReader(&IID_IXmlReader, (void**)&reader, NULL);
     ok(hr == S_OK, "Expected S_OK, got %08x\n", hr);
 
     /* invalid arguments */
     hr = IXmlReader_GetProperty(reader, XmlReaderProperty_ReadState, NULL);
     ok(hr == E_INVALIDARG, "Expected E_INVALIDARG, got %08x\n", hr);
 
+    /* attempt to read on closed reader */
+    test_read_state(reader, XmlReadState_Closed, -1, 0);
+if (0)
+{
+    /* newer versions crash here, probably because no input was set */
+    hr = IXmlReader_Read(reader, &nodetype);
+    ok(hr == S_FALSE, "got %08x\n", hr);
+}
     IXmlReader_Release(reader);
 }
 
diff --git a/dlls/xmllite/xmllite_private.h b/dlls/xmllite/xmllite_private.h
index 1677e5c..97993e1 100644
--- a/dlls/xmllite/xmllite_private.h
+++ b/dlls/xmllite/xmllite_private.h
@@ -27,6 +27,11 @@ static inline void *heap_alloc(size_t len)
     return HeapAlloc(GetProcessHeap(), 0, len);
 }
 
+static inline void *heap_realloc(void *mem, size_t len)
+{
+    return HeapReAlloc(GetProcessHeap(), 0, mem, len);
+}
+
 static inline BOOL heap_free(void *mem)
 {
     return HeapFree(GetProcessHeap(), 0, mem);




More information about the wine-cvs mailing list