Nikolay Sivov : xmllite: Implement initial encoding switching.

Alexandre Julliard julliard at winehq.org
Mon Nov 26 15:19:20 CST 2012


Module: wine
Branch: master
Commit: eddd7fcf29994645e4047fe9738109defa78100d
URL:    http://source.winehq.org/git/wine.git/?a=commit;h=eddd7fcf29994645e4047fe9738109defa78100d

Author: Nikolay Sivov <nsivov at codeweavers.com>
Date:   Sun Nov 25 00:10:46 2012 -0500

xmllite: Implement initial encoding switching.

---

 dlls/xmllite/reader.c       |  127 +++++++++++++++++++++++++++++++++++--------
 dlls/xmllite/tests/reader.c |    2 +-
 2 files changed, 106 insertions(+), 23 deletions(-)

diff --git a/dlls/xmllite/reader.c b/dlls/xmllite/reader.c
index 8ad78d8..b0ceac4 100644
--- a/dlls/xmllite/reader.c
+++ b/dlls/xmllite/reader.c
@@ -61,6 +61,7 @@ static const struct xml_encoding_data xml_encoding_map[] = {
 typedef struct
 {
     char *data;
+    char *cur;
     unsigned int allocated;
     unsigned int written;
 } encoded_buffer;
@@ -187,6 +188,7 @@ static HRESULT init_encoded_buffer(xmlreaderinput *input, encoded_buffer *buffer
     if (!buffer->data) return E_OUTOFMEMORY;
 
     memset(buffer->data, 0, 4);
+    buffer->cur = buffer->data;
     buffer->allocated = initial_len;
     buffer->written = 0;
 
@@ -198,18 +200,15 @@ static void free_encoded_buffer(xmlreaderinput *input, encoded_buffer *buffer)
     readerinput_free(input, buffer->data);
 }
 
-static HRESULT get_code_page(xml_encoding encoding, xmlreaderinput *input)
+static HRESULT get_code_page(xml_encoding encoding, UINT *cp)
 {
-    const struct xml_encoding_data *data;
-
     if (encoding == XmlEncoding_Unknown)
     {
         FIXME("unsupported encoding %d\n", encoding);
         return E_NOTIMPL;
     }
 
-    data = &xml_encoding_map[encoding];
-    input->buffer->code_page = data->cp;
+    *cp = xml_encoding_map[encoding].cp;
 
     return S_OK;
 }
@@ -324,31 +323,112 @@ static HRESULT readerinput_growraw(xmlreaderinput *readerinput)
     return hr;
 }
 
-static xml_encoding readerinput_detectencoding(xmlreaderinput *readerinput)
+/* grows UTF-16 buffer so it has at least 'length' bytes free on return */
+static void readerinput_grow(xmlreaderinput *readerinput, int length)
+{
+    encoded_buffer *buffer = &readerinput->buffer->utf16;
+
+    /* grow if needed, plus 4 bytes to be sure null terminator will fit in */
+    if (buffer->allocated < buffer->written + length + 4)
+    {
+        int grown_size = max(2*buffer->allocated, buffer->allocated + length);
+        buffer->data = readerinput_realloc(readerinput, buffer->data, grown_size);
+        buffer->allocated = grown_size;
+    }
+}
+
+static HRESULT readerinput_detectencoding(xmlreaderinput *readerinput, xml_encoding *enc)
 {
     encoded_buffer *buffer = &readerinput->buffer->encoded;
+    static char startA[] = {'<','?','x','m'};
+    static WCHAR startW[] = {'<','?'};
+    static char utf8bom[] = {0xef,0xbb,0xbf};
+    static char utf16lebom[] = {0xff,0xfe};
+
+    *enc = XmlEncoding_Unknown;
+
+    if (buffer->written <= 3) return MX_E_INPUTEND;
 
     /* try start symbols if we have enough data to do that, input buffer should contain
        first chunk already */
-    if (buffer->written >= 4)
+    if (!memcmp(buffer->data, startA, sizeof(startA)))
+        *enc = XmlEncoding_UTF8;
+    else if (!memcmp(buffer->data, startW, sizeof(startW)))
+        *enc = XmlEncoding_UTF16;
+    /* try with BOM now */
+    else if (!memcmp(buffer->data, utf8bom, sizeof(utf8bom)))
     {
-        static char startA[] = {'<','?','x','m'};
-        static WCHAR startW[] = {'<','?'};
-
-        if (!memcmp(buffer->data, startA, sizeof(startA))) return XmlEncoding_UTF8;
-        if (!memcmp(buffer->data, startW, sizeof(startW))) return XmlEncoding_UTF16;
+        buffer->cur += sizeof(utf8bom);
+        *enc = XmlEncoding_UTF8;
+    }
+    else if (!memcmp(buffer->data, utf16lebom, sizeof(utf16lebom)))
+    {
+        buffer->cur += sizeof(utf16lebom);
+        *enc = XmlEncoding_UTF16;
     }
 
-    /* try with BOM now */
-    if (buffer->written >= 3)
+    return S_OK;
+}
+
+static int readerinput_get_utf8_convlen(xmlreaderinput *readerinput)
+{
+    encoded_buffer *buffer = &readerinput->buffer->encoded;
+    int len = buffer->written;
+
+    /* complete single byte char */
+    if (!(buffer->data[len-1] & 0x80)) return len;
+
+    /* find start byte of multibyte char */
+    while (--len && !(buffer->data[len] & 0xc0))
+        ;
+
+    return len;
+}
+
+/* returns byte length of complete char sequence for specified code page */
+static int readerinput_get_convlen(xmlreaderinput *readerinput, UINT cp)
+{
+    encoded_buffer *buffer = &readerinput->buffer->encoded;
+    int len = buffer->written;
+
+    if (cp == CP_UTF8)
+        len = readerinput_get_utf8_convlen(readerinput);
+    else
+        len = buffer->written;
+
+    return len - (buffer->cur - buffer->data);
+}
+
+/* note that raw buffer content is kept */
+static void readerinput_switchencoding(xmlreaderinput *readerinput, xml_encoding enc)
+{
+    encoded_buffer *src = &readerinput->buffer->encoded;
+    encoded_buffer *dest = &readerinput->buffer->utf16;
+    int len, dest_len;
+    HRESULT hr;
+    UINT cp;
+
+    hr = get_code_page(enc, &cp);
+    if (FAILED(hr)) return;
+
+    len = readerinput_get_convlen(readerinput, cp);
+
+    TRACE("switching to cp %d\n", cp);
+
+    /* just copy in this case */
+    if (enc == XmlEncoding_UTF16)
     {
-        static char utf8bom[] = {0xef,0xbb,0xbf};
-        static char utf16lebom[] = {0xff,0xfe};
-        if (!memcmp(buffer->data, utf8bom, sizeof(utf8bom))) return XmlEncoding_UTF8;
-        if (!memcmp(buffer->data, utf16lebom, sizeof(utf16lebom))) return XmlEncoding_UTF16;
+        readerinput_grow(readerinput, len);
+        memcpy(dest->data, src->cur, len);
+        readerinput->buffer->code_page = cp;
+        return;
     }
 
-    return XmlEncoding_Unknown;
+    dest_len = MultiByteToWideChar(cp, 0, src->cur, len, NULL, 0);
+    readerinput_grow(readerinput, dest_len);
+    MultiByteToWideChar(cp, 0, src->cur, len, (WCHAR*)dest->data, dest_len);
+    dest->data[dest_len] = 0;
+    readerinput->buffer->code_page = cp;
 }
 
 static HRESULT WINAPI xmlreader_QueryInterface(IXmlReader *iface, REFIID riid, void** ppvObject)
@@ -505,9 +585,12 @@ static HRESULT WINAPI xmlreader_Read(IXmlReader* iface, XmlNodeType *node_type)
         if (FAILED(hr)) return hr;
 
         /* try to detect encoding by BOM or data and set input code page */
-        enc = readerinput_detectencoding(This->input);
-        TRACE("detected encoding %d\n", enc);
-        get_code_page(enc, This->input);
+        hr = readerinput_detectencoding(This->input, &enc);
+        TRACE("detected encoding %d, 0x%08x\n", enc, hr);
+        if (FAILED(hr)) return hr;
+
+        /* always switch the first time because we have to put something in */
+        readerinput_switchencoding(This->input, enc);
     }
 
     return E_NOTIMPL;
diff --git a/dlls/xmllite/tests/reader.c b/dlls/xmllite/tests/reader.c
index 7d77fe9..729d134 100644
--- a/dlls/xmllite/tests/reader.c
+++ b/dlls/xmllite/tests/reader.c
@@ -52,7 +52,7 @@ static const char *debugstr_guid(REFIID riid)
     return buf;
 }
 
-static const char xmldecl_full[] = "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n";
+static const char xmldecl_full[] = "\xef\xbb\xbf<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n";
 
 static IStream *create_stream_on_data(const char *data, int size)
 {




More information about the wine-cvs mailing list