[4/9] webservices: Decode XML entities.

Mon Jul 4 04:35:40 CDT 2016

Signed-off-by: Hans Leidekker <hans at codeweavers.com>
---
 dlls/webservices/reader.c       | 136 +++++++++++++++++++++++++++++++++++++++-
 dlls/webservices/tests/reader.c | 115 +++++++++++++++++++++++++++++++++
 2 files changed, 248 insertions(+), 3 deletions(-)

diff --git a/dlls/webservices/reader.c b/dlls/webservices/reader.c
index eb89af4..8283e7a 100644
--- a/dlls/webservices/reader.c
+++ b/dlls/webservices/reader.c
@@ -1008,11 +1008,129 @@ static HRESULT parse_name( const unsigned char *str, unsigned int len,
     return S_OK;
 }
 
+static int codepoint_to_utf8( int cp, unsigned char *dst ) /* encode cp as UTF-8 into dst; returns bytes written (1-4) or -1 if cp is rejected */
+{
+    if (cp < 0x80) /* one-byte form */
+    {
+        *dst = cp;
+        return 1;
+    }
+    if (cp < 0x800) /* two-byte form */
+    {
+        dst[1] = 0x80 | (cp & 0x3f); /* low 6 bits go into the trailing byte */
+        cp >>= 6;
+        dst[0] = 0xc0 | cp; /* remaining bits go into the lead byte */
+        return 2;
+    }
+    if ((cp >= 0xd800 && cp <= 0xdfff) || cp == 0xfffe || cp == 0xffff) return -1; /* 0xd800-0xdfff, 0xfffe and 0xffff are refused; nothing is written */
+    if (cp < 0x10000) /* three-byte form */
+    {
+        dst[2] = 0x80 | (cp & 0x3f); /* bytes are filled from last to first, 6 bits at a time */
+        cp >>= 6;
+        dst[1] = 0x80 | (cp & 0x3f);
+        cp >>= 6;
+        dst[0] = 0xe0 | cp;
+        return 3;
+    }
+    dst[3] = 0x80 | (cp & 0x3f); /* everything else uses the four-byte form; no upper bound on cp is checked here */
+    cp >>= 6;
+    dst[2] = 0x80 | (cp & 0x3f);
+    cp >>= 6;
+    dst[1] = 0x80 | (cp & 0x3f);
+    cp >>= 6;
+    dst[0] = 0xf0 | cp;
+    return 4;
+}
+
+static HRESULT decode_text( const unsigned char *str, ULONG len, unsigned char *ret, ULONG *ret_len  ) /* copy len bytes of str into ret, replacing entity references; *ret_len receives the byte count written; returns S_OK or WS_E_INVALID_FORMAT */
+{
+    const unsigned char *p = str; /* read cursor */
+    unsigned char *q = ret; /* write cursor; the size of ret is not checked in this function */
+
+    *ret_len = 0;
+    while (len)
+    {
+        if (*p == '&')
+        {
+            p++; len--; /* step over '&' */
+            if (!len) return WS_E_INVALID_FORMAT; /* '&' was the last input byte */
+
+            if (len >= 3 && !memcmp( p, "lt;", 3 )) /* named entities are matched case-sensitively, terminator included */
+            {
+                *q++ = '<';
+                p += 3;
+                len -= 3;
+            }
+            else if (len >= 3 && !memcmp( p, "gt;", 3 ))
+            {
+                *q++ = '>';
+                p += 3;
+                len -= 3;
+            }
+            else if (len >= 5 && !memcmp( p, "quot;", 5 ))
+            {
+                *q++ = '"';
+                p += 5;
+                len -= 5;
+            }
+            else if (len >= 4 && !memcmp( p, "amp;", 4 ))
+            {
+                *q++ = '&';
+                p += 4;
+                len -= 4;
+            }
+            else if (len >= 5 && !memcmp( p, "apos;", 5 ))
+            {
+                *q++ = '\'';
+                p += 5;
+                len -= 5;
+            }
+            else if (*p == '#') /* character reference */
+            {
+                ULONG start, nb_digits, i;
+                int len_utf8, cp = 0;
+
+                p++; len--; /* step over '#' */
+                if (!len || *p != 'x') return WS_E_INVALID_FORMAT; /* only the "&#x...;" hexadecimal form is accepted */
+                p++; len--; /* step over 'x' */
+
+                start = len; /* remaining length before the digits, used to count them */
+                while (len && isxdigit( *p )) { p++; len--; };
+                if (!len) return WS_E_INVALID_FORMAT; /* input ended before any terminator */
+
+                p -= nb_digits = start - len; /* rewind p to the first digit; len stays past the digits */
+                if (!nb_digits || nb_digits > 5 || p[nb_digits] != ';') return WS_E_INVALID_FORMAT; /* need 1 to 5 digits followed by ';' */
+                for (i = 0; i < nb_digits; i++) /* accumulate the hexadecimal value */
+                {
+                    cp *= 16;
+                    if (*p >= '0' && *p <= '9') cp += *p - '0';
+                    else if (*p >= 'a' && *p <= 'f') cp += *p - 'a' + 10;
+                    else cp += *p - 'A' + 10; /* any other digit that passed isxdigit() is treated as 'A'-'F' */
+                    p++;
+                }
+                p++; len--; /* step over ';' */
+                if ((len_utf8 = codepoint_to_utf8( cp, q )) < 0) return WS_E_INVALID_FORMAT; /* negative result means the code point was refused */
+                *ret_len += len_utf8;
+                q += len_utf8;
+                continue; /* length already accounted for; skip the single-byte increment below */
+            }
+            else return WS_E_INVALID_FORMAT; /* unrecognized entity name */
+        }
+        else
+        {
+            *q++ = *p++; /* ordinary byte is copied unchanged */
+            len--;
+        }
+        *ret_len += 1; /* one output byte for a named entity or a copied byte */
+    }
+    return S_OK;
+}
+
 static HRESULT read_attribute( struct reader *reader, WS_XML_ATTRIBUTE **ret )
 {
     static const WS_XML_STRING xmlns = {5, (BYTE *)"xmlns"};
     WS_XML_ATTRIBUTE *attr;
-    WS_XML_UTF8_TEXT *text;
+    WS_XML_UTF8_TEXT *text = NULL;
     unsigned int len = 0, ch, skip, quote;
     const unsigned char *start;
     WS_XML_STRING *prefix, *localname;
@@ -1083,7 +1201,11 @@ static HRESULT read_attribute( struct reader *reader, WS_XML_ATTRIBUTE **ret )
         if ((hr = bind_prefix( reader, attr->prefix, attr->ns )) != S_OK) goto error;
         if (!(text = alloc_utf8_text( NULL, 0 ))) goto error;
     }
-    else if (!(text = alloc_utf8_text( start, len ))) goto error;
+    else
+    {
+        if (!(text = alloc_utf8_text( NULL, len ))) goto error;
+        if ((hr = decode_text( start, len, text->value.bytes, &text->value.length )) != S_OK) goto error;
+    }
 
     attr->value = &text->text;
     attr->singleQuote = (quote == '\'');
@@ -1092,6 +1214,7 @@ static HRESULT read_attribute( struct reader *reader, WS_XML_ATTRIBUTE **ret )
     return S_OK;
 
 error:
+    heap_free( text );
     free_attribute( attr );
     return hr;
 }
@@ -1207,6 +1330,7 @@ static HRESULT read_text( struct reader *reader )
     struct node *node, *parent;
     WS_XML_TEXT_NODE *text;
     WS_XML_UTF8_TEXT *utf8;
+    HRESULT hr;
 
     start = read_current_ptr( reader );
     for (;;)
@@ -1222,11 +1346,17 @@ static HRESULT read_text( struct reader *reader )
 
     if (!(node = alloc_node( WS_XML_NODE_TYPE_TEXT ))) return E_OUTOFMEMORY;
     text = (WS_XML_TEXT_NODE *)node;
-    if (!(utf8 = alloc_utf8_text( start, len )))
+    if (!(utf8 = alloc_utf8_text( NULL, len )))
     {
         heap_free( node );
         return E_OUTOFMEMORY;
     }
+    if ((hr = decode_text( start, len, utf8->value.bytes, &utf8->value.length )) != S_OK)
+    {
+        heap_free( utf8 );
+        heap_free( node );
+        return hr;
+    }
     text->text = &utf8->text;
 
     read_insert_node( reader, parent, node );
diff --git a/dlls/webservices/tests/reader.c b/dlls/webservices/tests/reader.c
index 5c4770c..77f7964 100644
--- a/dlls/webservices/tests/reader.c
+++ b/dlls/webservices/tests/reader.c
@@ -3542,6 +3542,120 @@ static void test_WsSetReaderPosition(void)
     WsFreeHeap( heap );
 }
 
+static void test_entities(void) /* checks entity decoding in element text (table below) and in an attribute value */
+{
+    static const char str1[] = "<t>&#xA</t>";
+    static const char str2[] = "<t>&#xA;</t>";
+    static const char str3[] = "<t>&#xa;</t>";
+    static const char str4[] = "<t>&#xaaaa;</t>";
+    static const char str5[] = "<t>&#xaaaaa;</t>";
+    static const char str6[] = "<t>&1</t>";
+    static const char str7[] = "<t>&1;</t>";
+    static const char str8[] = "<t>&1111;</t>";
+    static const char str9[] = "<t>&11111;</t>";
+    static const char str10[] = "<t>&lt;</t>"; /* str10-str14: the five named entities, which must appear escaped in the input */
+    static const char str11[] = "<t>&gt;</t>";
+    static const char str12[] = "<t>&quot;</t>";
+    static const char str13[] = "<t>&amp;</t>";
+    static const char str14[] = "<t>&apos;</t>";
+    static const char str15[] = "<t>&sopa;</t>";
+    static const char str16[] = "<t>&#;</t>";
+    static const char str17[] = "<t>&;</t>";
+    static const char str18[] = "<t>&&</t>";
+    static const char str19[] = "<t>&</t>"; /* bare ampersand, expected to fail */
+    static const char str20[] = "<t>&#xaaaaaa;</t>";
+    static const char str21[] = "<t>&#xd7ff;</t>";
+    static const char str22[] = "<t>&#xd800;</t>";
+    static const char str23[] = "<t>&#xdfff;</t>";
+    static const char str24[] = "<t>&#xe000;</t>";
+    static const char str25[] = "<t>&#xfffe;</t>";
+    static const char str26[] = "<t>&#xffff;</t>";
+    static const char str27[] = "<t>&LT;</t>"; /* upper-case entity name, expected to fail */
+    static const char res4[] = {0xea, 0xaa, 0xaa, 0x00}; /* UTF-8 for 0xaaaa */
+    static const char res5[] = {0xf2, 0xaa, 0xaa, 0xaa, 0x00}; /* UTF-8 for 0xaaaaa */
+    static const char res21[] = {0xed, 0x9f, 0xbf, 0x00}; /* UTF-8 for 0xd7ff */
+    static const char res24[] = {0xee, 0x80, 0x80, 0x00}; /* UTF-8 for 0xe000 */
+    static const struct
+    {
+        const char *str; /* document fed to the reader */
+        HRESULT     hr;  /* expected result of WsReadNode */
+        const char *res; /* expected decoded text, only set when hr is S_OK */
+    }
+    tests[] =
+    {
+        { str1, WS_E_INVALID_FORMAT },
+        { str2, S_OK, "\n" },
+        { str3, S_OK, "\n" },
+        { str4, S_OK, res4 },
+        { str5, S_OK, res5 },
+        { str6, WS_E_INVALID_FORMAT },
+        { str7, WS_E_INVALID_FORMAT },
+        { str8, WS_E_INVALID_FORMAT },
+        { str9, WS_E_INVALID_FORMAT },
+        { str10, S_OK, "<" },
+        { str11, S_OK, ">" },
+        { str12, S_OK, "\"" },
+        { str13, S_OK, "&" },
+        { str14, S_OK, "'" },
+        { str15, WS_E_INVALID_FORMAT },
+        { str16, WS_E_INVALID_FORMAT },
+        { str17, WS_E_INVALID_FORMAT },
+        { str18, WS_E_INVALID_FORMAT },
+        { str19, WS_E_INVALID_FORMAT },
+        { str20, WS_E_INVALID_FORMAT },
+        { str21, S_OK, res21 },
+        { str22, WS_E_INVALID_FORMAT },
+        { str23, WS_E_INVALID_FORMAT },
+        { str24, S_OK, res24 },
+        { str25, WS_E_INVALID_FORMAT },
+        { str26, WS_E_INVALID_FORMAT },
+        { str27, WS_E_INVALID_FORMAT },
+    };
+    HRESULT hr;
+    WS_XML_READER *reader;
+    const WS_XML_NODE *node;
+    const WS_XML_UTF8_TEXT *utf8;
+    ULONG i;
+
+    hr = WsCreateReader( NULL, 0, &reader, NULL ) ;
+    ok( hr == S_OK, "got %08x\n", hr );
+
+    for (i = 0; i < sizeof(tests)/sizeof(tests[0]); i++)
+    {
+        hr = set_input( reader, tests[i].str, strlen(tests[i].str) );
+        ok( hr == S_OK, "%u: got %08x\n", i, hr );
+
+        hr = WsReadToStartElement( reader, NULL, NULL, NULL, NULL );
+        ok( hr == S_OK, "%u: got %08x\n", i, hr );
+
+        hr = WsReadNode( reader, NULL ); /* this read is the call whose result is compared with tests[i].hr */
+        ok( hr == tests[i].hr, "%u: got %08x\n", i, hr );
+        if (hr != S_OK) continue; /* nothing to inspect when the read failed */
+
+        hr = WsGetReaderNode( reader, &node, NULL );
+        ok( hr == S_OK, "%u: got %08x\n", i, hr );
+
+        utf8 = (const WS_XML_UTF8_TEXT *)((const WS_XML_TEXT_NODE *)node)->text;
+        ok( utf8->value.length == strlen(tests[i].res), "%u: got %u\n", i, utf8->value.length );
+        ok( !memcmp( utf8->value.bytes, tests[i].res, strlen(tests[i].res) ), "%u: wrong data\n", i );
+    }
+
+    hr = set_input( reader, "<t a='&#xA;&#xA;'/>", sizeof("<t a='&#xA;&#xA;'/>") - 1 ); /* same decoding is expected in attribute values */
+    ok( hr == S_OK, "got %08x\n", hr );
+
+    hr = WsReadToStartElement( reader, NULL, NULL, NULL, NULL );
+    ok( hr == S_OK, "got %08x\n", hr );
+
+    hr = WsGetReaderNode( reader, &node, NULL );
+    ok( hr == S_OK, "got %08x\n", hr );
+
+    utf8 = (const WS_XML_UTF8_TEXT *)((const WS_XML_ELEMENT_NODE *)node)->attributes[0]->value;
+    ok( utf8->value.length == 2, "got %u\n", utf8->value.length );
+    ok( !memcmp( utf8->value.bytes, "\n\n", 2 ), "wrong data\n" );
+
+    WsFreeReader( reader );
+}
+
 START_TEST(reader)
 {
     test_WsCreateError();
@@ -3576,4 +3690,5 @@ START_TEST(reader)
     test_WsResetError();
     test_WsGetReaderPosition();
     test_WsSetReaderPosition();
+    test_entities();
 }
-- 
2.8.1