[2/8] webservices: Add support for character set detection.
Hans Leidekker
hans at codeweavers.com
Thu Oct 22 03:17:15 CDT 2015
Signed-off-by: Hans Leidekker <hans at codeweavers.com>
---
dlls/webservices/reader.c | 77 +++++++++++++++++++++++++++++----
dlls/webservices/tests/reader.c | 95 ++++++++++++++++++++++++++++++++++++-----
2 files changed, 154 insertions(+), 18 deletions(-)
diff --git a/dlls/webservices/reader.c b/dlls/webservices/reader.c
index d888223..ca59f33 100644
--- a/dlls/webservices/reader.c
+++ b/dlls/webservices/reader.c
@@ -551,6 +551,17 @@ HRESULT WINAPI WsGetReaderProperty( WS_XML_READER *handle, WS_XML_READER_PROPERT
if (error) FIXME( "ignoring error parameter\n" );
if (!reader->input_data) return WS_E_INVALID_OPERATION;
+
+ if (id == WS_XML_READER_PROPERTY_CHARSET)
+ {
+ WS_CHARSET charset;
+ HRESULT hr;
+
+ if ((hr = get_reader_prop( reader, id, &charset, size )) != S_OK) return hr;
+ if (!charset) return WS_E_INVALID_FORMAT;
+ *(WS_CHARSET *)buf = charset;
+ return S_OK;
+ }
return get_reader_prop( reader, id, buf, size );
}
@@ -1373,6 +1384,54 @@ HRESULT WINAPI WsSetErrorProperty( WS_ERROR *handle, WS_ERROR_PROPERTY_ID id, co
return set_error_prop( error, id, value, size );
}
+static inline BOOL is_utf8( const unsigned char *data, ULONG size, ULONG *offset )
+{ /* UTF-8 check: nonzero if data is accepted as UTF-8; *offset then holds the count of leading bytes to skip, and is left untouched on failure */
+ static const char bom[] = {0xef,0xbb,0xbf}; /* UTF-8 byte order mark */
+ const unsigned char *p = data;
+
+ return (size >= sizeof(bom) && !memcmp( p, bom, sizeof(bom) ) && (*offset = sizeof(bom))) || /* BOM found: skip its 3 bytes (the assignment yields 3, i.e. true) */
+ (size > 2 && !(*offset = 0)); /* no BOM: any buffer longer than 2 bytes is accepted, nothing skipped */
+}
+
+static inline BOOL is_utf16le( const unsigned char *data, ULONG size, ULONG *offset )
+{ /* UTF-16LE check: nonzero if data is accepted as UTF-16LE; *offset then holds the count of leading bytes to skip, and is left untouched on failure */
+ static const char bom[] = {0xff,0xfe}; /* UTF-16 little-endian byte order mark */
+ const unsigned char *p = data;
+
+ return (size >= sizeof(bom) && !memcmp( p, bom, sizeof(bom) ) && (*offset = sizeof(bom))) || /* BOM found: skip its 2 bytes (the assignment yields 2, i.e. true) */
+ (size >= 4 && p[0] == '<' && !p[1] && !(*offset = 0)); /* no BOM: at least 4 bytes starting with '<' followed by a zero byte, nothing skipped */
+}
+
+static HRESULT detect_charset( const WS_XML_READER_INPUT *input, WS_CHARSET *charset, ULONG *offset )
+{ /* Guess the charset of a buffer input: stores the result in *charset (0 if unrecognized) and lets the helpers set *offset; E_NOTIMPL for other input types */
+ const WS_XML_READER_BUFFER_INPUT *buf = (const WS_XML_READER_BUFFER_INPUT *)input; /* only dereferenced after the input type check below */
+
+ if (input->inputType != WS_XML_READER_INPUT_TYPE_BUFFER)
+ {
+ FIXME( "charset detection on input type %u not supported\n", input->inputType );
+ return E_NOTIMPL;
+ }
+
+ /* FIXME: parse xml declaration; for now only the leading bytes and the buffer size are inspected */
+
+ if (is_utf16le( buf->encodedData, buf->encodedDataSize, offset )) /* must run first: is_utf8 accepts any buffer longer than 2 bytes */
+ {
+ *charset = WS_CHARSET_UTF16LE;
+ }
+ else if (is_utf8( buf->encodedData, buf->encodedDataSize, offset ))
+ {
+ *charset = WS_CHARSET_UTF8;
+ }
+ else
+ {
+ FIXME( "charset not recognized\n" );
+ *charset = 0; /* a zero charset makes WsGetReaderProperty return WS_E_INVALID_FORMAT */
+ }
+
+ TRACE( "detected charset %u\n", *charset );
+ return S_OK; /* success is reported even when no charset was recognized */
+}
+
/**************************************************************************
* WsSetInput [webservices.@]
*/
@@ -1383,7 +1442,7 @@ HRESULT WINAPI WsSetInput( WS_XML_READER *handle, const WS_XML_READER_ENCODING *
struct reader *reader = (struct reader *)handle;
struct node *node;
HRESULT hr;
- ULONG i;
+ ULONG i, offset = 0;
TRACE( "%p %p %p %p %u %p\n", handle, encoding, input, properties, count, error );
if (error) FIXME( "ignoring error parameter\n" );
@@ -1395,11 +1454,13 @@ HRESULT WINAPI WsSetInput( WS_XML_READER *handle, const WS_XML_READER_ENCODING *
case WS_XML_READER_ENCODING_TYPE_TEXT:
{
WS_XML_READER_TEXT_ENCODING *text = (WS_XML_READER_TEXT_ENCODING *)encoding;
- if (text->charSet != WS_CHARSET_UTF8)
- {
- FIXME( "charset %u not supported\n", text->charSet );
- return E_NOTIMPL;
- }
+ WS_CHARSET charset = text->charSet;
+
+ if (charset == WS_CHARSET_AUTO && (hr = detect_charset( input, &charset, &offset )) != S_OK)
+ return hr;
+
+ hr = set_reader_prop( reader, WS_XML_READER_PROPERTY_CHARSET, &charset, sizeof(charset) );
+ if (hr != S_OK) return hr;
break;
}
default:
@@ -1411,8 +1472,8 @@ HRESULT WINAPI WsSetInput( WS_XML_READER *handle, const WS_XML_READER_ENCODING *
case WS_XML_READER_INPUT_TYPE_BUFFER:
{
WS_XML_READER_BUFFER_INPUT *buf = (WS_XML_READER_BUFFER_INPUT *)input;
- reader->input_data = buf->encodedData;
- reader->input_size = buf->encodedDataSize;
+ reader->input_data = (const char *)buf->encodedData + offset;
+ reader->input_size = buf->encodedDataSize - offset;
reader->read_bufptr = reader->input_data;
break;
}
diff --git a/dlls/webservices/tests/reader.c b/dlls/webservices/tests/reader.c
index 31587e1..f0c7077 100644
--- a/dlls/webservices/tests/reader.c
+++ b/dlls/webservices/tests/reader.c
@@ -25,7 +25,7 @@ static const char data1[] =
"<?xml version=\"1.0\" encoding=\"utf-8\"?>";
static const char data2[] =
- "<text>test</text>";
+ {0xef,0xbb,0xbf,'<','t','e','x','t','>','t','e','s','t','<','/','t','e','x','t','>',0};
static const char data3[] =
"<?xml version=\"1.0\" encoding=\"utf-8\"?>"
@@ -226,7 +226,7 @@ static HRESULT set_input( WS_XML_READER *reader, const char *data, ULONG size )
WS_XML_READER_BUFFER_INPUT input;
encoding.encoding.encodingType = WS_XML_READER_ENCODING_TYPE_TEXT;
- encoding.charSet = WS_CHARSET_UTF8;
+ encoding.charSet = WS_CHARSET_AUTO;
input.input.inputType = WS_XML_READER_INPUT_TYPE_BUFFER;
input.encodedData = (void *)data;
@@ -367,6 +367,31 @@ static void test_WsCreateReader(void)
static void test_WsSetInput(void)
{
+ static char test1[] = {0xef,0xbb,0xbf,'<','a','/','>'};
+ static char test2[] = {'<','a','/','>'};
+ static char test3[] = {'<','!','-','-'};
+ static char test4[] = {'<','?','x','m','l',' ','v','e','r','s','i','o','n','=','"','1','.','0','"',
+ ' ','e','n','c','o','d','i','n','g','=','"','u','t','f','-','8','"','?','>'};
+ static char test5[] = {'<','?','x','m','l',' ','e','n','c','o','d','i','n','g','=',
+ '"','u','t','f','-','8','"','?','>'};
+ static char test6[] = {'<','?','x','m','l'};
+ static char test7[] = {'<','?','y','m','l'};
+ static char test8[] = {'<','?'};
+ static char test9[] = {'<','!'};
+ static char test10[] = {0xff,0xfe,'<',0,'a',0,'/',0,'>',0};
+ static char test11[] = {'<',0,'a',0,'/',0,'>',0};
+ static char test12[] = {'<',0,'!',0,'-',0,'-',0};
+ static char test13[] = {'<',0,'?',0};
+ static char test14[] = {'a','b'};
+ static char test15[] = {'a','b','c'};
+ static char test16[] = {'a',0};
+ static char test17[] = {'a',0,'b',0};
+ static char test18[] = {'<',0,'a',0,'b',0};
+ static char test19[] = {'<',0,'a',0};
+ static char test20[] = {0,'a','b'};
+ static char test21[] = {0,0};
+ static char test22[] = {0,0,0};
+ static char test23[] = {'<',0,'?',0,'x',0,'m',0,'l',0};
HRESULT hr;
WS_XML_READER *reader;
WS_XML_READER_PROPERTY prop;
@@ -374,7 +399,41 @@ static void test_WsSetInput(void)
WS_XML_READER_BUFFER_INPUT input;
WS_CHARSET charset;
const WS_XML_NODE *node;
- ULONG size, max_depth;
+ ULONG i, size, max_depth;
+ static const struct
+ {
+ void *data;
+ ULONG size;
+ HRESULT hr;
+ WS_CHARSET charset;
+ int todo;
+ }
+ tests[] =
+ {
+ { test1, sizeof(test1), S_OK, WS_CHARSET_UTF8 },
+ { test2, sizeof(test2), S_OK, WS_CHARSET_UTF8 },
+ { test3, sizeof(test3), S_OK, WS_CHARSET_UTF8 },
+ { test4, sizeof(test4), S_OK, WS_CHARSET_UTF8 },
+ { test5, sizeof(test5), WS_E_INVALID_FORMAT, 0, 1 },
+ { test6, sizeof(test6), WS_E_INVALID_FORMAT, 0, 1 },
+ { test7, sizeof(test7), WS_E_INVALID_FORMAT, 0, 1 },
+ { test8, sizeof(test8), WS_E_INVALID_FORMAT, 0 },
+ { test9, sizeof(test9), WS_E_INVALID_FORMAT, 0 },
+ { test10, sizeof(test10), S_OK, WS_CHARSET_UTF16LE },
+ { test11, sizeof(test11), S_OK, WS_CHARSET_UTF16LE },
+ { test12, sizeof(test12), S_OK, WS_CHARSET_UTF16LE },
+ { test13, sizeof(test13), WS_E_INVALID_FORMAT, 0, 1 },
+ { test14, sizeof(test14), WS_E_INVALID_FORMAT, 0 },
+ { test15, sizeof(test15), S_OK, WS_CHARSET_UTF8 },
+ { test16, sizeof(test16), WS_E_INVALID_FORMAT, 0 },
+ { test17, sizeof(test17), S_OK, WS_CHARSET_UTF8 },
+ { test18, sizeof(test18), S_OK, WS_CHARSET_UTF16LE },
+ { test19, sizeof(test19), S_OK, WS_CHARSET_UTF16LE },
+ { test20, sizeof(test20), S_OK, WS_CHARSET_UTF8 },
+ { test21, sizeof(test21), WS_E_INVALID_FORMAT, 0 },
+ { test22, sizeof(test22), S_OK, WS_CHARSET_UTF8 },
+ { test23, sizeof(test23), WS_E_INVALID_FORMAT, 0, 1 },
+ };
hr = WsCreateReader( NULL, 0, &reader, NULL ) ;
ok( hr == S_OK, "got %08x\n", hr );
@@ -411,14 +470,30 @@ static void test_WsSetInput(void)
/* charset is detected by WsSetInput */
enc.encoding.encodingType = WS_XML_READER_ENCODING_TYPE_TEXT;
enc.charSet = WS_CHARSET_AUTO;
- hr = WsSetInput( reader, (WS_XML_READER_ENCODING *)&enc, (WS_XML_READER_INPUT *)&input, NULL, 0, NULL );
- todo_wine ok( hr == S_OK, "got %08x\n", hr );
- charset = 0xdeadbeef;
- size = sizeof(charset);
- hr = WsGetReaderProperty( reader, WS_XML_READER_PROPERTY_CHARSET, &charset, size, NULL );
- ok( hr == S_OK, "got %08x\n", hr );
- ok( charset == WS_CHARSET_UTF8, "got %u\n", charset );
+ for (i = 0; i < sizeof(tests)/sizeof(tests[0]); i++)
+ {
+ input.encodedData = tests[i].data;
+ input.encodedDataSize = tests[i].size;
+ hr = WsSetInput( reader, (WS_XML_READER_ENCODING *)&enc, (WS_XML_READER_INPUT *)&input, NULL, 0, NULL );
+ ok( hr == S_OK, "%u: got %08x\n", i, hr );
+
+ charset = 0xdeadbeef;
+ size = sizeof(charset);
+ hr = WsGetReaderProperty( reader, WS_XML_READER_PROPERTY_CHARSET, &charset, size, NULL );
+ if (tests[i].todo)
+ {
+ todo_wine ok( hr == tests[i].hr, "%u: got %08x expected %08x\n", i, hr, tests[i].hr );
+ if (hr == S_OK)
+ todo_wine ok( charset == tests[i].charset, "%u: got %u expected %u\n", i, charset, tests[i].charset );
+ }
+ else
+ {
+ ok( hr == tests[i].hr, "%u: got %08x expected %08x\n", i, hr, tests[i].hr );
+ if (hr == S_OK)
+ ok( charset == tests[i].charset, "%u: got %u expected %u\n", i, charset, tests[i].charset );
+ }
+ }
enc.encoding.encodingType = WS_XML_READER_ENCODING_TYPE_TEXT;
enc.charSet = WS_CHARSET_UTF8;
--
2.6.1
More information about the wine-patches
mailing list