[PATCH 1/3] kernel32: Support UTF-7 in MultiByteToWideChar.
Alex Henrie
alexhenrie24 at gmail.com
Sun Oct 5 22:21:02 CDT 2014
Fixes https://bugs.winehq.org/show_bug.cgi?id=27388
I last tried tackling this issue in December 2012. This patch series
has many improvements over the last one I sent:
- The entire ASCII range is tested (including control characters now).
- Encoding invalid UTF-16 is tested.
- Decoding a UTF-7 sequence with a stray + sign is tested.
- Decoding UTF-7 sequences terminated without a minus sign is tested.
- Decoding UTF-7 sequences that have characters that should have been
escaped is tested, and a bug in that code path is fixed.
- The tests are simplified by the addition of two helper functions.
- The tests are better documented and explained.
- All test buffers are explicitly memset before use.
- utf7_mbstowcs is simpler and more efficient.
- utf7_can_directly_encode is simpler and more efficient.
Please do not let this volunteered work go to waste. As noted in the
bug report, there are multiple Windows applications that require UTF-7
support. And even though I have matched my implementation to
Microsoft's, there are multiple valid ways to encode the same string in
UTF-7, so even if there were a difference between my implementation
and theirs, it would not jeopardize compatibility.
-Alex
---
dlls/kernel32/locale.c | 248 ++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 244 insertions(+), 4 deletions(-)
diff --git a/dlls/kernel32/locale.c b/dlls/kernel32/locale.c
index 730574b..c8404d1 100644
--- a/dlls/kernel32/locale.c
+++ b/dlls/kernel32/locale.c
@@ -1954,6 +1954,247 @@ BOOL WINAPI EnumSystemCodePagesW( CODEPAGE_ENUMPROCW lpfnCodePageEnum, DWORD fla
/***********************************************************************
+ * write_to_w_string
+ *
+ * Helper for utf7_mbstowcs
+ *
+ * RETURNS
+ * 0 on success, -1 on error
+ */
+static int write_to_w_string(WCHAR* dst, int dstlen, int* index, WCHAR character)
+{
+    int pos = *index;
+
+    /* only store the character while there is still room in dst */
+    if (pos < dstlen)
+    {
+        dst[pos] = character;
+        /* advance the caller's write position past the stored character */
+        *index = pos + 1;
+        return 0;
+    }
+
+    /* dst is full: leave both dst and *index untouched */
+    return -1;
+}
+
+/***********************************************************************
+ * utf7_mbstowcs
+ *
+ * UTF-7 to UTF-16 string conversion, helper for MultiByteToWideChar
+ *
+ * PARAMS
+ * src [I] Source buffer; exactly srclen bytes are examined
+ * srclen [I] Length of src in bytes
+ * dst [O] Destination buffer, or NULL to only count
+ * dstlen [I] Length of dst in WCHARs, or 0 to only count
+ *
+ * RETURNS
+ * On success, the number of characters written (or, when dst is NULL or
+ * dstlen is 0, the number of characters that would be written)
+ * On dst buffer overflow, -1
+ *
+ * NOTES
+ * No input byte is rejected as invalid: a byte that cannot be part of a
+ * base64 run simply ends the run and is then copied through unchanged.
+ */
+static int utf7_mbstowcs(const char* src, int srclen, WCHAR* dst, int dstlen)
+{
+    /* maps the ASCII codes 0 through 'z' to their base64 value, -1 if not a base64 digit */
+    static const signed char base64_decoding_table[] = {
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 0x00 - 0x0F */
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 0x10 - 0x1F */
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63, /* ' ' - '/' */
+        52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1, /* '0' - '?' */
+        -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, /* '@' - 'O' */
+        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, /* 'P' - '_' */
+        -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, /* '`' - 'o' */
+        41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51                      /* 'p' - 'z' */
+    };
+
+    BOOL dry_run = !dst || !dstlen;
+    const char* source_end = src + srclen;
+    int dest_index = 0;
+
+    while (src < source_end)
+    {
+        /* bit accumulator for the current base64 run; the low bit_count bits are pending */
+        unsigned int bits = 0;
+        int bit_count = 0;
+
+        if (*src != '+')
+        {
+            /* directly encoded character */
+            /* we have to convert to unsigned char in case *src > 127 */
+            if (dry_run) dest_index++;
+            else if (write_to_w_string(dst, dstlen, &dest_index, (unsigned char)*src)) return -1;
+            src++;
+            continue;
+        }
+
+        src++; /* skip the + sign */
+
+        /* a + as the very last byte starts an empty base64 run: nothing to decode,
+           and nothing past source_end may be read */
+        if (src >= source_end) break;
+
+        if (*src == '-')
+        {
+            /* just a plus sign escaped as +- */
+            if (dry_run) dest_index++;
+            else if (write_to_w_string(dst, dstlen, &dest_index, '+')) return -1;
+            src++;
+            continue;
+        }
+
+        /* decode base64 digits until the run is terminated or the input is exhausted */
+        while (src < source_end)
+        {
+            unsigned char ch = (unsigned char)*src;
+            int sextet;
+
+            if (ch == '-')
+            {
+                /* skip over the dash and end base64 decoding */
+                /* the current, unfinished byte pair is discarded */
+                src++;
+                break;
+            }
+
+            /* bytes above 'z' are outside the table and never base64 digits */
+            if (ch > 'z') break;
+
+            sextet = base64_decoding_table[ch];
+            if (sextet < 0)
+            {
+                /* the next character of src is not part of a base64 sequence */
+                /* it is left in place so that the outer loop copies it through */
+                /* the current, unfinished byte pair is discarded */
+                break;
+            }
+
+            bits = (bits << 6) | (unsigned int)sextet;
+            bit_count += 6;
+
+            if (bit_count >= 16)
+            {
+                /* this byte pair is done: emit the 16 oldest pending bits,
+                   the remaining low bits start the next byte pair */
+                WCHAR byte_pair = (WCHAR)((bits >> (bit_count - 16)) & 0xFFFF);
+
+                if (dry_run) dest_index++;
+                else if (write_to_w_string(dst, dstlen, &dest_index, byte_pair)) return -1;
+                bit_count -= 16;
+            }
+
+            /* this sextet is done */
+            src++;
+        }
+    }
+
+    return dest_index;
+}
+
+/***********************************************************************
* MultiByteToWideChar (KERNEL32.@)
*
* Convert a multibyte character string into a Unicode string.
@@ -1963,7 +2204,7 @@ BOOL WINAPI EnumSystemCodePagesW( CODEPAGE_ENUMPROCW lpfnCodePageEnum, DWORD fla
* flags [I] Character mapping flags
* src [I] Source string buffer
* srclen [I] Length of src (in bytes), or -1 if src is NUL terminated
- * dst [O] Destination buffer
+ * dst [O] Destination buffer, or NULL to compute the required length
* dstlen [I] Length of dst (in WCHARs), or 0 to compute the required length
*
* RETURNS
@@ -2006,9 +2247,8 @@ INT WINAPI MultiByteToWideChar( UINT page, DWORD flags, LPCSTR src, INT srclen,
SetLastError( ERROR_INVALID_FLAGS );
return 0;
}
- FIXME("UTF-7 not supported\n");
- SetLastError( ERROR_CALL_NOT_IMPLEMENTED );
- return 0;
+ ret = utf7_mbstowcs( src, srclen, dst, dstlen );
+ break;
case CP_UNIXCP:
if (unix_cptable)
{
--
2.1.2
More information about the wine-patches
mailing list