Alexandre Julliard : wmc: Reimplement input format detection to correctly handle Unicode BOM.
Alexandre Julliard
julliard at winehq.org
Thu Feb 6 15:44:16 CST 2020
Module: wine
Branch: master
Commit: b7b44224d1274df2bf2b851099c4c4deb4bddc5f
URL: https://source.winehq.org/git/wine.git/?a=commit;h=b7b44224d1274df2bf2b851099c4c4deb4bddc5f
Author: Alexandre Julliard <julliard at winehq.org>
Date: Thu Feb 6 15:06:10 2020 +0100
wmc: Reimplement input format detection to correctly handle Unicode BOM.
Signed-off-by: Alexandre Julliard <julliard at winehq.org>
---
tools/wmc/mcl.c | 212 +++++++++++++++++++-------------------------------------
1 file changed, 72 insertions(+), 140 deletions(-)
diff --git a/tools/wmc/mcl.c b/tools/wmc/mcl.c
index 56c5ca640c..1319113fff 100644
--- a/tools/wmc/mcl.c
+++ b/tools/wmc/mcl.c
@@ -160,14 +160,13 @@ void set_codepage(int cp)
/*
* Input functions
*/
+#define INPUTBUFFER_SIZE 2048 /* Must be larger than 4 and approx. large enough to hold a line */
+
static int nungetstack = 0;
static int allocungetstack = 0;
static char *ungetstack = NULL;
static int ninputbuffer = 0;
-static WCHAR *inputbuffer = NULL;
-static char *xlatebuffer = NULL;
-
-#define INPUTBUFFER_SIZE 2048 /* Must be larger than 4 and approx. large enough to hold a line */
+static WCHAR inputbuffer[INPUTBUFFER_SIZE];
/*
* Fill the input buffer with *one* line of input.
@@ -179,141 +178,74 @@ static char *xlatebuffer = NULL;
*/
static int fill_inputbuffer(void)
{
- int n;
- static const char err_fatalread[] = "Fatal: reading input failed";
- static int endian = -1;
-
- if(!inputbuffer)
- {
- inputbuffer = xmalloc(INPUTBUFFER_SIZE*sizeof(WCHAR));
- xlatebuffer = xmalloc(INPUTBUFFER_SIZE);
- }
-
-try_again:
- if(!unicodein)
- {
- char *cptr;
- cptr = fgets(xlatebuffer, INPUTBUFFER_SIZE, yyin);
- if(!cptr && ferror(yyin))
- xyyerror(err_fatalread);
- else if(!cptr)
- return 0;
- if (codepage == CP_UTF8)
- {
- WCHAR *buf = utf8_to_unicode( xlatebuffer, strlen(xlatebuffer), &n );
- memcpy( inputbuffer, buf, (n + 1) * sizeof(WCHAR) );
- free( buf );
- }
- else
- {
- n = wmc_mbstowcs(codepage, 0, xlatebuffer, strlen(xlatebuffer)+1, inputbuffer, INPUTBUFFER_SIZE);
- if(n < 0)
- internal_error(__FILE__, __LINE__, "Could not translate to unicode (%d)\n", n);
- }
- if(n <= 1)
- goto try_again; /* Should not happen */
- n--; /* Strip added conversion '\0' from input length */
- /*
- * FIXME:
- * Detect UTF-8 in the first time we read some bytes by
- * checking the special sequence "FE..." or something like
- * that. I need to check www.unicode.org for details.
- */
- }
- else
- {
- if(endian == -1)
- {
- n = fread(inputbuffer, 1, 8, yyin);
- if(n != 8)
- {
- if(!n && ferror(yyin))
- xyyerror(err_fatalread);
- else
- xyyerror("Fatal: file too short to determine byteorder (should never happen)\n");
- }
- if(isisochar(inputbuffer[0]) &&
- isisochar(inputbuffer[1]) &&
- isisochar(inputbuffer[2]) &&
- isisochar(inputbuffer[3]))
- {
-#ifdef WORDS_BIGENDIAN
- endian = WMC_BO_BIG;
-#else
- endian = WMC_BO_LITTLE;
-#endif
- }
- else if(isisochar(BYTESWAP_WORD(inputbuffer[0])) &&
- isisochar(BYTESWAP_WORD(inputbuffer[1])) &&
- isisochar(BYTESWAP_WORD(inputbuffer[2])) &&
- isisochar(BYTESWAP_WORD(inputbuffer[3])))
- {
-#ifdef WORDS_BIGENDIAN
- endian = WMC_BO_LITTLE;
-#else
- endian = WMC_BO_BIG;
-#endif
- }
- else
- xyyerror("Fatal: cannot determine file's byteorder\n");
- /* FIXME:
- * Determine the file-endian with the leader-bytes
- * "FF FE..."; can't remember the exact sequence.
- */
- n /= 2;
-#ifdef WORDS_BIGENDIAN
- if(endian == WMC_BO_LITTLE)
-#else
- if(endian == WMC_BO_BIG)
-#endif
- {
- inputbuffer[0] = BYTESWAP_WORD(inputbuffer[0]);
- inputbuffer[1] = BYTESWAP_WORD(inputbuffer[1]);
- inputbuffer[2] = BYTESWAP_WORD(inputbuffer[2]);
- inputbuffer[3] = BYTESWAP_WORD(inputbuffer[3]);
- }
-
- }
- else
- {
- int i;
- n = 0;
- for(i = 0; i < INPUTBUFFER_SIZE; i++)
- {
- int t;
- t = fread(&inputbuffer[i], 2, 1, yyin);
- if(!t && ferror(yyin))
- xyyerror(err_fatalread);
- else if(!t && n)
- break;
- n++;
-#ifdef WORDS_BIGENDIAN
- if(endian == WMC_BO_LITTLE)
-#else
- if(endian == WMC_BO_BIG)
-#endif
- {
- if((inputbuffer[i] = BYTESWAP_WORD(inputbuffer[i])) == '\n')
- break;
- }
- else
- {
- if(inputbuffer[i] == '\n')
- break;
- }
- }
- }
-
- }
-
- if(!n)
- {
- mcy_warning("Re-read line (input was or converted to zilch)\n");
- goto try_again; /* Should not happen, but could be due to stdin reading and a signal */
- }
-
- ninputbuffer += n;
- return 1;
+ static enum input_mode { INPUT_UNKNOWN, INPUT_ASCII, INPUT_UTF8, INPUT_UNICODE } mode;
+ static int swapped;
+ static unsigned char utf8_bom[3] = { 0xef, 0xbb, 0xbf };
+ WCHAR *wbuf;
+ int i, pos = 0, len = 0;
+ char buffer[INPUTBUFFER_SIZE];
+
+ if (mode == INPUT_UNKNOWN)
+ {
+ len = fread( buffer, 1, 8, yyin );
+ wbuf = (WCHAR *)buffer;
+ if (len >= 3 && !memcmp( buffer, utf8_bom, 3 ))
+ {
+ mode = INPUT_UTF8;
+ memmove( buffer, buffer + 3, len - 3 );
+ len -= 3;
+ }
+ else if (len == 8)
+ {
+ if (wbuf[0] == 0xfeff || wbuf[0] == 0xfffe)
+ {
+ mode = INPUT_UNICODE;
+ pos = 1;
+ swapped = (wbuf[0] == 0xfffe);
+ }
+ else if (!((wbuf[0] | wbuf[1] | wbuf[2] | wbuf[3]) & 0xff00))
+ {
+ mode = INPUT_UNICODE;
+ }
+ else if (!((wbuf[0] | wbuf[1] | wbuf[2] | wbuf[3]) & 0x00ff))
+ {
+ mode = INPUT_UNICODE;
+ swapped = 1;
+ }
+ }
+
+ if (mode == INPUT_UNICODE)
+ {
+ len = 4 - pos;
+ memcpy( inputbuffer, wbuf + pos, len * sizeof(WCHAR) );
+ }
+ else if (mode == INPUT_UNKNOWN) mode = unicodein ? INPUT_UTF8 : INPUT_ASCII;
+ }
+
+ switch (mode)
+ {
+ case INPUT_ASCII:
+ if (!fgets( buffer + len, sizeof(buffer) - len, yyin )) break;
+ ninputbuffer = wmc_mbstowcs( codepage, 0, buffer, strlen(buffer), inputbuffer, INPUTBUFFER_SIZE );
+ if (ninputbuffer < 0) internal_error(__FILE__, __LINE__, "Could not translate to unicode\n");
+ return 1;
+ case INPUT_UTF8:
+ if (!fgets( buffer + len, sizeof(buffer) - len, yyin )) break;
+ wbuf = utf8_to_unicode( buffer, strlen(buffer), &ninputbuffer );
+ memcpy( inputbuffer, wbuf, ninputbuffer * sizeof(WCHAR) );
+ free( wbuf );
+ return 1;
+ case INPUT_UNICODE:
+ len += fread( inputbuffer + len, sizeof(WCHAR), INPUTBUFFER_SIZE - len, yyin );
+ if (!len) break;
+ if (swapped) for (i = 0; i < len; i++) inputbuffer[i] = BYTESWAP_WORD( inputbuffer[i] );
+ ninputbuffer = len;
+ return 1;
+ case INPUT_UNKNOWN:
+ break;
+ }
+ if (ferror(yyin)) xyyerror( "Fatal: reading input failed\n" );
+ return 0;
}
static int get_unichar(void)
@@ -332,7 +264,7 @@ static int get_unichar(void)
}
ninputbuffer--;
- return (int)(*b++ & 0xffff);
+ return *b++;
}
static void unget_unichar(int ch)
More information about the wine-cvs mailing list