RICHEDIT: RTF reader i18n #2

Phil Krylov phil at newstar.rinet.ru
Thu Mar 17 17:00:04 CST 2005


ChangeLog:

Fixed support for RTF documents that use the ANSI charset and added
support for multibyte charsets, so that BIG5 and UTF-8 RTF documents
now work.

Patch:

Index: dlls/riched20/editor.c
===================================================================
RCS file: /home/wine/wine/dlls/riched20/editor.c,v
retrieving revision 1.15
diff -p -u -r1.15 editor.c
--- dlls/riched20/editor.c	17 Mar 2005 13:57:27 -0000	1.15
+++ dlls/riched20/editor.c	17 Mar 2005 22:36:51 -0000
@@ -396,23 +396,24 @@ void ME_RTFReadHook(RTF_Info *info) {
       switch(info->rtfMajor)
       {
         case rtfBeginGroup:
-          if (info->formatStackTop < maxCharFormatStack) {
-            info->formatStack[info->formatStackTop].cbSize = sizeof(info->formatStack[0]);
-            memcpy(&info->formatStack[info->formatStackTop], &info->style->fmt, sizeof(CHARFORMAT2W));
-            info->codePageStack[info->formatStackTop] = info->codePage;
+          if (info->stackTop < maxStack) {
+            memcpy(&info->stack[info->stackTop].fmt, &info->style->fmt, sizeof(CHARFORMAT2W));
+            info->stack[info->stackTop].codePage = info->codePage;
+            info->stack[info->stackTop].unicodeLength = info->unicodeLength;
           }
-          info->formatStackTop++;
+          info->stackTop++;
           break;
         case rtfEndGroup:
         {
           ME_Style *s;
           RTFFlushOutputBuffer(info);
-          info->formatStackTop--;
+          info->stackTop--;
           /* FIXME too slow ? how come ? */
-          s = ME_ApplyStyle(info->style, &info->formatStack[info->formatStackTop]);
+          s = ME_ApplyStyle(info->style, &info->stack[info->stackTop].fmt);
           ME_ReleaseStyle(info->style);
           info->style = s;
-          info->codePage = info->codePageStack[info->formatStackTop];
+          info->codePage = info->stack[info->stackTop].codePage;
+          info->unicodeLength = info->stack[info->stackTop].unicodeLength;
           break;
         }
       }
@@ -476,6 +477,7 @@ static LRESULT ME_StreamIn(ME_TextEditor
     /* do the parsing */
     RTFRead(&parser);
     RTFFlushOutputBuffer(&parser);
+    RTFDestroy(&parser);
 
     style = parser.style;
   }
Index: dlls/riched20/reader.c
===================================================================
RCS file: /home/wine/wine/dlls/riched20/reader.c,v
retrieving revision 1.8
diff -p -u -r1.8 reader.c
--- dlls/riched20/reader.c	17 Mar 2005 13:57:27 -0000	1.8
+++ dlls/riched20/reader.c	17 Mar 2005 22:36:52 -0000
@@ -67,12 +67,13 @@ static void	LookupInit (void);
 static void	Lookup (RTF_Info *, char *);
 static int	Hash (char*);
 
-static void	RTFOutputUnicodeString( RTF_Info *info, WCHAR *str, int len );
-
 static void	CharAttr(RTF_Info *info);
 static void	CharSet(RTF_Info *info);
 static void	DocAttr(RTF_Info *info);
 
+static void	RTFFlushCPOutputBuffer(RTF_Info *info);
+static void	RTFPutCodePageChar(RTF_Info *info, int c);
+
 
 int _RTFGetChar(RTF_Info *info)
 {
@@ -113,6 +114,58 @@ void RTFSetEditStream(RTF_Info *info, ED
 	info->editstream.pfnCallback = es->pfnCallback;
 }
 
+static void
+RTFDestroyAttrs(RTF_Info *info)
+{
+	RTFColor	*cp;
+	RTFFont		*fp;
+	RTFStyle	*sp;
+	RTFStyleElt	*eltList, *ep;
+
+	while (info->fontList != (RTFFont *) NULL)
+	{
+		fp = info->fontList->rtfNextFont;
+		RTFFree (info->fontList->rtfFName);
+		RTFFree ((char *) info->fontList);
+		info->fontList = fp;
+	}
+	while (info->colorList != (RTFColor *) NULL)
+	{
+		cp = info->colorList->rtfNextColor;
+		RTFFree ((char *) info->colorList);
+		info->colorList = cp;
+	}
+	while (info->styleList != (RTFStyle *) NULL)
+	{
+		sp = info->styleList->rtfNextStyle;
+		eltList = info->styleList->rtfSSEList;
+		while (eltList != (RTFStyleElt *) NULL)
+		{
+			ep = eltList->rtfNextSE;
+			RTFFree (eltList->rtfSEText);
+			RTFFree ((char *) eltList);
+			eltList = ep;
+		}
+		RTFFree (info->styleList->rtfSName);
+		RTFFree ((char *) info->styleList);
+		info->styleList = sp;
+	}
+}
+
+
+void
+RTFDestroy(RTF_Info *info)
+{
+        if (info->rtfTextBuf)
+        {
+                RTFFree(info->rtfTextBuf);
+                RTFFree(info->pushedTextBuf);
+        }
+        RTFDestroyAttrs(info);
+        RTFFree(info->cpOutputBuffer);
+}
+
+
 /*
  * Initialize the reader.  This may be called multiple times,
  * to read multiple files.  The only thing not reset is the input
@@ -122,10 +175,6 @@ void RTFSetEditStream(RTF_Info *info, ED
 void RTFInit(RTF_Info *info)
 {
 	int	i;
-	RTFColor	*cp;
-	RTFFont		*fp;
-	RTFStyle	*sp;
-	RTFStyleElt	*eltList, *ep;
 
 	TRACE("\n");
 
@@ -164,36 +213,10 @@ void RTFInit(RTF_Info *info)
 
 	/* dump old lists if necessary */
 
-	while (info->fontList != (RTFFont *) NULL)
-	{
-		fp = info->fontList->rtfNextFont;
-		RTFFree (info->fontList->rtfFName);
-		RTFFree ((char *) info->fontList);
-		info->fontList = fp;
-	}
-	while (info->colorList != (RTFColor *) NULL)
-	{
-		cp = info->colorList->rtfNextColor;
-		RTFFree ((char *) info->colorList);
-		info->colorList = cp;
-	}
-	while (info->styleList != (RTFStyle *) NULL)
-	{
-		sp = info->styleList->rtfNextStyle;
-		eltList = info->styleList->rtfSSEList;
-		while (eltList != (RTFStyleElt *) NULL)
-		{
-			ep = eltList->rtfNextSE;
-			RTFFree (eltList->rtfSEText);
-			RTFFree ((char *) eltList);
-			eltList = ep;
-		}
-		RTFFree (info->styleList->rtfSName);
-		RTFFree ((char *) info->styleList);
-		info->styleList = sp;
-	}
+        RTFDestroyAttrs(info);
 
         info->ansiCodePage = 1252; /* Latin-1 */
+        
         info->unicodeLength = 1; /* \uc1 is the default */
         info->codePage = info->ansiCodePage;
 
@@ -205,6 +228,13 @@ void RTFInit(RTF_Info *info)
 	info->rtfLinePos = 0;
 	info->prevChar = EOF;
 	info->bumpLine = 0;
+
+        info->dwCPOutputCount = 0;
+        if (!info->cpOutputBuffer)
+        {
+                info->dwMaxCPOutputCount = 0x1000;
+                info->cpOutputBuffer = RTFAlloc(info->dwMaxCPOutputCount);
+        }
 }
 
 /*
@@ -475,17 +505,6 @@ static void _RTFGetToken(RTF_Info *info)
 }
 
 
-static WCHAR
-RTFANSIToUnicode(RTF_Info *info, char c)
-{
-        WCHAR buffer[2] = { 0, 0 };
-
-        /* TODO: Probably caching codepage conversion tables would be faster... */
-        MultiByteToWideChar(info->codePage, 0, &c, 1, buffer, 2);
-        return buffer[0];
-}
-
-
 static int
 RTFCharSetToCodePage(RTF_Info *info, int charset)
 {
@@ -493,7 +512,7 @@ RTFCharSetToCodePage(RTF_Info *info, int
         {
                 case ANSI_CHARSET:
                 case DEFAULT_CHARSET:
-                        return 0;
+                        return info->ansiCodePage;
                 case SYMBOL_CHARSET:
                         return CP_SYMBOL;
                 case MAC_CHARSET:
@@ -603,10 +622,6 @@ static void _RTFGetToken2(RTF_Info *info
 		else
 		{
 			info->rtfClass = rtfText;
-
-                        if (c & 0x80)
-                                info->rtfMajor = RTFANSIToUnicode(info, c);
-                        else
 			info->rtfMajor = c;
 		}
 		return;
@@ -632,7 +647,7 @@ static void _RTFGetToken2(RTF_Info *info
 			{
 				/* should do isxdigit check! */
 				info->rtfClass = rtfText;
-				info->rtfMajor = RTFANSIToUnicode(info, RTFCharToHex (c) * 16 + RTFCharToHex (c2));
+				info->rtfMajor = RTFCharToHex (c) * 16 + RTFCharToHex (c2);
 				return;
 			}
 			/* early eof, whoops (class is rtfUnknown) */
@@ -1416,6 +1431,7 @@ static RTFKey	rtfKey[] =
 	{ rtfCharAttr,	rtfLanguage,		"lang",		0 },
 	/* this has disappeared from spec 1.2 */
 	{ rtfCharAttr,	rtfGray,		"gray",		0 },
+        { rtfCharAttr,	rtfUnicodeLength,	"uc",		0 },
 
 	/*
 	 * Paragraph formatting attributes
@@ -1704,9 +1720,9 @@ static RTFKey	rtfKey[] =
 
 	{ rtfDocAttr,	rtfRTLDoc,		"rtldoc",	0 },
 	{ rtfDocAttr,	rtfLTRDoc,		"ltrdoc",	0 },
-
+       
         { rtfDocAttr,	rtfAnsiCodePage,	"ansicpg",	0 },
-        { rtfDocAttr,	rtfUnicodeLength,	"uc",		0 },
+        { rtfDocAttr,	rtfUTF8RTF,		"urtf",		0 },
 
 	/*
 	 * Style attributes
@@ -2475,7 +2491,7 @@ static void	TextClass (RTF_Info *info);
 static void	ControlClass (RTF_Info *info);
 static void	Destination (RTF_Info *info);
 static void	SpecialChar (RTF_Info *info);
-static void	PutLitChar (RTF_Info *info, int c);
+static void	RTFPutUnicodeChar (RTF_Info *info, int c);
 
 /*
  * Initialize the writer.
@@ -2499,14 +2515,13 @@ BeginFile (RTF_Info *info )
 }
 
 /*
- * Write out a character. Seems to work for the default ANSI codepage,
- * contrary to TextClass_orig. 
+ * Write out a character.
  */
 
 static void
 TextClass (RTF_Info *info)
 {
-	PutLitChar (info, info->rtfMajor);
+	RTFPutCodePageChar(info, info->rtfMajor);
 }
 
 
@@ -2530,7 +2545,7 @@ ControlClass (RTF_Info *info)
                 DocAttr(info);
                 break;
 	case rtfSpecialChar:
-		SpecialChar (info);
+                SpecialChar (info);
 		break;
 	}
 }
@@ -2539,10 +2554,19 @@ ControlClass (RTF_Info *info)
 static void
 CharAttr(RTF_Info *info)
 {
+        RTFFont *font;
+        
         switch (info->rtfMinor)
         {
         case rtfFontNum:
-                info->codePage = RTFGetFont(info, info->rtfParam)->rtfFCodePage;
+                font = RTFGetFont(info, info->rtfParam);
+                if (font)
+                        info->codePage = font->rtfFCodePage;
+                else
+                        RTFMsg(info, "unknown font %d\n", info->rtfParam);
+                break;
+        case rtfUnicodeLength:
+                info->unicodeLength = info->rtfParam;
                 break;
         }
 }
@@ -2591,8 +2615,8 @@ DocAttr(RTF_Info *info)
         case rtfAnsiCodePage:
                 info->ansiCodePage = info->rtfParam;
                 break;
-        case rtfUnicodeLength:
-                info->unicodeLength = info->rtfParam;
+        case rtfUTF8RTF:
+                info->ansiCodePage = CP_UTF8;
                 break;
         }
 }
@@ -2616,23 +2640,20 @@ static void SpecialChar (RTF_Info *info)
 		break;
 	case rtfUnicode:
 	{
-		WCHAR buf[2];
                 int i;
-                
-		buf[0] = info->rtfParam;
-		buf[1] = 0;
-		RTFFlushOutputBuffer(info);
-		RTFOutputUnicodeString(info, buf, 1);
+               
+                RTFPutUnicodeChar(info, info->rtfParam);
 		
                 /* After \u we must skip number of character tokens set by \ucN */
                 for (i = 0; i < info->unicodeLength; i++)
                 {
-		RTFGetToken(info);
+			RTFGetToken(info);
                         if (info->rtfClass != rtfText)
-		{
+		        {
                                 ERR("The token behind \\u is not text, but (%d,%d,%d)\n",
 				info->rtfClass, info->rtfMajor, info->rtfMinor);
                                 RTFUngetToken(info);
+                                break;
                         }
 		}
 		break;
@@ -2642,64 +2663,117 @@ static void SpecialChar (RTF_Info *info)
 	case rtfRow:
 	case rtfLine:
 	case rtfPar:
-		PutLitChar (info, '\n');
+		RTFPutUnicodeChar (info, '\n');
 		break;
 	case rtfCell:
-                PutLitChar (info, ' ');	/* make sure cells are separated */
+                RTFPutUnicodeChar (info, ' ');	/* make sure cells are separated */
 		break;
 	case rtfNoBrkSpace:
-		PutLitChar (info, 0x00A0);
+		RTFPutUnicodeChar (info, 0x00A0);
 		break;
 	case rtfTab:
-		PutLitChar (info, '\t');
+		RTFPutUnicodeChar (info, '\t');
 		break;
 	case rtfNoBrkHyphen:
-		PutLitChar (info, 0x2011);
+		RTFPutUnicodeChar (info, 0x2011);
 		break;
 	case rtfBullet:
-		PutLitChar (info, 0x2022);
+		RTFPutUnicodeChar (info, 0x2022);
 		break;
 	case rtfEmDash:
-		PutLitChar (info, 0x2014);
+		RTFPutUnicodeChar (info, 0x2014);
 		break;
 	case rtfEnDash:
-		PutLitChar (info, 0x2013);
+		RTFPutUnicodeChar (info, 0x2013);
 		break;
 	case rtfLQuote:
-		PutLitChar (info, 0x2018);
+		RTFPutUnicodeChar (info, 0x2018);
 		break;
 	case rtfRQuote:
-		PutLitChar (info, 0x2019);
+		RTFPutUnicodeChar (info, 0x2019);
 		break;
 	case rtfLDblQuote:
-		PutLitChar (info, 0x201C);
+		RTFPutUnicodeChar (info, 0x201C);
 		break;
 	case rtfRDblQuote:
-		PutLitChar (info, 0x201D);
+		RTFPutUnicodeChar (info, 0x201D);
 		break;
 	}
 }
 
 
-static void PutLitChar (RTF_Info *info, int c)
+static void
+RTFFlushUnicodeOutputBuffer(RTF_Info *info)
 {
-	if( info->dwOutputCount >= ( sizeof info->OutputBuffer - 1 ) )
-		RTFFlushOutputBuffer( info );
-	info->OutputBuffer[info->dwOutputCount++] = c;
+        if (info->dwOutputCount)
+        {
+                ME_InsertTextFromCursor(info->editor, 0, info->OutputBuffer,
+                                        info->dwOutputCount, info->style);
+                info->dwOutputCount = 0;
+        }
 }
 
-static void RTFOutputUnicodeString( RTF_Info *info, WCHAR *str, int len )
+static void
+RTFPutUnicodeString(RTF_Info *info, WCHAR *string, int length)
 {
-	assert(str[len] == '\0');
-	if (len) {
-		ME_InsertTextFromCursor( info->editor, 0, str, len, info->style );	  
-	}
+        if (info->dwCPOutputCount)
+                RTFFlushCPOutputBuffer(info);
+        while (length)
+        {
+                int fit = min(length, sizeof(info->OutputBuffer) / sizeof(WCHAR) - info->dwOutputCount);
+
+                memmove(info->OutputBuffer + info->dwOutputCount, string, fit * sizeof(WCHAR));
+                if (fit == sizeof(info->OutputBuffer) / sizeof(WCHAR) - info->dwOutputCount)
+                        RTFFlushUnicodeOutputBuffer(info);
+                else
+                        info->dwOutputCount += fit;
+                length -= fit;
+                string += fit;
+        }
+}
+
+static void
+RTFFlushCPOutputBuffer(RTF_Info *info)
+{
+        int bufferMax = info->dwCPOutputCount * 2 * sizeof(WCHAR);
+        WCHAR *buffer = (WCHAR *)RTFAlloc(bufferMax);
+        int length;
+
+        length = MultiByteToWideChar(info->codePage, 0, info->cpOutputBuffer,
+                                     info->dwCPOutputCount, buffer, bufferMax);
+        info->dwCPOutputCount = 0;
+
+        RTFPutUnicodeString(info, buffer, length);
+        RTFFree((char *)buffer);
 }
 
+void
+RTFFlushOutputBuffer(RTF_Info *info)
+{
+        if (info->dwCPOutputCount)
+                RTFFlushCPOutputBuffer(info);
+        RTFFlushUnicodeOutputBuffer(info);
+}
 
-void RTFFlushOutputBuffer( RTF_Info *info )
+static void
+RTFPutUnicodeChar(RTF_Info *info, int c)
 {
-	info->OutputBuffer[info->dwOutputCount] = 0;
-	RTFOutputUnicodeString(info, info->OutputBuffer, info->dwOutputCount);
-	info->dwOutputCount = 0;
+	if (info->dwCPOutputCount)
+                RTFFlushCPOutputBuffer(info);
+        if (info->dwOutputCount * sizeof(WCHAR) >= ( sizeof info->OutputBuffer - 1 ) )
+		RTFFlushUnicodeOutputBuffer( info );
+	info->OutputBuffer[info->dwOutputCount++] = c;
+}
+
+static void
+RTFPutCodePageChar(RTF_Info *info, int c)
+{
+        /* Use dynamic buffer here because it's the best way to handle
+         * MBCS codepages without having to worry about partial chars */
+        if (info->dwCPOutputCount >= info->dwMaxCPOutputCount)
+        {
+                info->dwMaxCPOutputCount *= 2;
+                info->cpOutputBuffer = RTFReAlloc(info->cpOutputBuffer, info->dwMaxCPOutputCount);
+        }
+        info->cpOutputBuffer[info->dwCPOutputCount++] = c;
 }
Index: dlls/riched20/rtf.h
===================================================================
RCS file: /home/wine/wine/dlls/riched20/rtf.h,v
retrieving revision 1.6
diff -p -u -r1.6 rtf.h
--- dlls/riched20/rtf.h	17 Mar 2005 13:57:27 -0000	1.6
+++ dlls/riched20/rtf.h	17 Mar 2005 22:36:53 -0000
@@ -353,7 +353,7 @@
 # define		rtfRTLDoc		76	/* new in 1.10 */
 # define		rtfLTRDoc		77	/* new in 1.10 */
 # define		rtfAnsiCodePage		78
-# define		rtfUnicodeLength	79
+# define		rtfUTF8RTF		79
 
 # define	rtfSectAttr	9
 # define		rtfSectDef		0
@@ -595,6 +595,7 @@
 # define		rtfCharCharSet		33	/* new in 1.10 */
 # define		rtfLanguage		34
 # define		rtfGray			35
+# define		rtfUnicodeLength	36
 
 # define	rtfPictAttr	13
 # define		rtfMacQD		0
@@ -933,20 +934,6 @@
 # define	rtfLangUrdu			0x0420
 
 /*
- * CharSet indices
- */
-
-# define	rtfCSGeneral	0	/* general (default) charset */
-# define	rtfCSSymbol	1	/* symbol charset */
-
-/*
- * Flags for auto-charset-processing.  Both are on by default.
- */
-
-# define	rtfReadCharSet		0x01	/* auto-read charset files */
-# define	rtfSwitchCharSet	0x02	/* auto-switch charset maps */
-
-/*
  * Style types
  */
 
@@ -1026,23 +1013,25 @@ struct RTFStyleElt
 
 # define        New(t)  ((t *) RTFAlloc ((int) sizeof (t)))
 
-/* maximum number of character values representable in a byte */
-
-# define        charSetSize             256
-
-/* charset stack size */
+/* Parser stack size */
 
-# define        maxCSStack              10
-
-/* character format stack size */
-
-# define        maxCharFormatStack      32
+# define        maxStack      32
 
 struct _RTF_Info;
 typedef struct _RTF_Info RTF_Info;
 
 typedef	void (*RTFFuncPtr) (RTF_Info *);		/* generic function pointer */
 
+
+/* RTF parser stack element */
+struct tagRTFState {
+        CHARFORMAT2W fmt;
+        int codePage;
+        int unicodeLength;
+};
+typedef struct tagRTFState RTFState;
+
+
 struct _RTF_Info {
     /*
      * Public variables (listed in rtf.h)
@@ -1087,8 +1076,9 @@ struct _RTF_Info {
     RTFColor	*colorList;	/* initialized to NULL */
     RTFStyle	*styleList;
     int ansiCodePage; /* ANSI codepage used in conversion to Unicode */
-    int unicodeLength; /* The length of ANSI representation of Unicode characters */
 
+    /* Character attributes */
+    int unicodeLength; /* The length of ANSI representation of Unicode characters */
     int codePage; /* Current codepage for text conversion */
 
     char *inputName;
@@ -1118,9 +1108,12 @@ struct _RTF_Info {
     DWORD    dwOutputCount;
     WCHAR    OutputBuffer[0x1000];
 
-    CHARFORMAT2W     formatStack[maxCharFormatStack];
-    int              codePageStack[maxCharFormatStack];
-    int              formatStackTop;
+    DWORD    dwCPOutputCount;
+    DWORD    dwMaxCPOutputCount;
+    char     *cpOutputBuffer;
+
+    RTFState         stack[maxStack];
+    int              stackTop;
 };
 
 
@@ -1129,6 +1122,7 @@ struct _RTF_Info {
  */
 
 void		RTFInit (RTF_Info *);
+void	        RTFDestroy(RTF_Info *info);
 void		RTFSetInputName (RTF_Info *, char *);
 char		*RTFGetInputName (RTF_Info *);
 void		RTFSetOutputName (RTF_Info *, char *);



More information about the wine-patches mailing list