RICHEDIT: RTF reader i18n #2
Phil Krylov
phil at newstar.rinet.ru
Thu Mar 17 17:00:04 CST 2005
ChangeLog:
Fixed support for RTF documents using the ANSI charset and added support
for multibyte charsets, so that BIG5 and UTF-8 RTF documents now
work.
Patch:
Index: dlls/riched20/editor.c
===================================================================
RCS file: /home/wine/wine/dlls/riched20/editor.c,v
retrieving revision 1.15
diff -p -u -r1.15 editor.c
--- dlls/riched20/editor.c 17 Mar 2005 13:57:27 -0000 1.15
+++ dlls/riched20/editor.c 17 Mar 2005 22:36:51 -0000
@@ -396,23 +396,24 @@ void ME_RTFReadHook(RTF_Info *info) {
switch(info->rtfMajor)
{
case rtfBeginGroup:
- if (info->formatStackTop < maxCharFormatStack) {
- info->formatStack[info->formatStackTop].cbSize = sizeof(info->formatStack[0]);
- memcpy(&info->formatStack[info->formatStackTop], &info->style->fmt, sizeof(CHARFORMAT2W));
- info->codePageStack[info->formatStackTop] = info->codePage;
+ if (info->stackTop < maxStack) {
+ memcpy(&info->stack[info->stackTop].fmt, &info->style->fmt, sizeof(CHARFORMAT2W));
+ info->stack[info->stackTop].codePage = info->codePage;
+ info->stack[info->stackTop].unicodeLength = info->unicodeLength;
}
- info->formatStackTop++;
+ info->stackTop++;
break;
case rtfEndGroup:
{
ME_Style *s;
RTFFlushOutputBuffer(info);
- info->formatStackTop--;
+ info->stackTop--;
/* FIXME too slow ? how come ? */
- s = ME_ApplyStyle(info->style, &info->formatStack[info->formatStackTop]);
+ s = ME_ApplyStyle(info->style, &info->stack[info->stackTop].fmt);
ME_ReleaseStyle(info->style);
info->style = s;
- info->codePage = info->codePageStack[info->formatStackTop];
+ info->codePage = info->stack[info->stackTop].codePage;
+ info->unicodeLength = info->stack[info->stackTop].unicodeLength;
break;
}
}
@@ -476,6 +477,7 @@ static LRESULT ME_StreamIn(ME_TextEditor
/* do the parsing */
RTFRead(&parser);
RTFFlushOutputBuffer(&parser);
+ RTFDestroy(&parser);
style = parser.style;
}
Index: dlls/riched20/reader.c
===================================================================
RCS file: /home/wine/wine/dlls/riched20/reader.c,v
retrieving revision 1.8
diff -p -u -r1.8 reader.c
--- dlls/riched20/reader.c 17 Mar 2005 13:57:27 -0000 1.8
+++ dlls/riched20/reader.c 17 Mar 2005 22:36:52 -0000
@@ -67,12 +67,13 @@ static void LookupInit (void);
static void Lookup (RTF_Info *, char *);
static int Hash (char*);
-static void RTFOutputUnicodeString( RTF_Info *info, WCHAR *str, int len );
-
static void CharAttr(RTF_Info *info);
static void CharSet(RTF_Info *info);
static void DocAttr(RTF_Info *info);
+static void RTFFlushCPOutputBuffer(RTF_Info *info);
+static void RTFPutCodePageChar(RTF_Info *info, int c);
+
int _RTFGetChar(RTF_Info *info)
{
@@ -113,6 +114,58 @@ void RTFSetEditStream(RTF_Info *info, ED
info->editstream.pfnCallback = es->pfnCallback;
}
+static void
+RTFDestroyAttrs(RTF_Info *info)
+{
+ RTFColor *cp;
+ RTFFont *fp;
+ RTFStyle *sp;
+ RTFStyleElt *eltList, *ep;
+
+ while (info->fontList != (RTFFont *) NULL)
+ {
+ fp = info->fontList->rtfNextFont;
+ RTFFree (info->fontList->rtfFName);
+ RTFFree ((char *) info->fontList);
+ info->fontList = fp;
+ }
+ while (info->colorList != (RTFColor *) NULL)
+ {
+ cp = info->colorList->rtfNextColor;
+ RTFFree ((char *) info->colorList);
+ info->colorList = cp;
+ }
+ while (info->styleList != (RTFStyle *) NULL)
+ {
+ sp = info->styleList->rtfNextStyle;
+ eltList = info->styleList->rtfSSEList;
+ while (eltList != (RTFStyleElt *) NULL)
+ {
+ ep = eltList->rtfNextSE;
+ RTFFree (eltList->rtfSEText);
+ RTFFree ((char *) eltList);
+ eltList = ep;
+ }
+ RTFFree (info->styleList->rtfSName);
+ RTFFree ((char *) info->styleList);
+ info->styleList = sp;
+ }
+}
+
+
+void
+RTFDestroy(RTF_Info *info)
+{
+ if (info->rtfTextBuf)
+ {
+ RTFFree(info->rtfTextBuf);
+ RTFFree(info->pushedTextBuf);
+ }
+ RTFDestroyAttrs(info);
+ RTFFree(info->cpOutputBuffer);
+}
+
+
/*
* Initialize the reader. This may be called multiple times,
* to read multiple files. The only thing not reset is the input
@@ -122,10 +175,6 @@ void RTFSetEditStream(RTF_Info *info, ED
void RTFInit(RTF_Info *info)
{
int i;
- RTFColor *cp;
- RTFFont *fp;
- RTFStyle *sp;
- RTFStyleElt *eltList, *ep;
TRACE("\n");
@@ -164,36 +213,10 @@ void RTFInit(RTF_Info *info)
/* dump old lists if necessary */
- while (info->fontList != (RTFFont *) NULL)
- {
- fp = info->fontList->rtfNextFont;
- RTFFree (info->fontList->rtfFName);
- RTFFree ((char *) info->fontList);
- info->fontList = fp;
- }
- while (info->colorList != (RTFColor *) NULL)
- {
- cp = info->colorList->rtfNextColor;
- RTFFree ((char *) info->colorList);
- info->colorList = cp;
- }
- while (info->styleList != (RTFStyle *) NULL)
- {
- sp = info->styleList->rtfNextStyle;
- eltList = info->styleList->rtfSSEList;
- while (eltList != (RTFStyleElt *) NULL)
- {
- ep = eltList->rtfNextSE;
- RTFFree (eltList->rtfSEText);
- RTFFree ((char *) eltList);
- eltList = ep;
- }
- RTFFree (info->styleList->rtfSName);
- RTFFree ((char *) info->styleList);
- info->styleList = sp;
- }
+ RTFDestroyAttrs(info);
info->ansiCodePage = 1252; /* Latin-1 */
+
info->unicodeLength = 1; /* \uc1 is the default */
info->codePage = info->ansiCodePage;
@@ -205,6 +228,13 @@ void RTFInit(RTF_Info *info)
info->rtfLinePos = 0;
info->prevChar = EOF;
info->bumpLine = 0;
+
+ info->dwCPOutputCount = 0;
+ if (!info->cpOutputBuffer)
+ {
+ info->dwMaxCPOutputCount = 0x1000;
+ info->cpOutputBuffer = RTFAlloc(info->dwMaxCPOutputCount);
+ }
}
/*
@@ -475,17 +505,6 @@ static void _RTFGetToken(RTF_Info *info)
}
-static WCHAR
-RTFANSIToUnicode(RTF_Info *info, char c)
-{
- WCHAR buffer[2] = { 0, 0 };
-
- /* TODO: Probably caching codepage conversion tables would be faster... */
- MultiByteToWideChar(info->codePage, 0, &c, 1, buffer, 2);
- return buffer[0];
-}
-
-
static int
RTFCharSetToCodePage(RTF_Info *info, int charset)
{
@@ -493,7 +512,7 @@ RTFCharSetToCodePage(RTF_Info *info, int
{
case ANSI_CHARSET:
case DEFAULT_CHARSET:
- return 0;
+ return info->ansiCodePage;
case SYMBOL_CHARSET:
return CP_SYMBOL;
case MAC_CHARSET:
@@ -603,10 +622,6 @@ static void _RTFGetToken2(RTF_Info *info
else
{
info->rtfClass = rtfText;
-
- if (c & 0x80)
- info->rtfMajor = RTFANSIToUnicode(info, c);
- else
info->rtfMajor = c;
}
return;
@@ -632,7 +647,7 @@ static void _RTFGetToken2(RTF_Info *info
{
/* should do isxdigit check! */
info->rtfClass = rtfText;
- info->rtfMajor = RTFANSIToUnicode(info, RTFCharToHex (c) * 16 + RTFCharToHex (c2));
+ info->rtfMajor = RTFCharToHex (c) * 16 + RTFCharToHex (c2);
return;
}
/* early eof, whoops (class is rtfUnknown) */
@@ -1416,6 +1431,7 @@ static RTFKey rtfKey[] =
{ rtfCharAttr, rtfLanguage, "lang", 0 },
/* this has disappeared from spec 1.2 */
{ rtfCharAttr, rtfGray, "gray", 0 },
+ { rtfCharAttr, rtfUnicodeLength, "uc", 0 },
/*
* Paragraph formatting attributes
@@ -1704,9 +1720,9 @@ static RTFKey rtfKey[] =
{ rtfDocAttr, rtfRTLDoc, "rtldoc", 0 },
{ rtfDocAttr, rtfLTRDoc, "ltrdoc", 0 },
-
+
{ rtfDocAttr, rtfAnsiCodePage, "ansicpg", 0 },
- { rtfDocAttr, rtfUnicodeLength, "uc", 0 },
+ { rtfDocAttr, rtfUTF8RTF, "urtf", 0 },
/*
* Style attributes
@@ -2475,7 +2491,7 @@ static void TextClass (RTF_Info *info);
static void ControlClass (RTF_Info *info);
static void Destination (RTF_Info *info);
static void SpecialChar (RTF_Info *info);
-static void PutLitChar (RTF_Info *info, int c);
+static void RTFPutUnicodeChar (RTF_Info *info, int c);
/*
* Initialize the writer.
@@ -2499,14 +2515,13 @@ BeginFile (RTF_Info *info )
}
/*
- * Write out a character. Seems to work for the default ANSI codepage,
- * contrary to TextClass_orig.
+ * Write out a character.
*/
static void
TextClass (RTF_Info *info)
{
- PutLitChar (info, info->rtfMajor);
+ RTFPutCodePageChar(info, info->rtfMajor);
}
@@ -2530,7 +2545,7 @@ ControlClass (RTF_Info *info)
DocAttr(info);
break;
case rtfSpecialChar:
- SpecialChar (info);
+ SpecialChar (info);
break;
}
}
@@ -2539,10 +2554,19 @@ ControlClass (RTF_Info *info)
static void
CharAttr(RTF_Info *info)
{
+ RTFFont *font;
+
switch (info->rtfMinor)
{
case rtfFontNum:
- info->codePage = RTFGetFont(info, info->rtfParam)->rtfFCodePage;
+ font = RTFGetFont(info, info->rtfParam);
+ if (font)
+ info->codePage = font->rtfFCodePage;
+ else
+ RTFMsg(info, "unknown font %d\n", info->rtfParam);
+ break;
+ case rtfUnicodeLength:
+ info->unicodeLength = info->rtfParam;
break;
}
}
@@ -2591,8 +2615,8 @@ DocAttr(RTF_Info *info)
case rtfAnsiCodePage:
info->ansiCodePage = info->rtfParam;
break;
- case rtfUnicodeLength:
- info->unicodeLength = info->rtfParam;
+ case rtfUTF8RTF:
+ info->ansiCodePage = CP_UTF8;
break;
}
}
@@ -2616,23 +2640,20 @@ static void SpecialChar (RTF_Info *info)
break;
case rtfUnicode:
{
- WCHAR buf[2];
int i;
-
- buf[0] = info->rtfParam;
- buf[1] = 0;
- RTFFlushOutputBuffer(info);
- RTFOutputUnicodeString(info, buf, 1);
+
+ RTFPutUnicodeChar(info, info->rtfParam);
/* After \u we must skip number of character tokens set by \ucN */
for (i = 0; i < info->unicodeLength; i++)
{
- RTFGetToken(info);
+ RTFGetToken(info);
if (info->rtfClass != rtfText)
- {
+ {
ERR("The token behind \\u is not text, but (%d,%d,%d)\n",
info->rtfClass, info->rtfMajor, info->rtfMinor);
RTFUngetToken(info);
+ break;
}
}
break;
@@ -2642,64 +2663,117 @@ static void SpecialChar (RTF_Info *info)
case rtfRow:
case rtfLine:
case rtfPar:
- PutLitChar (info, '\n');
+ RTFPutUnicodeChar (info, '\n');
break;
case rtfCell:
- PutLitChar (info, ' '); /* make sure cells are separated */
+ RTFPutUnicodeChar (info, ' '); /* make sure cells are separated */
break;
case rtfNoBrkSpace:
- PutLitChar (info, 0x00A0);
+ RTFPutUnicodeChar (info, 0x00A0);
break;
case rtfTab:
- PutLitChar (info, '\t');
+ RTFPutUnicodeChar (info, '\t');
break;
case rtfNoBrkHyphen:
- PutLitChar (info, 0x2011);
+ RTFPutUnicodeChar (info, 0x2011);
break;
case rtfBullet:
- PutLitChar (info, 0x2022);
+ RTFPutUnicodeChar (info, 0x2022);
break;
case rtfEmDash:
- PutLitChar (info, 0x2014);
+ RTFPutUnicodeChar (info, 0x2014);
break;
case rtfEnDash:
- PutLitChar (info, 0x2013);
+ RTFPutUnicodeChar (info, 0x2013);
break;
case rtfLQuote:
- PutLitChar (info, 0x2018);
+ RTFPutUnicodeChar (info, 0x2018);
break;
case rtfRQuote:
- PutLitChar (info, 0x2019);
+ RTFPutUnicodeChar (info, 0x2019);
break;
case rtfLDblQuote:
- PutLitChar (info, 0x201C);
+ RTFPutUnicodeChar (info, 0x201C);
break;
case rtfRDblQuote:
- PutLitChar (info, 0x201D);
+ RTFPutUnicodeChar (info, 0x201D);
break;
}
}
-static void PutLitChar (RTF_Info *info, int c)
+static void
+RTFFlushUnicodeOutputBuffer(RTF_Info *info)
{
- if( info->dwOutputCount >= ( sizeof info->OutputBuffer - 1 ) )
- RTFFlushOutputBuffer( info );
- info->OutputBuffer[info->dwOutputCount++] = c;
+ if (info->dwOutputCount)
+ {
+ ME_InsertTextFromCursor(info->editor, 0, info->OutputBuffer,
+ info->dwOutputCount, info->style);
+ info->dwOutputCount = 0;
+ }
}
-static void RTFOutputUnicodeString( RTF_Info *info, WCHAR *str, int len )
+static void
+RTFPutUnicodeString(RTF_Info *info, WCHAR *string, int length)
{
- assert(str[len] == '\0');
- if (len) {
- ME_InsertTextFromCursor( info->editor, 0, str, len, info->style );
- }
+ if (info->dwCPOutputCount)
+ RTFFlushCPOutputBuffer(info);
+ while (length)
+ {
+ int fit = min(length, sizeof(info->OutputBuffer) / sizeof(WCHAR) - info->dwOutputCount);
+
+ memmove(info->OutputBuffer + info->dwOutputCount, string, fit * sizeof(WCHAR));
+ if (fit == sizeof(info->OutputBuffer) / sizeof(WCHAR) - info->dwOutputCount)
+ RTFFlushUnicodeOutputBuffer(info);
+ else
+ info->dwOutputCount += fit;
+ length -= fit;
+ string += fit;
+ }
+}
+
+static void
+RTFFlushCPOutputBuffer(RTF_Info *info)
+{
+ int bufferMax = info->dwCPOutputCount * 2 * sizeof(WCHAR);
+ WCHAR *buffer = (WCHAR *)RTFAlloc(bufferMax);
+ int length;
+
+ length = MultiByteToWideChar(info->codePage, 0, info->cpOutputBuffer,
+ info->dwCPOutputCount, buffer, bufferMax);
+ info->dwCPOutputCount = 0;
+
+ RTFPutUnicodeString(info, buffer, length);
+ RTFFree((char *)buffer);
}
+void
+RTFFlushOutputBuffer(RTF_Info *info)
+{
+ if (info->dwCPOutputCount)
+ RTFFlushCPOutputBuffer(info);
+ RTFFlushUnicodeOutputBuffer(info);
+}
-void RTFFlushOutputBuffer( RTF_Info *info )
+static void
+RTFPutUnicodeChar(RTF_Info *info, int c)
{
- info->OutputBuffer[info->dwOutputCount] = 0;
- RTFOutputUnicodeString(info, info->OutputBuffer, info->dwOutputCount);
- info->dwOutputCount = 0;
+ if (info->dwCPOutputCount)
+ RTFFlushCPOutputBuffer(info);
+ if (info->dwOutputCount * sizeof(WCHAR) >= ( sizeof info->OutputBuffer - 1 ) )
+ RTFFlushUnicodeOutputBuffer( info );
+ info->OutputBuffer[info->dwOutputCount++] = c;
+}
+
+static void
+RTFPutCodePageChar(RTF_Info *info, int c)
+{
+ /* Use dynamic buffer here because it's the best way to handle
+ * MBCS codepages without having to worry about partial chars */
+ if (info->dwCPOutputCount >= info->dwMaxCPOutputCount)
+ {
+ info->dwMaxCPOutputCount *= 2;
+ info->cpOutputBuffer = RTFReAlloc(info->cpOutputBuffer, info->dwMaxCPOutputCount);
+ }
+ info->cpOutputBuffer[info->dwCPOutputCount++] = c;
}
Index: dlls/riched20/rtf.h
===================================================================
RCS file: /home/wine/wine/dlls/riched20/rtf.h,v
retrieving revision 1.6
diff -p -u -r1.6 rtf.h
--- dlls/riched20/rtf.h 17 Mar 2005 13:57:27 -0000 1.6
+++ dlls/riched20/rtf.h 17 Mar 2005 22:36:53 -0000
@@ -353,7 +353,7 @@
# define rtfRTLDoc 76 /* new in 1.10 */
# define rtfLTRDoc 77 /* new in 1.10 */
# define rtfAnsiCodePage 78
-# define rtfUnicodeLength 79
+# define rtfUTF8RTF 79
# define rtfSectAttr 9
# define rtfSectDef 0
@@ -595,6 +595,7 @@
# define rtfCharCharSet 33 /* new in 1.10 */
# define rtfLanguage 34
# define rtfGray 35
+# define rtfUnicodeLength 36
# define rtfPictAttr 13
# define rtfMacQD 0
@@ -933,20 +934,6 @@
# define rtfLangUrdu 0x0420
/*
- * CharSet indices
- */
-
-# define rtfCSGeneral 0 /* general (default) charset */
-# define rtfCSSymbol 1 /* symbol charset */
-
-/*
- * Flags for auto-charset-processing. Both are on by default.
- */
-
-# define rtfReadCharSet 0x01 /* auto-read charset files */
-# define rtfSwitchCharSet 0x02 /* auto-switch charset maps */
-
-/*
* Style types
*/
@@ -1026,23 +1013,25 @@ struct RTFStyleElt
# define New(t) ((t *) RTFAlloc ((int) sizeof (t)))
-/* maximum number of character values representable in a byte */
-
-# define charSetSize 256
-
-/* charset stack size */
+/* Parser stack size */
-# define maxCSStack 10
-
-/* character format stack size */
-
-# define maxCharFormatStack 32
+# define maxStack 32
struct _RTF_Info;
typedef struct _RTF_Info RTF_Info;
typedef void (*RTFFuncPtr) (RTF_Info *); /* generic function pointer */
+
+/* RTF parser stack element */
+struct tagRTFState {
+ CHARFORMAT2W fmt;
+ int codePage;
+ int unicodeLength;
+};
+typedef struct tagRTFState RTFState;
+
+
struct _RTF_Info {
/*
* Public variables (listed in rtf.h)
@@ -1087,8 +1076,9 @@ struct _RTF_Info {
RTFColor *colorList; /* initialized to NULL */
RTFStyle *styleList;
int ansiCodePage; /* ANSI codepage used in conversion to Unicode */
- int unicodeLength; /* The length of ANSI representation of Unicode characters */
+ /* Character attributes */
+ int unicodeLength; /* The length of ANSI representation of Unicode characters */
int codePage; /* Current codepage for text conversion */
char *inputName;
@@ -1118,9 +1108,12 @@ struct _RTF_Info {
DWORD dwOutputCount;
WCHAR OutputBuffer[0x1000];
- CHARFORMAT2W formatStack[maxCharFormatStack];
- int codePageStack[maxCharFormatStack];
- int formatStackTop;
+ DWORD dwCPOutputCount;
+ DWORD dwMaxCPOutputCount;
+ char *cpOutputBuffer;
+
+ RTFState stack[maxStack];
+ int stackTop;
};
@@ -1129,6 +1122,7 @@ struct _RTF_Info {
*/
void RTFInit (RTF_Info *);
+void RTFDestroy(RTF_Info *info);
void RTFSetInputName (RTF_Info *, char *);
char *RTFGetInputName (RTF_Info *);
void RTFSetOutputName (RTF_Info *, char *);
More information about the wine-patches
mailing list