[PATCH] mlang: Add basic implementation for IMultiLanguage3_DetectInputCodepage

Sun Nov 11 12:21:09 CST 2018

On 11/11/18 8:45 PM, Fabian Maurer wrote:
> This is a first basic implementation that can handle
> UTF-16/UTF-8 and should cover most text files.
>
> It's currently not used by any known program,
> but I intend to use the function for find.exe to detect encodings.
>
> Signed-off-by: Fabian Maurer <dark.shadow4 at web.de>
> ---
>   dlls/mlang/mlang.c       |  35 +++++++++++-
>   dlls/mlang/tests/mlang.c | 114 +++++++++++++++++++++++++++++++++++++++
>   include/mlang.idl        |  12 ++++-
>   3 files changed, 158 insertions(+), 3 deletions(-)
>
> diff --git a/dlls/mlang/mlang.c b/dlls/mlang/mlang.c
> index f12df298f1..e33c982c76 100644
> --- a/dlls/mlang/mlang.c
> +++ b/dlls/mlang/mlang.c
> @@ -3097,8 +3097,39 @@ static HRESULT WINAPI fnIMultiLanguage3_DetectInputCodepage(
>       DetectEncodingInfo* lpEncoding,
>       INT* pnScores)
>   {
> -    FIXME("\n");
> -    return E_NOTIMPL;
> +    INT test;
> +
> +    FIXME("(%u %u, %p, %p, %p, %p - semi-stub!\n", dwFlag, dwPrefWinCodePage, pSrcStr, pcSrcSize, lpEncoding, pnScores);
> +
> +    if (!pSrcStr || !lpEncoding || *pcSrcSize <= 0 || *pnScores <= 0)
> +        return E_INVALIDARG;
> +
> +    test = IS_TEXT_UNICODE_SIGNATURE | IS_TEXT_UNICODE_REVERSE_SIGNATURE;
> +    IsTextUnicode(pSrcStr, *pcSrcSize, &test);
> +
> +    if (test & IS_TEXT_UNICODE_SIGNATURE)
> +    {
> +        *pnScores = 1;
> +        lpEncoding[0].nCodePage = 1200;
> +        return S_OK;
> +    }
> +
> +    if (test & IS_TEXT_UNICODE_REVERSE_SIGNATURE)
> +    {
> +        *pnScores = 1;
> +        lpEncoding[0].nCodePage = 1201;
> +        return S_OK;
> +    }
> +
> +    /* Check for valid UTF-8 */
> +    if (MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, pSrcStr, *pcSrcSize, NULL, 0) > 0)
> +    {
> +        *pnScores = 1;
> +        lpEncoding[0].nCodePage = 65001;
> +        return S_OK;
> +    }
> +
> +    return S_FALSE;
>   }

MLang is clearly more sophisticated than IsTextUnicode(), judging just
by the returned arguments. I think it's possible that a dictionary lookup is involved.

Can you use IsTextUnicode() in find.exe instead? And more importantly, do
we have find.exe tests that prove it has to be supported in the first place?

E.g. does it work if you have a UTF-16 encoded file and the string you're
looking for is in the console CP, or does it instead compare byte sequences,
accounting for UTF-8/UTF-16 line separators?

>   
>   static HRESULT WINAPI fnIMultiLanguage3_ValidateCodePage(
> diff --git a/dlls/mlang/tests/mlang.c b/dlls/mlang/tests/mlang.c
> index b5d6fc6114..f57a870d93 100644
> --- a/dlls/mlang/tests/mlang.c
> +++ b/dlls/mlang/tests/mlang.c
> @@ -2695,6 +2695,119 @@ static void test_MapFont(IMLangFontLink *font_link, IMLangFontLink2 *font_link2)
>       ReleaseDC(NULL, hdc);
>   }
>   
> +static void test_DetectInputCodepage(IMultiLanguage2 *ml2)
> +{
> +    static char str_empty[] = {0};
> +    static char str_utf8_bom1[] = "\xef\xbb\xbf this is a test string with utf8 bom";
> +    static char str_utf8_bom2[] = "\xef\xbb\xbf this is a test string with utf8 bom this is a test string with utf8 bom this is a test string with utf8 bom"
> +        "this is a test string with utf8 bom this is a test string with utf8 bom this is a test string with utf8 bom this is a test string with utf8 bom";
> +    static char str_shift_jis[] = {0x82, 0xB1, 0x82, 0xEA, 0x82, 0xCD, 0x93, 0xFA, 0x96, 0x7B, 0x8C, 0xEA, 0x82, 0xCC, 0x83, 0x65, 0x83, 0x4C, 0x83, 0x58, 0x83, 0x67, 0x82, 0xC5, 0x82, 0xB7, 0x00 };
> +    static char str_utf16_be_with_bom[] = {
> +        0xFE, 0xFF, 0x00, 0x54, 0x00, 0x68, 0x00, 0x69, 0x00, 0x73, 0x00, 0x20, 0x00, 0x69, 0x00, 0x73,
> +        0x00, 0x20, 0x00, 0x74, 0x00, 0x65, 0x00, 0x78, 0x00, 0x74, 0x00, 0x20, 0x00, 0x69, 0x00, 0x6E,
> +        0x00, 0x20, 0x00, 0x55, 0x00, 0x54, 0x00, 0x46, 0x00, 0x31, 0x00, 0x36, 0x00 };
> +    char *str_utf16_be_without_bom = &str_utf16_be_with_bom[2];
> +    static char  str_utf16_le_with_bom[] = {
> +        0xFF, 0xFE, 0x54, 0x00, 0x68, 0x00, 0x69, 0x00, 0x73, 0x00, 0x20, 0x00, 0x69, 0x00, 0x73, 0x00,
> +        0x20, 0x00, 0x74, 0x00, 0x65, 0x00, 0x78, 0x00, 0x74, 0x00, 0x20, 0x00, 0x69, 0x00, 0x6E, 0x00,
> +        0x20, 0x00, 0x55, 0x00, 0x54, 0x00, 0x46, 0x00, 0x31, 0x00, 0x36, 0x00, 0x00 };
> +    char *str_utf16_le_without_bom = &str_utf16_le_with_bom[2];
> +    static char str_utf8_hello_without_bom[] = { /* Hello in english, russian and japanese */
> +      0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x20, 0xD0, 0xBF, 0xD1, 0x80, 0xD0, 0xB8, 0xD0, 0xB2, 0xD0, 0xB5,
> +      0xD1, 0x82, 0x20, 0xE4, 0xBB, 0x8A, 0xE6, 0x97, 0xA5, 0xE3, 0x81, 0xAF, 0x00 };
> +
> +    DetectEncodingInfo encoding_info[5];
> +    HRESULT result;
> +    INT str_size;
> +    INT encoding_count;
> +
> +    /* Test error conditions */
> +
> +    str_size = sizeof(str_empty);
> +    encoding_count = ARRAY_SIZE(encoding_info);
> +    result = IMultiLanguage2_DetectInputCodepage(ml2, MLDETECTCP_NONE, 0, NULL, &str_size, encoding_info, &encoding_count);
> +    ok(result == E_INVALIDARG, "Expected E_INVALIDARG, got %#x\n", result);
> +    ok(5 == encoding_count, "Expected encoding_count to be %d, got %d\n", 5, encoding_count);
> +
> +    str_size = sizeof(str_empty);
> +    encoding_count = ARRAY_SIZE(encoding_info);
> +    result = IMultiLanguage2_DetectInputCodepage(ml2, MLDETECTCP_NONE, 0, str_empty, &str_size, NULL, &encoding_count);
> +    ok(result == E_INVALIDARG, "Expected E_INVALIDARG, got %#x\n", result);
> +    ok(5 == encoding_count, "Expected encoding_count to be %d, got %d\n", 5, encoding_count);
> +
> +    str_size = 0;
> +    encoding_count = ARRAY_SIZE(encoding_info);
> +    result = IMultiLanguage2_DetectInputCodepage(ml2, MLDETECTCP_NONE, 0, str_empty, &str_size, encoding_info, &encoding_count);
> +    ok(result == E_INVALIDARG, "Expected E_INVALIDARG, got %#x\n", result);
> +    ok(5 == encoding_count, "Expected encoding_count to be %d, got %d\n", 5, encoding_count);
> +
> +    str_size = sizeof(str_empty);
> +    encoding_count = 0;
> +    result = IMultiLanguage2_DetectInputCodepage(ml2, MLDETECTCP_NONE, 0, str_empty, &str_size, encoding_info, &encoding_count);
> +    ok(result == E_INVALIDARG, "Expected E_INVALIDARG, got %#x\n", result);
> +    ok(0 == encoding_count, "Expected encoding_count to be %d, got %d\n", 0, encoding_count);
> +
> +    /* Test strings */
> +
> +#define run_DetectInputCodepage(flags, codepage_default, str)                                                                   \
> +    str_size = sizeof(str);                                                                                                     \
> +    encoding_count = ARRAY_SIZE(encoding_info);                                                                                 \
> +    memset(&encoding_info, 0, sizeof(encoding_info));                                                                           \
> +    result = IMultiLanguage2_DetectInputCodepage(ml2, flags, codepage_default, str, &str_size, encoding_info, &encoding_count);

Do you need a macro for that?