[PATCH 1/5] kernelbase/locale: Implement sortkey generation on official tables

Wed Apr 29 19:43:12 CDT 2020

On 4/28/20 1:17 PM, Fabian Maurer wrote:
> Signed-off-by: Fabian Maurer <dark.shadow4 at web.de>
> ---
>   dlls/kernel32/tests/locale.c | 110 ++++++++
>   dlls/kernelbase/locale.c     | 477 ++++++++++++++++++++++++++---------
>   2 files changed, 464 insertions(+), 123 deletions(-)
> 

So as far as I understand, the sort key algorithm writes the level 0 
weights (script and alphabetic weight) for the whole string to the sort 
key, then the level 1 weights (diacritic), and so on, right?

In that case, what seems potentially simpler to me is to calculate those 
weights one level at a time, rather than one character at a time. That 
is, you'd end up doing something like

static int get_sortkey( DWORD flags, const WCHAR *src, int srclen, char *dst, int dstlen )
{
     int i, used = 0;

     /* level 0: main weights for the whole string */
     for (i = 0; i < srclen; ++i)
         used += get_main_weights(src[i], dst + used, dstlen - used);

     /* level 1: diacritic weights for the whole string */
     if (!(flags & NORM_IGNORENONSPACE))
     {
         for (i = 0; i < srclen; ++i)
             used += get_diacritic_weights(src[i], dst + used, dstlen - used);
     }
     ...
     return used;
}

This avoids the need to store temporary buffers.

As that example shows, I also think it's probably simpler to just pass 
the buffer directly to whatever functions are writing sortkey bytes into it.

> diff --git a/dlls/kernel32/tests/locale.c b/dlls/kernel32/tests/locale.c
> index 4c1e1b4d73..13839bb10a 100644
> --- a/dlls/kernel32/tests/locale.c
> +++ b/dlls/kernel32/tests/locale.c
> @@ -2681,6 +2681,13 @@ static void test_lcmapstring_unicode(lcmapstring_wrapper func_ptr, const char *f
>       lstrlenW(symbols_stripped) + 1, ret);
>       ok(!lstrcmpW(buf, symbols_stripped), "%s string comparison mismatch\n", func_name);
> 
> +    /* test small buffer */
> +    lstrcpyW(buf, fooW);
> +    ret = func_ptr(LCMAP_SORTKEY, lower_case, -1, buf, 2);
> +    ok(ret == 0, "Expected a failure\n");
> +    ok(GetLastError() == ERROR_INSUFFICIENT_BUFFER,
> +           "%s unexpected error code %d\n", func_name, GetLastError());;
> +
>       /* test srclen = 0 */
>       SetLastError(0xdeadbeef);
>       ret = func_ptr(0, upper_case, 0, buf, ARRAY_SIZE(buf));
> @@ -3108,6 +3115,108 @@ static void test_sorting(void)
>       }
>   }
> 
> +struct sorting_test_entry {
> +    const WCHAR* locale;
> +    DWORD flags;
> +    const WCHAR* first;
> +    const WCHAR* second;
> +    int result_sortkey;
> +    int result_compare;
> +    BOOL broken_on_old_win;
> +};
> +
> +static const struct sorting_test_entry unicode_sorting_tests[] =
> +{
> +    /*   0 */ { L"en-US", 0, L"\ue6e3\u0a02", L"\ue6e3\u20dc", CSTR_LESS_THAN, 0, TRUE }, /* Test default character, when there is main weight extra there must be no diacritic weight */
> +    /*   1 */ { L"en-US", 0, L"\u276a", L"\u2768", CSTR_GREATER_THAN }, /* Test symbols, must add diacritic weight */
> +    /*   2 */ { L"en-US", 0, L"\u204d", L"\uff02", CSTR_LESS_THAN }, /* Test symbols, must add case weight */
> +    /*   3 */ { L"en-US", 0, L"a \u2060 b", L"a  b", CSTR_EQUAL }, /* Test unsortable characters */
> +    /*   4 */ { L"en-US", 0, L"a \xfff0 b", L"a  b", CSTR_EQUAL }, /* Test invalid characters */
> +    /*   5 */ { L"en-US", 0, L"\x00fc", L"\x016d", CSTR_LESS_THAN },
> +    /*   6 */ { L"en-US", 0, L"\x3fcb\x7fd5", L"\x0006\x3032", CSTR_GREATER_THAN },
> +    /*   7 */ { L"en-US", 0, L"\x00fc\x30fd", L"\x00fa\x1833", CSTR_LESS_THAN },
> +    /*   8 */ { L"en-US", 0, L"\x0037", L"\x277c", CSTR_LESS_THAN, 0, TRUE }, /* Normal character */
> +    /*   9 */ { L"en-US", 0, L"\x1eca", L"\x1ecb", CSTR_GREATER_THAN }, /* Normal character */
> +    /*  10 */ { L"en-US", 0, L"\x1d05", L"\x1d48", CSTR_GREATER_THAN }, /* Normal character */
> +    /*  11 */ { L"en-US", 0, L"\x19d7", L"\x096d", CSTR_GREATER_THAN }, /* Normal character diacritics */
> +    /*  12 */ { L"en-US", 0, L"\x00f5", L"\x1ecf", CSTR_LESS_THAN }, /* Normal character diacritics */
> +    /*  13 */ { L"en-US", 0, L"\x2793", L"\x0d70", CSTR_LESS_THAN, 0, TRUE }, /* Normal character diacritics */
> +    /*  14 */ { L"en-US", 0, L"A", L"a", CSTR_GREATER_THAN }, /* Normal character case weights */
> +    /*  15 */ { L"en-US", 0, L"z", L"Z", CSTR_LESS_THAN }, /* Normal character case weights */
> +    /*  16 */ { L"en-US", 0, L"\xe5a6", L"\xe5a5\x0333", CSTR_GREATER_THAN, 0, TRUE }, /* CJK with extra value */
> +    /*  17 */ { L"en-US", 0, L"\xe5d7", L"\xe5d6\x0330", CSTR_GREATER_THAN, 0, TRUE }, /* CJK with extra value */
> +    /*  18 */ { L"en-US", 0, L"\x1B56\x0330", L"\x1096", CSTR_GREATER_THAN }, /* Diacritic is added */
> +    /*  19 */ { L"en-US", 0, L"\x1817\x0333", L"\x19d7", CSTR_GREATER_THAN }, /* Diacritic is added */
> +    /*  20 */ { L"en-US", 0, L"\x04de\x05ac", L"\x0499", CSTR_GREATER_THAN }, /* Diacritic is added */
> +    /*  21 */ { L"en-US", 0, L"\x01ba\x0654", L"\x01b8", CSTR_LESS_THAN }, /* Diacritic can overflow */
> +    /*  22 */ { L"en-US", 0, L"\x06b7\x06eb", L"\x06b6", CSTR_LESS_THAN }, /* Diacritic can overflow */
> +    /*  23 */ { L"en-US", 0, L"\x1420\x0333", L"\x141f", CSTR_LESS_THAN }, /* Diacritic can overflow */
> +    /*  24 */ { L"en-US", 0, L"\x11bc", L"\x110b", CSTR_GREATER_THAN }, /* Jamo case weight */
> +    /*  25 */ { L"en-US", 0, L"\x11c1", L"\x1111", CSTR_GREATER_THAN }, /* Jamo case weight */
> +    /*  26 */ { L"en-US", 0, L"\x11af", L"\x1105", CSTR_GREATER_THAN }, /* Jamo case weight */
> +    /*  27 */ { L"en-US", 0, L"\x11c2", L"\x11f5", CSTR_LESS_THAN }, /* Jamo main weight */
> +    /*  28 */ { L"en-US", 0, L"\x1108", L"\x1121", CSTR_LESS_THAN }, /* Jamo main weight */
> +    /*  29 */ { L"en-US", 0, L"\x1116", L"\x11c7", CSTR_LESS_THAN }, /* Jamo main weight */
> +    /*  30 */ { L"en-US", 0, L"\x11b1", L"\x11d1", CSTR_LESS_THAN }, /* Jamo main weight */
> +    /*  31 */ { L"en-US", 0, L"\x4550\x73d2", L"\x3211\x23ad", CSTR_GREATER_THAN }, /* Script 5 main weight 1 */
> +    /*  32 */ { L"en-US", 0, L"\x3265", L"\x4079", CSTR_LESS_THAN }, /* Script 5 main weight 1 */
> +    /*  33 */ { L"en-US", 0, L"\x4c19\x68d0\x52d0", L"\x316d", CSTR_GREATER_THAN }, /* Script 5 main weight 1 */
> +    /*  34 */ { L"en-US", 0, L"\x72dd", L"\x6b8a", CSTR_GREATER_THAN }, /* Script 5 main weight 2 */
> +    /*  35 */ { L"en-US", 0, L"\x6785\x3bff\x6f83", L"\x7550\x34c9\x71a7", CSTR_LESS_THAN }, /* Script 5 main weight 2 */
> +    /*  36 */ { L"en-US", 0, L"\x5d61", L"\x3aef", CSTR_LESS_THAN }, /* Script 5 main weight 2 */
> +    /*  37 */ { L"en-US", 0, L"\x207a", L"\xfe62", CSTR_GREATER_THAN }, /* Symbols case weights */
> +    /*  38 */ { L"en-US", 0, L"\xfe65", L"\xff1e", CSTR_GREATER_THAN }, /* Symbols case weights */
> +    /*  39 */ { L"en-US", 0, L"\x2502", L"\xffe8", CSTR_GREATER_THAN }, /* Symbols case weights */
> +    /*  40 */ { L"en-US", 0, L"\x21da", L"\x21dc", CSTR_LESS_THAN }, /* Symbols diacritic weights */
> +    /*  41 */ { L"en-US", 0, L"\x29fb", L"\x2295", CSTR_LESS_THAN }, /* Symbols diacritic weights */
> +    /*  42 */ { L"en-US", 0, L"\x0092", L"\x009c", CSTR_LESS_THAN }, /* Symbols diacritic weights */
> +    /*  43 */ { L"en-US", NORM_IGNORESYMBOLS, L"\x21da", L"\x21dc", CSTR_EQUAL }, /* NORM_IGNORESYMBOLS */
> +    /*  44 */ { L"en-US", NORM_IGNORESYMBOLS, L"\x29fb", L"\x2295", CSTR_EQUAL }, /* NORM_IGNORESYMBOLS */
> +    /*  45 */ { L"en-US", NORM_IGNORESYMBOLS, L"\x0092", L"\x009c", CSTR_EQUAL }, /* NORM_IGNORESYMBOLS */
> +    /*  46 */ { L"en-US", 0, L"\x3099", L"\x309a", CSTR_EQUAL }, /* MIN_WEIGHT */
> +    /*  47 */ { L"en-US", 0, L"\x309b", L"\x05a2", CSTR_EQUAL }, /* MIN_WEIGHT */
> +    /*  48 */ { L"en-US", 0, L"\xff9e", L"\x0e47", CSTR_EQUAL }, /* MIN_WEIGHT */
> +};
> +
> +static void test_unicode_sorting(void)
> +{
> +    int i;
> +    if (!pLCMapStringEx)
> +    {
> +
> +        win_skip("LCMapStringEx not available\n");
> +        return;
> +    }
> +    for (i = 0; i < ARRAY_SIZE(unicode_sorting_tests); i++)
> +    {
> +        int pos;
> +        BYTE buff1[1000];
> +        BYTE buff2[1000];
> +        int len1, len2;
> +        int result = CSTR_EQUAL;
> +        const struct sorting_test_entry* entry = &unicode_sorting_tests[i];
> +
> +        len1 = pLCMapStringEx(entry->locale, LCMAP_SORTKEY | entry->flags, entry->first, -1, (WCHAR*)buff1, ARRAY_SIZE(buff1), NULL, NULL, 0);
> +        len2 = pLCMapStringEx(entry->locale, LCMAP_SORTKEY | entry->flags, entry->second, -1, (WCHAR*)buff2, ARRAY_SIZE(buff2), NULL, NULL, 0);

Is there a reason to use LCMapStringEx() here rather than LCMapString()?

> +
> +        for (pos = 0; pos < len1 && pos < len2; pos++)
> +        {
> +            if (buff1[pos] > buff2[pos])
> +            {
> +                result = CSTR_GREATER_THAN;
> +                break;
> +            }
> +            else if (buff1[pos] < buff2[pos])
> +            {
> +                result = CSTR_LESS_THAN;
> +                break;
> +            }
> +        }
> +
> +        ok (result == entry->result_sortkey || broken(entry->broken_on_old_win), "Test %d - Expected %d, got %d\n", i, entry->result_sortkey, result);
> +    }
> +}
> +
>   static void test_FoldStringA(void)
>   {
>     int ret, i, j;
> @@ -6897,4 +7006,5 @@ START_TEST(locale)
>     test_NLSVersion();
>     /* this requires collation table patch to make it MS compatible */
>     if (0) test_sorting();

The fact that this test is disabled with "if (0)" never struck me as 
great. I'm pretty sure that with todo_wine added as appropriate, it could 
be enabled and pass. A first patch in this series could be to do that.

> +  test_unicode_sorting();
>   }
> diff --git a/dlls/kernelbase/locale.c b/dlls/kernelbase/locale.c
> index 53e4e42da3..74177371d9 100644
> --- a/dlls/kernelbase/locale.c
> +++ b/dlls/kernelbase/locale.c
> @@ -2126,127 +2126,6 @@ static int wcstombs_codepage( UINT codepage, DWORD flags, const WCHAR *src, int
>           return wcstombs_sbcs( info, src, srclen, dst, dstlen );
>   }
> 
> -
> -static int get_sortkey( DWORD flags, const WCHAR *src, int srclen, char *dst, int dstlen )
> -{
> -    WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
> -    int key_len[4];
> -    char *key_ptr[4];
> -    const WCHAR *src_save = src;
> -    int srclen_save = srclen;
> -
> -    key_len[0] = key_len[1] = key_len[2] = key_len[3] = 0;
> -    for (; srclen; srclen--, src++)
> -    {
> -        unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/
> -        dummy[0] = *src;
> -        if (decomposed_len)
> -        {
> -            for (i = 0; i < decomposed_len; i++)
> -            {
> -                WCHAR wch = dummy[i];
> -                unsigned int ce;
> -
> -                if ((flags & NORM_IGNORESYMBOLS) &&
> -                    (get_char_type( CT_CTYPE1, wch ) & (C1_PUNCT | C1_SPACE)))
> -                    continue;
> -
> -                if (flags & NORM_IGNORECASE) wch = casemap( nls_info.LowerCaseTable, wch );
> -
> -                ce = collation_table[collation_table[collation_table[wch >> 8] + ((wch >> 4) & 0x0f)] + (wch & 0xf)];
> -                if (ce != (unsigned int)-1)
> -                {
> -                    if (ce >> 16) key_len[0] += 2;
> -                    if ((ce >> 8) & 0xff) key_len[1]++;
> -                    if ((ce >> 4) & 0x0f) key_len[2]++;
> -                    if (ce & 1)
> -                    {
> -                        if (wch >> 8) key_len[3]++;
> -                        key_len[3]++;
> -                    }
> -                }
> -                else
> -                {
> -                    key_len[0] += 2;
> -                    if (wch >> 8) key_len[0]++;
> -                    if (wch & 0xff) key_len[0]++;
> -		}
> -            }
> -        }
> -    }
> -
> -    if (!dstlen) /* compute length */
> -        /* 4 * '\1' + key length */
> -        return key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4;
> -
> -    if (dstlen < key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1)
> -        return 0; /* overflow */
> -
> -    src = src_save;
> -    srclen = srclen_save;
> -
> -    key_ptr[0] = dst;
> -    key_ptr[1] = key_ptr[0] + key_len[0] + 1;
> -    key_ptr[2] = key_ptr[1] + key_len[1] + 1;
> -    key_ptr[3] = key_ptr[2] + key_len[2] + 1;
> -
> -    for (; srclen; srclen--, src++)
> -    {
> -        unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/
> -        dummy[0] = *src;
> -        if (decomposed_len)
> -        {
> -            for (i = 0; i < decomposed_len; i++)
> -            {
> -                WCHAR wch = dummy[i];
> -                unsigned int ce;
> -
> -                if ((flags & NORM_IGNORESYMBOLS) &&
> -                    (get_char_type( CT_CTYPE1, wch ) & (C1_PUNCT | C1_SPACE)))
> -                    continue;
> -
> -                if (flags & NORM_IGNORECASE) wch = casemap( nls_info.LowerCaseTable, wch );
> -
> -                ce = collation_table[collation_table[collation_table[wch >> 8] + ((wch >> 4) & 0x0f)] + (wch & 0xf)];
> -                if (ce != (unsigned int)-1)
> -                {
> -                    WCHAR key;
> -                    if ((key = ce >> 16))
> -                    {
> -                        *key_ptr[0]++ = key >> 8;
> -                        *key_ptr[0]++ = key & 0xff;
> -                    }
> -                    /* make key 1 start from 2 */
> -                    if ((key = (ce >> 8) & 0xff)) *key_ptr[1]++ = key + 1;
> -                    /* make key 2 start from 2 */
> -                    if ((key = (ce >> 4) & 0x0f)) *key_ptr[2]++ = key + 1;
> -                    /* key 3 is always a character code */
> -                    if (ce & 1)
> -                    {
> -                        if (wch >> 8) *key_ptr[3]++ = wch >> 8;
> -                        if (wch & 0xff) *key_ptr[3]++ = wch & 0xff;
> -                    }
> -                }
> -                else
> -                {
> -                    *key_ptr[0]++ = 0xff;
> -                    *key_ptr[0]++ = 0xfe;
> -                    if (wch >> 8) *key_ptr[0]++ = wch >> 8;
> -                    if (wch & 0xff) *key_ptr[0]++ = wch & 0xff;
> -                }
> -            }
> -        }
> -    }
> -
> -    *key_ptr[0] = 1;
> -    *key_ptr[1] = 1;
> -    *key_ptr[2] = 1;
> -    *key_ptr[3]++ = 1;
> -    *key_ptr[3] = 0;
> -    return key_ptr[3] - dst;
> -}
> -
> -
>   /* compose a full-width katakana. return consumed source characters. */
>   static int compose_katakana( const WCHAR *src, int srclen, WCHAR *dst )
>   {
> @@ -2574,6 +2453,358 @@ static int compare_weights(int flags, const WCHAR *str1, int len1,
>       return len1 - len2;
>   }
> 
> +/* Start sortkey handler code. */
> +
> +/* Defines */
> +
> +#define JAPANESE 3
> +#define MIN_WEIGHT 2
> +#define LIST_STACK_BUFFER 1000
> +
> +/* Internal structures */

Are these comments useful?

> +
> +typedef struct _character_info
> +{
> +    BYTE weight_primary;
> +    BYTE script_member;
> +    BYTE weight_diacritic;
> +    BYTE weight_case;
> +} character_info;
> +

I get the impression that typedefs have largely fallen out of favour.

> +typedef struct _weight_main_info
> +{
> +    BYTE script_member;
> +    BYTE weight_primary;
> +    BYTE extra;
> +} weight_main_info;
> +
> +typedef struct _list
> +{
> +    int extra_len;
> +    int len;
> +    BYTE buffer[LIST_STACK_BUFFER];
> +    int buffer_count;
> +    BYTE* extra;
> +    int element_size;
> +} list;
> +
> +typedef struct _sortkey_data
> +{
> +    int flags;
> +    list key;
> +    list weights_main;
> +    list weights_diacritic;
> +    list weights_case;
> +} sortkey_data;
> +
> +/* List functions */
> +
> +static void LIST_INIT(list* name, int type_size)
> +{
> +    name->extra_len = 0;
> +    name->len = 0;
> +    name->extra = 0;
> +    name->buffer_count = LIST_STACK_BUFFER / type_size;
> +    name->element_size = type_size;
> +}
> +
> +static void LIST_DESTROY(list* name)
> +{
> +    RtlFreeHeap(GetProcessHeap(), 0, name->extra);
> +}
> +
> +static void* LIST_GET(list* name, int index)
> +{
> +    if ((index + 1) * name->element_size <= LIST_STACK_BUFFER)
> +        return &name->buffer[index * name->element_size];
> +    else
> +        return &name->extra[index * name->element_size - name->buffer_count];
> +}
> +
> +/* Add entry to list, resizing as needed */
> +static void LIST_ADD(list* name, const void *value)
> +{
> +    void* entry;
> +    if ((name->len + 1) * name->element_size > name->extra_len + LIST_STACK_BUFFER)
> +    {
> +        if (!name->extra) /* First allocation */
> +        {
> +            name->extra_len = LIST_STACK_BUFFER;
> +            name->extra = RtlAllocateHeap(GetProcessHeap(), 0, name->extra_len);
> +        }
> +        else
> +        {
> +            name->extra_len *= 2;
> +            name->extra = RtlReAllocateHeap(GetProcessHeap(), 0,name->extra, name->extra_len);
> +        }
> +    }
> +    entry = LIST_GET(name, name->len);
> +    memcpy(entry, value, name->element_size);
> +    name->len++;
> +}
> +
> +/* Append a weight list to the sortkey */
> +#define APPEND_LIST_TO_SORTKEY(data, weights, type, statement_get_value, statement_is_ignored) \
> +    do {                                                                \
> +        int z;                                                          \
> +        int end = data->weights.len - 1;                                \
> +        while (end >= 0)                                                \
> +        {                                                               \
> +            const type* element = LIST_GET(&data->weights, end);        \
> +            (void)element;                                              \
> +            if (!(statement_is_ignored)) break;                         \
> +            end--;                                                      \
> +        }                                                               \
> +        for (z = 0; z <= end; z++)                                      \
> +        {                                                               \
> +            const type* element = LIST_GET(&data->weights, z);          \
> +            LIST_ADD(&data->key, statement_get_value);                  \
> +        }                                                               \
> +    }  while (0);
> +
> +/* Helper functions */
> +
> +static BOOL get_char(sortkey_data* data, character_info* info, WCHAR ch)
> +{
> +    DWORD value = sort.keys[ch];
> +
> +    info->weight_case = value >> 24;
> +    info->weight_diacritic = (value >> 16) & 0xff;
> +    info->script_member = (value >> 8) & 0xff;
> +    info->weight_primary = value & 0xff;
> +    return info->script_member != 0;
> +}
> +
> +static void sortkey_data_init(sortkey_data* data, int flags, const WCHAR* locale, BOOL is_compare_string)
> +{
> +    data->flags = flags;
> +    LIST_INIT(&data->key, sizeof(BYTE));
> +    LIST_INIT(&data->weights_main, sizeof(BYTE));
> +    LIST_INIT(&data->weights_diacritic, sizeof(BYTE));
> +    LIST_INIT(&data->weights_case, sizeof(BYTE));
> +}
> +
> +static void sortkey_data_destroy(sortkey_data* data)
> +{
> +    LIST_DESTROY(&data->key);
> +    LIST_DESTROY(&data->weights_main);
> +    LIST_DESTROY(&data->weights_diacritic);
> +    LIST_DESTROY(&data->weights_case);
> +}
> +
> +static weight_main_info create_weight_main(BYTE script_member, BYTE weight_primary)
> +{
> +    weight_main_info ret = { 0 };
> +    ret.script_member = script_member;
> +    ret.weight_primary = weight_primary;
> +    return ret;
> +}
> +
> +static void case_weights_add(sortkey_data* data, BYTE value)
> +{
> +    int flags = data->flags;
> +    if (NORM_IGNORECASE & flags)
> +        value = value & ~(16 + 8);
> +    if (NORM_IGNOREWIDTH & flags)
> +        value = value & ~(1);
> +    if (NORM_IGNOREKANATYPE & flags)
> +        value = value & ~(32);
> +
> +    LIST_ADD(&data->weights_case, &value);
> +}
> +
> +static void main_weights_add(sortkey_data *data, weight_main_info* value)
> +{
> +    LIST_ADD(&data->weights_main, &value->script_member);
> +    LIST_ADD(&data->weights_main, &value->weight_primary);
> +    if (value->extra > 0)
> +        LIST_ADD(&data->weights_main, &value->extra);
> +}
> +
> +static void diacritic_weights_add(sortkey_data* data, const character_info* info, BYTE value)
> +{
> +    LIST_ADD(&data->weights_diacritic, &value);
> +}
> +
> +/* Main sortkey logic */
> +
> +static void sortkey_handle_default_character(sortkey_data* data, WCHAR c)
> +{
> +    weight_main_info weightmain;
> +    character_info info;
> +
> +    if (!get_char(data, &info, c))
> +    {
> +        return;
> +    }
> +
> +    weightmain = create_weight_main(info.script_member, info.weight_primary);
> +    if (info.script_member >= 0xa9 && info.script_member <= 0xaf) /* Some CJK have extra value */
> +        weightmain.extra = info.weight_diacritic;
> +    else
> +        diacritic_weights_add(data, &info, info.weight_diacritic);
> +
> +    main_weights_add(data, &weightmain);
> +
> +    case_weights_add(data, info.weight_case);
> +}
> +
> +static BOOL sortkey_handle_character(sortkey_data* data, WCHAR c, const WCHAR* str, int i)
> +{
> +    weight_main_info weightmain;
> +    character_info info;
> +    int flags = data->flags;
> +
> +    if (!get_char(data, &info, c))
> +    {
> +        return FALSE;
> +    }
> +
> +    switch (info.script_member)
> +    {
> +    case 0: /* Not sorted */
> +        break;
> +
> +    case 1:
> +        if (data->weights_diacritic.len > 0)
> +        {
> +            BYTE* entry = LIST_GET(&data->weights_diacritic, data->weights_diacritic.len - 1);
> +            *entry += info.weight_diacritic; /* Overflow can happen, that's okay */
> +        }
> +        else
> +            diacritic_weights_add(data, &info, info.weight_diacritic);
> +        break;
> +
> +    case JAPANESE:
> +        /* TODO */
> +        break;
> +
> +    case 4: /* Jamo */
> +        weightmain = create_weight_main(info.weight_primary, info.weight_diacritic);
> +        main_weights_add(data, &weightmain);
> +
> +        diacritic_weights_add(data, &info, MIN_WEIGHT);
> +
> +        case_weights_add(data, info.weight_case);
> +        break;
> +
> +    case 5:
> +        weightmain = create_weight_main(253, 255);
> +        main_weights_add(data, &weightmain);
> +
> +        weightmain = create_weight_main(info.weight_primary, info.weight_diacritic);
> +        main_weights_add(data, &weightmain);
> +
> +        diacritic_weights_add(data, &info, MIN_WEIGHT);
> +
> +        case_weights_add(data, MIN_WEIGHT);
> +        break;
> +
> +    case 6: /* Punctuation */
> +        /* TODO */
> +        break;
> +
> +    case 7:  /* Symbols */
> +    case 8:  /* Symbols */
> +    case 9:  /* Symbols */
> +    case 10: /* Symbols */
> +    case 11: /* Symbols */
> +    case 12: /* Symbols */
> +        if (flags & NORM_IGNORESYMBOLS)
> +            break;
> +
> +        weightmain = create_weight_main(info.script_member, info.weight_primary);
> +        main_weights_add(data, &weightmain);
> +
> +        diacritic_weights_add(data, &info, info.weight_diacritic);
> +
> +        case_weights_add(data, info.weight_case);
> +        break;
> +
> +    default:
> +        sortkey_handle_default_character(data, c);
> +        break;

The fact that exactly one of these integer cases has a symbolic constant 
attached seems less than ideal.

> +    }
> +    return TRUE;
> +}
> +
> +static void sortkey_write_result(sortkey_data* data)
> +{
> +    int flags = data->flags;
> +
> +    const BYTE SORTKEY_SEPARATOR = 1;
> +    const BYTE SORTKEY_TERMINATOR = 0;
> +
> +    /* Main weights */
> +
> +    APPEND_LIST_TO_SORTKEY(data, weights_main, BYTE, element, FALSE);
> +
> +    LIST_ADD(&data->key, &SORTKEY_SEPARATOR);
> +
> +    /* Diacritic weights */
> +
> +    if ((flags & NORM_IGNORENONSPACE) == 0)
> +    {
> +        APPEND_LIST_TO_SORTKEY(data, weights_diacritic, BYTE, element, *element <= MIN_WEIGHT);
> +    }
> +
> +    LIST_ADD(&data->key, &SORTKEY_SEPARATOR);
> +
> +    /* Case weights */
> +    if ((NORM_IGNORECASE & flags) == 0 || (NORM_IGNOREWIDTH & flags) == 0)
> +    {
> +        APPEND_LIST_TO_SORTKEY(data, weights_case, BYTE, element, FALSE);
> +    }
> +
> +    LIST_ADD(&data->key,  &SORTKEY_SEPARATOR);
> +
> +    /* Extra weights */
> +    /* TODO */
> +
> +    LIST_ADD(&data->key, &SORTKEY_SEPARATOR);
> +
> +    /* Special weights */
> +    /* TODO */
> +
> +    LIST_ADD(&data->key, &SORTKEY_TERMINATOR);
> +}
> +
> +static int sortkey_generate(int flags, const WCHAR* locale, const WCHAR* str, int str_len, BYTE* buffer, int buffer_len)
> +{
> +    int i;
> +    sortkey_data data;
> +    int ret = 0;
> +
> +    sortkey_data_init(&data, flags, locale, FALSE);
> +
> +    if (str_len == -1)
> +        str_len = wcslen(str);
> +
> +    for (i = 0; i < str_len; i++)
> +    {
> +        sortkey_handle_character(&data, str[i], str, i);
> +    }
> +
> +    sortkey_write_result(&data);
> +
> +    if (data.key.len <= buffer_len)
> +    {
> +        for (i = 0; i < data.key.len; i++)
> +        {
> +            BYTE* value = LIST_GET(&data.key, i);
> +            buffer[i] = *value;
> +        }
> +        ret = data.key.len;
> +    }
> +    else if (!buffer)
> +    {
> +        ret = data.key.len;
> +    }
> +    sortkey_data_destroy(&data);
> +    return ret;
> +}
> +
> +/* End sortkey handler code */
> 
>   static const struct geoinfo *get_geoinfo_ptr( GEOID geoid )
>   {
> @@ -4964,8 +5195,8 @@ INT WINAPI DECLSPEC_HOTPATCH LCMapStringEx( const WCHAR *locale, DWORD flags, co
>           TRACE( "(%s,0x%08x,%s,%d,%p,%d)\n",
>                  debugstr_w(locale), flags, debugstr_wn(src, srclen), srclen, dst, dstlen );
> 
> -        if ((ret = get_sortkey( flags, src, srclen, (char *)dst, dstlen ))) ret++;
> -        else SetLastError( ERROR_INSUFFICIENT_BUFFER );
> +        if (!(ret = sortkey_generate(flags, L"", src, srclen, (BYTE *)dst, dstlen )))
> +            SetLastError( ERROR_INSUFFICIENT_BUFFER );
>           return ret;
>       }
> 
> --
> 2.26.2
> 
>