[PATCH v3 2/3] tools/make_unicode: Implement canonical composition for use in normalization.

Sergio Gómez Del Real sdelreal at codeweavers.com
Tue Mar 27 08:21:58 CDT 2018


Signed-off-by: Sergio Gómez Del Real <sdelreal at codeweavers.com>
---
 tools/make_unicode | 302 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 301 insertions(+), 1 deletion(-)

diff --git a/tools/make_unicode b/tools/make_unicode
index 65ae7ab2a0..1ad090219b 100755
--- a/tools/make_unicode
+++ b/tools/make_unicode
@@ -359,6 +359,8 @@ my @joining_table = ();
 my @direction_table = ();
 my @decomp_table = ();
 my @compose_table = ();
+my @comb_class_table = ();
+my @full_comp_table = ();
 my $default_char;
 my $default_wchar;
 
@@ -469,6 +471,11 @@ sub READ_DEFAULTS($)
             }
         }
 
+        if ($comb != 0)  # only non-zero combining classes are recorded
+        {
+            $comb_class_table[$src] = $comb + 0;  # Canonical_Combining_Class is a decimal field, so no hex()
+        }
+
         next if $decomp eq "";  # no decomposition, skip it
 
         # store decomposition table
@@ -561,6 +568,25 @@ sub READ_DEFAULTS($)
         my $flag = $ctype{$cat};
         foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
     }
+
+    # Mark every code point that carries the Full_Composition_Exclusion
+    # property; entries are either a single code point or a FIRST..LAST range.
+    my $UNICODE_DERIVED = open_data_file( $UNIDATA, "DerivedNormalizationProps.txt" );
+    while (<$UNICODE_DERIVED>)
+    {
+        next unless (/^([0-9a-fA-F.]+)\s+;\s+Full_Composition_Exclusion/);
+        my ($first, $last) = split /\.\./,$1;
+        $first = hex $first;
+        # A single code point is treated as a range of one.  The range is walked
+        # numerically: a string comparison (gt) would stop early whenever the
+        # two bounds have a different number of decimal digits (e.g. 999..1000).
+        $last = defined $last ? hex $last : $first;
+        for my $cp ($first .. $last)
+        {
+            $full_comp_table[$cp] = 1;
+        }
+    }
+    close $UNICODE_DERIVED;
 }
 
 
@@ -2249,6 +2275,8 @@ sub dump_compose_table($)
     }
     print OUTPUT "\n};\n\n";
     print OUTPUT <<"EOF";
+#include "decompose.c"
+
 static inline int binary_search( WCHAR ch, int low, int high )
 {
     while (low <= high)
@@ -2272,6 +2300,59 @@ WCHAR DECLSPEC_HIDDEN wine_compose( const WCHAR *str )
         count = table[2 * pos + 3];
     }
 }
+
+/* Return non-zero when the character at ptr2 is blocked from the one at ptr1. */
+static inline int is_blocked(const WCHAR *ptr1, const WCHAR *ptr2)
+{
+    const WCHAR *map1, *map2;
+    if (ptr1 >= ptr2) return -1;  /* no valid span; non-zero, so it reads as blocked */
+    map2 = unicode_table_lookup( *ptr2, 0, idx1_comb, 8, idx2_comb, 4,
+                                 offsets_comb, 4, data_comb, 0 );  /* loop-invariant: looked up once */
+    while (++ptr1 < ptr2)
+    {
+        map1 = unicode_table_lookup( *ptr1, 0, idx1_comb, 8, idx2_comb, 4,
+                                     offsets_comb, 4, data_comb, 0 );
+        if (*map1 == 0 || *map2 <= *map1) return 1;  /* class 0, or class not below that of *ptr2 */
+    }
+    return 0;
+}
+
+/* Return the data_fullcomp entry for ch: non-zero when ch is a full composition exclusion. */
+static inline int is_fullexcl(WCHAR ch)
+{
+    const WCHAR *entry = unicode_table_lookup( ch, 0, idx1_fullcomp, 8, idx2_fullcomp, 4,
+                                               offsets_fullcomp, 4, data_fullcomp, 0 );
+    return *entry;
+}
+
+/* Canonically compose str in place; strlen == 0 means NUL-terminated. Returns the new length. */
+UINT unicode_canonical_composition( WCHAR *str, UINT strlen )
+{
+    UINT i, j;
+    WCHAR dum[3] = {0};
+
+    if (strlen == 0) strlen = strlenW( str );
+
+    /* stop before strlen: str[strlen] is out of bounds or a stale copy left by an earlier shift */
+    for (i = 1; i < strlen; i++)
+    {
+        WCHAR *scratch, comp;
+        if (str[i] == 0) break;
+        /* find the last starter before str[i]; beginning at i-1 lets starter+starter pairs compose */
+        for (scratch = str+i-1; scratch > str; --scratch)
+            if (is_starter( *scratch )) break;
+        if (!is_starter( *scratch ) || is_blocked( scratch, str+i )) continue;
+        dum[0] = *scratch;
+        dum[1] = str[i];
+        comp = wine_compose( dum );
+        if (comp == 0 || is_fullexcl( comp )) continue;
+        *scratch = comp;
+        for (j = i; j < strlen-1; j++) str[j] = str[j+1];  /* close the gap left by str[i] */
+        strlen--;
+        i--;  /* re-examine the character that moved into slot i */
+    }
+    return strlen;
+}
 EOF
     close OUTPUT;
     save_file($filename);
@@ -2339,13 +2420,21 @@ sub dump_decompose_table($)
     my %nfd_lookup = ();
     my %nfkd_lookup = ();
     my %decomp_lookup = ();
+    my %comb_lookup = ();
+    my %fullcomp_lookup = ();
     my @decomp_data = (0);
+    my @comb_data = (0);
+    my @full_comp_data = (0);
     my $pos = 1;
+    my $pos_comb = 1;
+    my $pos_fullcomp = 1;
     my $lastchar_decomp;
+    my $lastchar_comb;
+    my $lastchar_fullcomp;
 
     for (my $i = 0; $i < $utflim; $i++)
     {
-        next unless defined $decomp_table[$i];
+        next unless defined $decomp_table[$i] || defined $comb_class_table[$i] || defined $full_comp_table[$i];
 
         if (defined $decomp_table[$i])
         {
@@ -2400,6 +2489,20 @@ sub dump_decompose_table($)
                 $pos += @nfkd;
             }
         }
+        if (defined $comb_class_table[$i])  # code point has a recorded combining class
+        {
+            push @comb_data, $comb_class_table[$i];
+            $lastchar_comb = $i;            # ends up as the highest code point with a class
+            $comb_lookup{$i} = $pos_comb;   # position of this entry within @comb_data
+            $pos_comb++;
+        }
+        if (defined $full_comp_table[$i])   # code point is flagged in @full_comp_table
+        {
+            push @full_comp_data, $full_comp_table[$i];
+            $lastchar_fullcomp = $i;        # ends up as the highest flagged code point
+            $fullcomp_lookup{$i} = $pos_fullcomp;  # position of this entry within @full_comp_data
+            $pos_fullcomp++;
+        }
     }
 
     printf OUTPUT "static const UINT last_decomposable = 0x%x;\n\n", $lastchar_decomp;
@@ -2491,6 +2594,154 @@ sub dump_decompose_table($)
     }
     print OUTPUT "\n};\n\n";
 
+    # now for the Canonical Combining Class tables (data_comb, idx1_comb, idx2_comb, offsets_comb)
+
+    printf OUTPUT "static const WCHAR data_comb[%d] =\n", $pos_comb;
+    print OUTPUT "{\n";
+    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @comb_data );
+    print OUTPUT "\n};\n\n";
+
+    my $comb_pos = 1;  # next free 256-char block number; 0 is kept for empty blocks
+    my $comb_lim = ($lastchar_comb >> 8) + 1;  # number of 256-char blocks up to the last entry
+    my @comb_filled = (0) x $comb_lim;
+    for (my $i = 0; $i < $utflim; $i++)
+    {
+        last if $i > $lastchar_comb;
+        next unless defined $comb_class_table[$i];
+        $comb_filled[$i >> 8] = $comb_pos++;
+        $i |= 255;  # jump to the end of this 256-char block so it is numbered only once
+    }
+    printf OUTPUT "static const BYTE idx1_comb[%d] =\n", $comb_lim;
+    print OUTPUT "{\n";
+    printf OUTPUT "%s", DUMP_ARRAY( "0x%02x", 0, @comb_filled );
+    print OUTPUT "\n};\n\n";
+
+    my $sub_comb_filled_pos = 1;  # next free 16-char block number; 0 is kept for empty blocks
+    my %sub_comb_filled = ();     # maps (code point >> 4) to its 16-char block number
+    for (my $i = 0; $i < $comb_lim; $i++)
+    {
+        next unless $comb_filled[$i];
+        for (my $j = 0; $j < 256; $j++)
+        {
+            my $idx = ($i << 8) | $j;
+            next unless defined $comb_class_table[$idx];
+            $sub_comb_filled{$idx >> 4} = $sub_comb_filled_pos++;
+            $j |= 15;  # jump to the end of this 16-char block so it is numbered only once
+        }
+    }
+
+    printf OUTPUT "static const USHORT idx2_comb[%d] =\n", $comb_pos * 16;
+    print OUTPUT "{\n";
+    @null_idx = (0) x 16;
+    print OUTPUT "    /* all-zero 256-char blocks get mapped to here */\n";
+    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @null_idx );
+    for (my $i = 0; $i < $comb_lim; $i++)
+    {
+        next unless $comb_filled[$i];
+        printf OUTPUT ",\n    /* sub-index 0x%02x */\n", $comb_filled[$i];
+
+        my @sub_idx;  # the 16 sixteen-char block numbers that make up 256-char block $i
+        for (my $j = 0; $j < 16; $j++)
+        {
+            my $idx = ($i << 4) | $j;
+            $sub_idx[$j] = $sub_comb_filled{$idx} || 0;
+        }
+        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_idx );
+    }
+    print OUTPUT "\n};\n\n";
+
+    printf OUTPUT "static const USHORT offsets_comb[%d] =\n", 16 * $sub_comb_filled_pos;
+    print OUTPUT "{\n";
+    @null_table = (0) x 16;
+    print OUTPUT "    /* all-zero 16-char blocks get mapped to here */\n";
+    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @null_table );
+    for my $key (sort {$a <=> $b} keys %sub_comb_filled)
+    {
+        printf OUTPUT ",\n    /* 0x%03x0 .. 0x%03xf */\n", $key, $key;
+        my @sub_table;  # positions in @comb_data for the 16 code points of this block, 0 if none
+        for (my $j = 0; $j < 16; $j++)
+        {
+            my $idx = ($key << 4) | $j;
+            $sub_table[$j] = $comb_lookup{$idx} || 0;
+        }
+        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_table );
+    }
+    print OUTPUT "\n};\n\n";
+
+    # now for Full Composition Exclusion (data_fullcomp, idx1_fullcomp, idx2_fullcomp, offsets_fullcomp)
+    # NOTE(review): these four tables are emitted without "static", unlike the _comb tables - confirm intended
+    printf OUTPUT "const WCHAR data_fullcomp[%d] =\n", $pos_fullcomp;
+    print OUTPUT "{\n";
+    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @full_comp_data );
+    print OUTPUT "\n};\n\n";
+
+    my $fullcomp_pos = 1;  # next free 256-char block number; 0 is kept for empty blocks
+    my $fullcomp_lim = ($lastchar_fullcomp >> 8) + 1;  # number of 256-char blocks up to the last entry
+    my @fullcomp_filled = (0) x $fullcomp_lim;
+    for (my $i = 0; $i < $utflim; $i++)
+    {
+        last if $i > $lastchar_fullcomp;
+        next unless defined $full_comp_table[$i];
+        $fullcomp_filled[$i >> 8] = $fullcomp_pos++;
+        $i |= 255;  # jump to the end of this 256-char block so it is numbered only once
+    }
+    printf OUTPUT "const BYTE idx1_fullcomp[%d] =\n", $fullcomp_lim;
+    print OUTPUT "{\n";
+    printf OUTPUT "%s", DUMP_ARRAY( "0x%02x", 0, @fullcomp_filled );
+    print OUTPUT "\n};\n\n";
+
+    my $sub_fullcomp_filled_pos = 1;  # next free 16-char block number; 0 is kept for empty blocks
+    my %sub_fullcomp_filled = ();     # maps (code point >> 4) to its 16-char block number
+    for (my $i = 0; $i < $fullcomp_lim; $i++)
+    {
+        next unless $fullcomp_filled[$i];
+        for (my $j = 0; $j < 256; $j++)
+        {
+            my $idx = ($i << 8) | $j;
+            next unless defined $full_comp_table[$idx];
+            $sub_fullcomp_filled{$idx >> 4} = $sub_fullcomp_filled_pos++;
+            $j |= 15;  # jump to the end of this 16-char block so it is numbered only once
+        }
+    }
+
+    printf OUTPUT "const USHORT idx2_fullcomp[%d] =\n", $fullcomp_pos * 16;
+    print OUTPUT "{\n";
+    @null_idx = (0) x 16;
+    print OUTPUT "    /* all-zero 256-char blocks get mapped to here */\n";
+    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @null_idx );
+    for (my $i = 0; $i < $fullcomp_lim; $i++)
+    {
+        next unless $fullcomp_filled[$i];
+        printf OUTPUT ",\n    /* sub-index 0x%02x */\n", $fullcomp_filled[$i];
+
+        my @sub_idx;  # the 16 sixteen-char block numbers that make up 256-char block $i
+        for (my $j = 0; $j < 16; $j++)
+        {
+            my $idx = ($i << 4) | $j;
+            $sub_idx[$j] = $sub_fullcomp_filled{$idx} || 0;
+        }
+        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_idx );
+    }
+    print OUTPUT "\n};\n\n";
+
+    printf OUTPUT "const USHORT offsets_fullcomp[%d] =\n", 16 * $sub_fullcomp_filled_pos;
+    print OUTPUT "{\n";
+    @null_table = (0) x 16;
+    print OUTPUT "    /* all-zero 16-char blocks get mapped to here */\n";
+    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @null_table );
+    for my $key (sort {$a <=> $b} keys %sub_fullcomp_filled)
+    {
+        printf OUTPUT ",\n    /* 0x%03x0 .. 0x%03xf */\n", $key, $key;
+        my @sub_table;  # positions in @full_comp_data for the 16 code points of this block, 0 if none
+        for (my $j = 0; $j < 16; $j++)
+        {
+            my $idx = ($key << 4) | $j;
+            $sub_table[$j] = $fullcomp_lookup{$idx} || 0;
+        }
+        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_table );
+    }
+    print OUTPUT "\n};\n\n";
+
     print OUTPUT <<"EOF";
 static const WCHAR *unicode_table_lookup( UINT cp, int compat, const BYTE *idx1, UINT scale_idx1,
                                           const USHORT *idx2, UINT scale_idx2, const USHORT *offsets,
@@ -2533,6 +2784,20 @@ static inline int decompose_hangul( WCHAR ch, WCHAR dum[4], int dstlen )
     return 0;
 }
 
+/* Return 1 if <ch1, ch2> is a reorderable pair, i.e. ccc(ch1) > ccc(ch2) > 0, else 0. */
+static inline int reorderable_pair( WCHAR ch1, WCHAR ch2 )
+{
+    const WCHAR *cc1, *cc2;
+
+    if (ch1 == 0 || ch2 == 0) return 0;
+    cc1 = unicode_table_lookup( ch1, 0, idx1_comb, 8, idx2_comb, 4,
+                                offsets_comb, 4, data_comb, 0 );
+    cc2 = unicode_table_lookup( ch2, 0, idx1_comb, 8, idx2_comb, 4,
+                                offsets_comb, 4, data_comb, 0 );
+    /* a character of class 0 must never be moved in front of a preceding mark */
+    return *cc2 != 0 && *cc2 < *cc1;
+}
+
 static inline UINT utf16_codepoint_to_surrogates( UINT cp )
 {
     UINT ch = cp;
@@ -2666,6 +2931,41 @@ unsigned int wine_unicode_decompose_string( int compat, const WCHAR *src,
 
     return dstpos;
 }
+
+int is_starter( WCHAR ch )  /* 1 when the combining class looked up for ch is 0, otherwise 0 */
+{
+    const WCHAR *ccc = unicode_table_lookup( ch, 0, idx1_comb, 8, idx2_comb, 4,
+                                             offsets_comb, 4, data_comb, 0 );
+    return !*ccc;
+}
+
+/* Sort each run of combining marks in str[0..strlen) into canonical order.
+ * Characters with combining class 0 (starters) never move. */
+void unicode_canon_order( WCHAR *str, int strlen )
+{
+    int i, j;
+
+    /* Insertion sort built from adjacent swaps only. That keeps marks of equal
+     * class in their original relative order, which the canonical ordering
+     * algorithm requires, and it also covers the final run of marks that is
+     * not followed by another starter. */
+    for (i = 1; i < strlen; i++)
+    {
+        if (is_starter( str[i] )) continue;
+
+        /* sink str[i] towards the front of its run */
+        for (j = i; j > 0; j--)
+        {
+            WCHAR swp;
+
+            /* stop at the starter that heads this run, or once the order is right */
+            if (is_starter( str[j-1] ) || !reorderable_pair( str[j-1], str[j] )) break;
+            swp = str[j-1];
+            str[j-1] = str[j];
+            str[j] = swp;
+        }
+    }
+}
 EOF
     close OUTPUT;
     save_file($filename);
-- 
2.14.1




More information about the wine-devel mailing list