[PATCH v3 2/3] tools/make_unicode: Implement canonical composition for use in normalization.
Sergio Gómez Del Real
sdelreal at codeweavers.com
Tue Mar 27 08:21:58 CDT 2018
Signed-off-by: Sergio Gómez Del Real <sdelreal at codeweavers.com>
---
tools/make_unicode | 302 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 301 insertions(+), 1 deletion(-)
diff --git a/tools/make_unicode b/tools/make_unicode
index 65ae7ab2a0..1ad090219b 100755
--- a/tools/make_unicode
+++ b/tools/make_unicode
@@ -359,6 +359,8 @@ my @joining_table = ();
my @direction_table = ();
my @decomp_table = ();
my @compose_table = ();
+my @comb_class_table = ();
+my @full_comp_table = ();
my $default_char;
my $default_wchar;
@@ -469,6 +471,11 @@ sub READ_DEFAULTS($)
}
}
+        # Remember the combining class of every character whose class is non-zero.
+        # The value is stored as read: the original passed it through hex(), which
+        # turns a decimal field such as "230" into 0x230.
+        # NOTE(review): assumes $comb is the decimal Canonical_Combining_Class field
+        # of UnicodeData.txt - confirm against the split earlier in this loop.
+        if ($comb != 0)
+        {
+            $comb_class_table[$src] = $comb;
+        }
+
next if $decomp eq ""; # no decomposition, skip it
# store decomposition table
@@ -561,6 +568,25 @@ sub READ_DEFAULTS($)
my $flag = $ctype{$cat};
foreach my $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
}
+
+    # Flag every code point listed with the Full_Composition_Exclusion property in
+    # DerivedNormalizationProps.txt.  A matching line starts with either a single
+    # code point ("XXXX") or an inclusive range ("XXXX..YYYY"), both in hex.
+    my $UNICODE_DERIVED = open_data_file( $UNIDATA, "DerivedNormalizationProps.txt" );
+    while (<$UNICODE_DERIVED>)
+    {
+        next unless (/^([0-9a-fA-F.]+)\s+;\s+Full_Composition_Exclusion/);
+        my ($first, $last) = split /\.\./,$1;
+        $first = hex $first;
+        # a lone code point has no ".." part: treat it as a one-element range
+        $last = defined $last ? hex $last : $first;
+        # Walk the range numerically.  The original looped on "$last gt $first",
+        # a string comparison that goes wrong as soon as the two bounds have a
+        # different number of decimal digits.
+        foreach my $cp ($first .. $last)
+        {
+            $full_comp_table[$cp] = 1;
+        }
+    }
+    close $UNICODE_DERIVED;
}
@@ -2249,6 +2275,8 @@ sub dump_compose_table($)
}
print OUTPUT "\n};\n\n";
print OUTPUT <<"EOF";
+#include "decompose.c"
+
static inline int binary_search( WCHAR ch, int low, int high )
{
while (low <= high)
@@ -2272,6 +2300,59 @@ WCHAR DECLSPEC_HIDDEN wine_compose( const WCHAR *str )
count = table[2 * pos + 3];
}
}
+
+/* Tell whether the character at ptr2 is blocked from the character at ptr1.
+ *
+ * ptr2 is blocked when some character strictly between ptr1 and ptr2 has a
+ * combining class of 0, or a combining class that is not lower than the
+ * combining class of *ptr2.
+ *
+ * Returns 0 when not blocked, 1 when blocked, and -1 when ptr1 does not
+ * precede ptr2 (a non-zero value, so a plain truth test reads it as blocked).
+ */
+static inline int is_blocked(WCHAR *ptr1, WCHAR *ptr2)
+{
+    const WCHAR *class_last, *class_mid;
+
+    if (ptr1 >= ptr2) return -1;
+
+    /* the class of *ptr2 never changes during the scan, so look it up once */
+    class_last = unicode_table_lookup( *ptr2, 0, idx1_comb, 8, idx2_comb, 4,
+                                       offsets_comb, 4, data_comb, 0 );
+    while (++ptr1 < ptr2)
+    {
+        class_mid = unicode_table_lookup( *ptr1, 0, idx1_comb, 8, idx2_comb, 4,
+                                          offsets_comb, 4, data_comb, 0 );
+        if (*class_mid == 0 || *class_last <= *class_mid) return 1;
+    }
+    return 0;
+}
+
+/* Return the full-composition-exclusion table entry for ch as an int:
+ * non-zero when ch is listed in the table, 0 otherwise. */
+static inline int is_fullexcl(WCHAR ch)
+{
+    return (int)*unicode_table_lookup( ch, 0, idx1_fullcomp, 8, idx2_fullcomp, 4,
+                                       offsets_fullcomp, 4, data_fullcomp, 0 );
+}
+
+/* Canonically compose the characters of str in place.
+ *
+ * str    - buffer to compose; presumably already decomposed and canonically
+ *          ordered by the caller (not checked here)
+ * strlen - number of WCHARs in str, or 0 to measure it with strlenW( str )
+ *
+ * Returns the number of WCHARs remaining in str.  The buffer is not
+ * re-terminated after characters are removed, so callers must rely on the
+ * returned length.
+ */
+UINT unicode_canonical_composition( WCHAR *str, UINT strlen )
+{
+    UINT i, j;
+    WCHAR dum[3] = {0};
+
+    if (strlen == 0) strlen = strlenW( str );
+
+    /* Stop at i < strlen: str[strlen] lies outside the data, and after a
+     * composition it still holds a stale copy of the old last character. */
+    for (i = 1; i < strlen; i++)
+    {
+        WCHAR *scratch, comp;
+
+        if (str[i] == 0) break;
+
+        /* Find the last starter that precedes str[i].  The scan begins at
+         * str[i-1] so that a starter is never matched against itself, which
+         * lets a starter compose with the starter right before it. */
+        for (scratch = str + i - 1; scratch > str; --scratch)
+        {
+            if (is_starter( *scratch )) break;
+        }
+        if (!is_starter( *scratch ) || is_blocked( scratch, str + i )) continue;
+
+        dum[0] = *scratch;
+        dum[1] = str[i];
+        comp = wine_compose( dum );
+        if (comp == 0 || is_fullexcl( comp )) continue;
+
+        *scratch = comp;
+        /* close the gap left by the character that was just consumed */
+        for (j = i; j < strlen - 1; j++) str[j] = str[j + 1];
+        strlen--;
+        i--;  /* look again at whatever moved into slot i */
+    }
+
+    return strlen;
+}
EOF
close OUTPUT;
save_file($filename);
@@ -2339,13 +2420,21 @@ sub dump_decompose_table($)
my %nfd_lookup = ();
my %nfkd_lookup = ();
my %decomp_lookup = ();
+ my %comb_lookup = ();
+ my %fullcomp_lookup = ();
my @decomp_data = (0);
+ my @comb_data = (0);
+ my @full_comp_data = (0);
my $pos = 1;
+ my $pos_comb = 1;
+ my $pos_fullcomp = 1;
my $lastchar_decomp;
+ my $lastchar_comb;
+ my $lastchar_fullcomp;
for (my $i = 0; $i < $utflim; $i++)
{
- next unless defined $decomp_table[$i];
+ next unless defined $decomp_table[$i] || defined $comb_class_table[$i] || defined $full_comp_table[$i];
if (defined $decomp_table[$i])
{
@@ -2400,6 +2489,20 @@ sub dump_decompose_table($)
$pos += @nfkd;
}
}
+ if (defined $comb_class_table[$i])
+ {
+ push @comb_data, $comb_class_table[$i];
+ $lastchar_comb = $i;
+ $comb_lookup{$i} = $pos_comb;
+ $pos_comb++;
+ }
+ if (defined $full_comp_table[$i])
+ {
+ push @full_comp_data, $full_comp_table[$i];
+ $lastchar_fullcomp = $i;
+ $fullcomp_lookup{$i} = $pos_fullcomp;
+ $pos_fullcomp++;
+ }
}
printf OUTPUT "static const UINT last_decomposable = 0x%x;\n\n", $lastchar_decomp;
@@ -2491,6 +2594,154 @@ sub dump_decompose_table($)
}
print OUTPUT "\n};\n\n";
+ # now for the canonical combining class tables
+
+ printf OUTPUT "static const WCHAR data_comb[%d] =\n", $pos_comb;
+ print OUTPUT "{\n";
+ printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @comb_data );
+ print OUTPUT "\n};\n\n";
+
+ my $comb_pos = 1;
+ my $comb_lim = ($lastchar_comb >> 8) + 1;
+ my @comb_filled = (0) x $comb_lim;
+ for (my $i = 0; $i < $utflim; $i++)
+ {
+ last if $i > $lastchar_comb;
+ next unless defined $comb_class_table[$i];
+ $comb_filled[$i >> 8] = $comb_pos++;
+ $i |= 255;
+ }
+ printf OUTPUT "static const BYTE idx1_comb[%d] =\n", $comb_lim;
+ print OUTPUT "{\n";
+ printf OUTPUT "%s", DUMP_ARRAY( "0x%02x", 0, @comb_filled );
+ print OUTPUT "\n};\n\n";
+
+ my $sub_comb_filled_pos = 1;
+ my %sub_comb_filled = ();
+ for (my $i = 0; $i < $comb_lim; $i++)
+ {
+ next unless $comb_filled[$i];
+ for (my $j = 0; $j < 256; $j++)
+ {
+ my $idx = ($i << 8) | $j;
+ next unless defined $comb_class_table[$idx];
+ $sub_comb_filled{$idx >> 4} = $sub_comb_filled_pos++;
+ $j |= 15;
+ }
+ }
+
+ printf OUTPUT "static const USHORT idx2_comb[%d] =\n", $comb_pos * 16;
+ print OUTPUT "{\n";
+ @null_idx = (0) x 16;
+ print OUTPUT " /* all-zero 256-char blocks get mapped to here */\n";
+ printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @null_idx );
+ for (my $i = 0; $i < $comb_lim; $i++)
+ {
+ next unless $comb_filled[$i];
+ printf OUTPUT ",\n /* sub-index 0x%02x */\n", $comb_filled[$i];
+
+ my @sub_idx;
+ for (my $j = 0; $j < 16; $j++)
+ {
+ my $idx = ($i << 4) | $j;
+ $sub_idx[$j] = $sub_comb_filled{$idx} || 0;
+ }
+ printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_idx );
+ }
+ print OUTPUT "\n};\n\n";
+
+ printf OUTPUT "static const USHORT offsets_comb[%d] =\n", 16 * $sub_comb_filled_pos;
+ print OUTPUT "{\n";
+ @null_table = (0) x 16;
+ print OUTPUT " /* all-zero 16-char blocks get mapped to here */\n";
+ printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @null_table );
+ for my $key (sort {$a <=> $b} keys %sub_comb_filled)
+ {
+ printf OUTPUT ",\n /* 0x%03x0 .. 0x%03xf */\n", $key, $key;
+ my @sub_table;
+ for (my $j = 0; $j < 16; $j++)
+ {
+ my $idx = ($key << 4) | $j;
+ $sub_table[$j] = $comb_lookup{$idx} || 0;
+ }
+ printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_table );
+ }
+ print OUTPUT "\n};\n\n";
+
+ # now for Full Composition Exclusion
+
+ printf OUTPUT "const WCHAR data_fullcomp[%d] =\n", $pos_fullcomp;
+ print OUTPUT "{\n";
+ printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @full_comp_data );
+ print OUTPUT "\n};\n\n";
+
+ my $fullcomp_pos = 1;
+ my $fullcomp_lim = ($lastchar_fullcomp >> 8) + 1;
+ my @fullcomp_filled = (0) x $fullcomp_lim;
+ for (my $i = 0; $i < $utflim; $i++)
+ {
+ last if $i > $lastchar_fullcomp;
+ next unless defined $full_comp_table[$i];
+ $fullcomp_filled[$i >> 8] = $fullcomp_pos++;
+ $i |= 255;
+ }
+ printf OUTPUT "const BYTE idx1_fullcomp[%d] =\n", $fullcomp_lim;
+ print OUTPUT "{\n";
+ printf OUTPUT "%s", DUMP_ARRAY( "0x%02x", 0, @fullcomp_filled );
+ print OUTPUT "\n};\n\n";
+
+ my $sub_fullcomp_filled_pos = 1;
+ my %sub_fullcomp_filled = ();
+ for (my $i = 0; $i < $fullcomp_lim; $i++)
+ {
+ next unless $fullcomp_filled[$i];
+ for (my $j = 0; $j < 256; $j++)
+ {
+ my $idx = ($i << 8) | $j;
+ next unless defined $full_comp_table[$idx];
+ $sub_fullcomp_filled{$idx >> 4} = $sub_fullcomp_filled_pos++;
+ $j |= 15;
+ }
+ }
+
+ printf OUTPUT "const USHORT idx2_fullcomp[%d] =\n", $fullcomp_pos * 16;
+ print OUTPUT "{\n";
+ @null_idx = (0) x 16;
+ print OUTPUT " /* all-zero 256-char blocks get mapped to here */\n";
+ printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @null_idx );
+ for (my $i = 0; $i < $fullcomp_lim; $i++)
+ {
+ next unless $fullcomp_filled[$i];
+ printf OUTPUT ",\n /* sub-index 0x%02x */\n", $fullcomp_filled[$i];
+
+ my @sub_idx;
+ for (my $j = 0; $j < 16; $j++)
+ {
+ my $idx = ($i << 4) | $j;
+ $sub_idx[$j] = $sub_fullcomp_filled{$idx} || 0;
+ }
+ printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_idx );
+ }
+ print OUTPUT "\n};\n\n";
+
+ printf OUTPUT "const USHORT offsets_fullcomp[%d] =\n", 16 * $sub_fullcomp_filled_pos;
+ print OUTPUT "{\n";
+ @null_table = (0) x 16;
+ print OUTPUT " /* all-zero 16-char blocks get mapped to here */\n";
+ printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @null_table );
+ for my $key (sort {$a <=> $b} keys %sub_fullcomp_filled)
+ {
+ printf OUTPUT ",\n /* 0x%03x0 .. 0x%03xf */\n", $key, $key;
+ my @sub_table;
+ for (my $j = 0; $j < 16; $j++)
+ {
+ my $idx = ($key << 4) | $j;
+ $sub_table[$j] = $fullcomp_lookup{$idx} || 0;
+ }
+ printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_table );
+ }
+ print OUTPUT "\n};\n\n";
+
print OUTPUT <<"EOF";
static const WCHAR *unicode_table_lookup( UINT cp, int compat, const BYTE *idx1, UINT scale_idx1,
const USHORT *idx2, UINT scale_idx2, const USHORT *offsets,
@@ -2533,6 +2784,20 @@ static inline int decompose_hangul( WCHAR ch, WCHAR dum[4], int dstlen )
return 0;
}
+/* Return 1 when the combining class stored for ch2 is lower than the one
+ * stored for ch1, 0 otherwise.  A NUL in either position always yields 0. */
+static inline int reorderable_pair( WCHAR ch1, WCHAR ch2 )
+{
+    const WCHAR *first, *second;
+
+    if (!ch1 || !ch2) return 0;
+
+    first = unicode_table_lookup( ch1, 0, idx1_comb, 8, idx2_comb, 4,
+                                  offsets_comb, 4, data_comb, 0 );
+    second = unicode_table_lookup( ch2, 0, idx1_comb, 8, idx2_comb, 4,
+                                   offsets_comb, 4, data_comb, 0 );
+    return (*first > *second) ? 1 : 0;
+}
+
static inline UINT utf16_codepoint_to_surrogates( UINT cp )
{
UINT ch = cp;
@@ -2666,6 +2931,41 @@ unsigned int wine_unicode_decompose_string( int compat, const WCHAR *src,
return dstpos;
}
+
+/* Return 1 when the combining class table holds 0 for ch, 0 otherwise. */
+int is_starter( WCHAR ch )
+{
+    return !*unicode_table_lookup( ch, 0, idx1_comb, 8, idx2_comb, 4,
+                                   offsets_comb, 4, data_comb, 0 );
+}
+
+/* Put the combining marks of str into canonical order, in place.
+ *
+ * str    - buffer to reorder
+ * strlen - number of WCHARs in str; nothing is done when it is below 2
+ *
+ * Each non-starter is moved left past every preceding character that
+ * reorderable_pair() reports as having a higher combining class.  This is a
+ * stable insertion sort: marks of equal class keep their relative order, and
+ * a mark never moves across a starter because no class is lower than 0.
+ *
+ * The earlier implementation never sorted the marks that follow the last
+ * starter, miscounted segment lengths after the first segment, and used an
+ * exchange sort that could reorder marks of equal class.
+ */
+void unicode_canon_order( WCHAR *str, int strlen )
+{
+    int i, j;
+
+    for (i = 1; i < strlen; i++)
+    {
+        WCHAR ch = str[i];
+
+        if (is_starter( ch )) continue;  /* starters stay where they are */
+
+        /* shift higher-class predecessors one slot right, then drop ch in */
+        for (j = i; j > 0 && reorderable_pair( str[j - 1], ch ); j--)
+            str[j] = str[j - 1];
+        str[j] = ch;
+    }
+}
EOF
close OUTPUT;
save_file($filename);
--
2.14.1
More information about the wine-devel
mailing list