1 | # uniParse.tcl -- |
---|
2 | # |
---|
3 | # This program parses the UnicodeData file and generates the |
---|
4 | # corresponding tclUniData.c file with compressed character |
---|
5 | # data tables. The input to this program should be the latest |
---|
6 | # UnicodeData file from: |
---|
7 | # ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData-Latest.txt |
---|
8 | # |
---|
9 | # Copyright (c) 1998-1999 by Scriptics Corporation. |
---|
10 | # All rights reserved. |
---|
11 | # |
---|
12 | # RCS: @(#) $Id: uniParse.tcl,v 1.4 2001/05/28 04:37:57 hobbs Exp $ |
---|
13 | |
---|
14 | |
---|
15 | namespace eval uni { |
---|
16 | set shift 5; # number of bits of data within a page |
---|
17 | # This value can be adjusted to find the |
---|
18 | # best split to minimize table size |
---|
19 | |
---|
20 | variable pMap; # map from page to page index, each entry is |
---|
21 | # an index into the pages table, indexed by |
---|
22 | # page number |
---|
23 | variable pages; # map from page index to page info, each |
---|
24 | # entry is a list of indices into the groups |
---|
25 | # table, the list is indexed by the offset |
---|
26 | variable groups; # list of character info values, indexed by |
---|
27 | # group number, initialized with the |
---|
28 | # unassigned character group |
---|
29 | |
---|
30 | variable categories { |
---|
31 | Cn Lu Ll Lt Lm Lo Mn Me Mc Nd Nl No Zs Zl Zp |
---|
32 | Cc Cf Co Cs Pc Pd Ps Pe Pi Pf Po Sm Sc Sk So |
---|
33 | }; # Ordered list of character categories, must |
---|
34 | # match the enumeration in the header file. |
---|
35 | |
---|
36 | variable titleCount 0; # Count of the number of title case |
---|
37 | # characters. This value is used in the |
---|
38 | # regular expression code to allocate enough |
---|
39 | # space for the title case variants. |
---|
40 | } |
---|
41 | |
---|
42 | proc uni::getValue {items index} { |
---|
43 | variable categories |
---|
44 | variable titleCount |
---|
45 | |
---|
46 | # Extract character info |
---|
47 | |
---|
48 | set category [lindex $items 2] |
---|
49 | if {[scan [lindex $items 12] %4x toupper] == 1} { |
---|
50 | set toupper [expr {$index - $toupper}] |
---|
51 | } else { |
---|
52 | set toupper {} |
---|
53 | } |
---|
54 | if {[scan [lindex $items 13] %4x tolower] == 1} { |
---|
55 | set tolower [expr {$tolower - $index}] |
---|
56 | } else { |
---|
57 | set tolower {} |
---|
58 | } |
---|
59 | if {[scan [lindex $items 14] %4x totitle] == 1} { |
---|
60 | set totitle [expr {$index - $totitle}] |
---|
61 | } else { |
---|
62 | set totitle {} |
---|
63 | } |
---|
64 | |
---|
65 | set categoryIndex [lsearch -exact $categories $category] |
---|
66 | if {$categoryIndex < 0} { |
---|
67 | puts "Unexpected character category: $index($category)" |
---|
68 | set categoryIndex 0 |
---|
69 | } elseif {$category == "Lt"} { |
---|
70 | incr titleCount |
---|
71 | } |
---|
72 | |
---|
73 | return "$categoryIndex,$toupper,$tolower,$totitle" |
---|
74 | } |
---|
75 | |
---|
76 | proc uni::getGroup {value} { |
---|
77 | variable groups |
---|
78 | |
---|
79 | set gIndex [lsearch -exact $groups $value] |
---|
80 | if {$gIndex == -1} { |
---|
81 | set gIndex [llength $groups] |
---|
82 | lappend groups $value |
---|
83 | } |
---|
84 | return $gIndex |
---|
85 | } |
---|
86 | |
---|
87 | proc uni::addPage {info} { |
---|
88 | variable pMap |
---|
89 | variable pages |
---|
90 | |
---|
91 | set pIndex [lsearch -exact $pages $info] |
---|
92 | if {$pIndex == -1} { |
---|
93 | set pIndex [llength $pages] |
---|
94 | lappend pages $info |
---|
95 | } |
---|
96 | lappend pMap $pIndex |
---|
97 | return |
---|
98 | } |
---|
99 | |
---|
100 | proc uni::buildTables {data} { |
---|
101 | variable shift |
---|
102 | |
---|
103 | variable pMap {} |
---|
104 | variable pages {} |
---|
105 | variable groups {{0,,,}} |
---|
106 | set info {} ;# temporary page info |
---|
107 | |
---|
108 | set mask [expr {(1 << $shift) - 1}] |
---|
109 | |
---|
110 | set next 0 |
---|
111 | |
---|
112 | foreach line [split $data \n] { |
---|
113 | if {$line == ""} { |
---|
114 | set line "FFFF;;Cn;0;ON;;;;;N;;;;;\n" |
---|
115 | } |
---|
116 | |
---|
117 | set items [split $line \;] |
---|
118 | |
---|
119 | scan [lindex $items 0] %4x index |
---|
120 | set index [format 0x%0.4x $index] |
---|
121 | |
---|
122 | set gIndex [getGroup [getValue $items $index]] |
---|
123 | |
---|
124 | # Since the input table omits unassigned characters, these will |
---|
125 | # show up as gaps in the index sequence. There are a few special cases |
---|
126 | # where the gaps correspond to a uniform block of assigned characters. |
---|
127 | # These are indicated as such in the character name. |
---|
128 | |
---|
129 | # Enter all unassigned characters up to the current character. |
---|
130 | if {($index > $next) \ |
---|
131 | && ![regexp "Last>$" [lindex $items 1]]} { |
---|
132 | for {} {$next < $index} {incr next} { |
---|
133 | lappend info 0 |
---|
134 | if {($next & $mask) == $mask} { |
---|
135 | addPage $info |
---|
136 | set info {} |
---|
137 | } |
---|
138 | } |
---|
139 | } |
---|
140 | |
---|
141 | # Enter all assigned characters up to the current character |
---|
142 | for {set i $next} {$i <= $index} {incr i} { |
---|
143 | # Split character index into offset and page number |
---|
144 | set offset [expr {$i & $mask}] |
---|
145 | set page [expr {($i >> $shift)}] |
---|
146 | |
---|
147 | # Add the group index to the info for the current page |
---|
148 | lappend info $gIndex |
---|
149 | |
---|
150 | # If this is the last entry in the page, add the page |
---|
151 | if {$offset == $mask} { |
---|
152 | addPage $info |
---|
153 | set info {} |
---|
154 | } |
---|
155 | } |
---|
156 | set next [expr {$index + 1}] |
---|
157 | } |
---|
158 | return |
---|
159 | } |
---|
160 | |
---|
161 | proc uni::main {} { |
---|
162 | global argc argv0 argv |
---|
163 | variable pMap |
---|
164 | variable pages |
---|
165 | variable groups |
---|
166 | variable shift |
---|
167 | variable titleCount |
---|
168 | |
---|
169 | if {$argc != 2} { |
---|
170 | puts stderr "\nusage: $argv0 <datafile> <outdir>\n" |
---|
171 | exit 1 |
---|
172 | } |
---|
173 | set f [open [lindex $argv 0] r] |
---|
174 | set data [read $f] |
---|
175 | close $f |
---|
176 | |
---|
177 | buildTables $data |
---|
178 | puts "X = [llength $pMap] Y= [llength $pages] A= [llength $groups]" |
---|
179 | set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}] |
---|
180 | puts "shift = 6, space = $size" |
---|
181 | puts "title case count = $titleCount" |
---|
182 | |
---|
183 | set f [open [file join [lindex $argv 1] tclUniData.c] w] |
---|
184 | fconfigure $f -translation lf |
---|
185 | puts $f "/* |
---|
186 | * tclUniData.c -- |
---|
187 | * |
---|
188 | * Declarations of Unicode character information tables. This file is |
---|
189 | * automatically generated by the tools/uniParse.tcl script. Do not |
---|
190 | * modify this file by hand. |
---|
191 | * |
---|
192 | * Copyright (c) 1998 by Scriptics Corporation. |
---|
193 | * All rights reserved. |
---|
194 | * |
---|
195 | * RCS: @(#) \$Id\$ |
---|
196 | */ |
---|
197 | |
---|
198 | /* |
---|
199 | * A 16-bit Unicode character is split into two parts in order to index |
---|
200 | * into the following tables. The lower OFFSET_BITS comprise an offset |
---|
201 | * into a page of characters. The upper bits comprise the page number. |
---|
202 | */ |
---|
203 | |
---|
204 | #define OFFSET_BITS $shift |
---|
205 | |
---|
206 | /* |
---|
207 | * The pageMap is indexed by page number and returns an alternate page number |
---|
208 | * that identifies a unique page of characters. Many Unicode characters map |
---|
209 | * to the same alternate page number. |
---|
210 | */ |
---|
211 | |
---|
212 | static unsigned char pageMap\[\] = {" |
---|
213 | set line " " |
---|
214 | set last [expr {[llength $pMap] - 1}] |
---|
215 | for {set i 0} {$i <= $last} {incr i} { |
---|
216 | append line [lindex $pMap $i] |
---|
217 | if {$i != $last} { |
---|
218 | append line ", " |
---|
219 | } |
---|
220 | if {[string length $line] > 70} { |
---|
221 | puts $f $line |
---|
222 | set line " " |
---|
223 | } |
---|
224 | } |
---|
225 | puts $f $line |
---|
226 | puts $f "}; |
---|
227 | |
---|
228 | /* |
---|
229 | * The groupMap is indexed by combining the alternate page number with |
---|
230 | * the page offset and returns a group number that identifies a unique |
---|
231 | * set of character attributes. |
---|
232 | */ |
---|
233 | |
---|
234 | static unsigned char groupMap\[\] = {" |
---|
235 | set line " " |
---|
236 | set lasti [expr {[llength $pages] - 1}] |
---|
237 | for {set i 0} {$i <= $lasti} {incr i} { |
---|
238 | set page [lindex $pages $i] |
---|
239 | set lastj [expr {[llength $page] - 1}] |
---|
240 | for {set j 0} {$j <= $lastj} {incr j} { |
---|
241 | append line [lindex $page $j] |
---|
242 | if {$j != $lastj || $i != $lasti} { |
---|
243 | append line ", " |
---|
244 | } |
---|
245 | if {[string length $line] > 70} { |
---|
246 | puts $f $line |
---|
247 | set line " " |
---|
248 | } |
---|
249 | } |
---|
250 | } |
---|
251 | puts $f $line |
---|
252 | puts $f "}; |
---|
253 | |
---|
254 | /* |
---|
255 | * Each group represents a unique set of character attributes. The attributes |
---|
256 | * are encoded into a 32-bit value as follows: |
---|
257 | * |
---|
258 | * Bits 0-4 Character category: see the constants listed below. |
---|
259 | * |
---|
260 | * Bits 5-7 Case delta type: 000 = identity |
---|
261 | * 010 = add delta for lower |
---|
262 | * 011 = add delta for lower, add 1 for title |
---|
263 | * 100 = sutract delta for title/upper |
---|
264 | * 101 = sub delta for upper, sub 1 for title |
---|
265 | * 110 = sub delta for upper, add delta for lower |
---|
266 | * |
---|
267 | * Bits 8-21 Reserved for future use. |
---|
268 | * |
---|
269 | * Bits 22-31 Case delta: delta for case conversions. This should be the |
---|
270 | * highest field so we can easily sign extend. |
---|
271 | */ |
---|
272 | |
---|
273 | static int groups\[\] = {" |
---|
274 | set line " " |
---|
275 | set last [expr {[llength $groups] - 1}] |
---|
276 | for {set i 0} {$i <= $last} {incr i} { |
---|
277 | foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {} |
---|
278 | |
---|
279 | # Compute the case conversion type and delta |
---|
280 | |
---|
281 | if {$totitle != ""} { |
---|
282 | if {$totitle == $toupper} { |
---|
283 | # subtract delta for title or upper |
---|
284 | set case 4 |
---|
285 | set delta $toupper |
---|
286 | } elseif {$toupper != ""} { |
---|
287 | # subtract delta for upper, subtract 1 for title |
---|
288 | set case 5 |
---|
289 | set delta $toupper |
---|
290 | } else { |
---|
291 | # add delta for lower, add 1 for title |
---|
292 | set case 3 |
---|
293 | set delta $tolower |
---|
294 | } |
---|
295 | } elseif {$toupper != ""} { |
---|
296 | # subtract delta for upper, add delta for lower |
---|
297 | set case 6 |
---|
298 | set delta $toupper |
---|
299 | } elseif {$tolower != ""} { |
---|
300 | # add delta for lower |
---|
301 | set case 2 |
---|
302 | set delta $tolower |
---|
303 | } else { |
---|
304 | # noop |
---|
305 | set case 0 |
---|
306 | set delta 0 |
---|
307 | } |
---|
308 | |
---|
309 | set val [expr {($delta << 22) | ($case << 5) | $type}] |
---|
310 | |
---|
311 | append line [format "%d" $val] |
---|
312 | if {$i != $last} { |
---|
313 | append line ", " |
---|
314 | } |
---|
315 | if {[string length $line] > 65} { |
---|
316 | puts $f $line |
---|
317 | set line " " |
---|
318 | } |
---|
319 | } |
---|
320 | puts $f $line |
---|
321 | puts $f "}; |
---|
322 | |
---|
323 | /* |
---|
324 | * The following constants are used to determine the category of a |
---|
325 | * Unicode character. |
---|
326 | */ |
---|
327 | |
---|
328 | #define UNICODE_CATEGORY_MASK 0X1F |
---|
329 | |
---|
330 | enum { |
---|
331 | UNASSIGNED, |
---|
332 | UPPERCASE_LETTER, |
---|
333 | LOWERCASE_LETTER, |
---|
334 | TITLECASE_LETTER, |
---|
335 | MODIFIER_LETTER, |
---|
336 | OTHER_LETTER, |
---|
337 | NON_SPACING_MARK, |
---|
338 | ENCLOSING_MARK, |
---|
339 | COMBINING_SPACING_MARK, |
---|
340 | DECIMAL_DIGIT_NUMBER, |
---|
341 | LETTER_NUMBER, |
---|
342 | OTHER_NUMBER, |
---|
343 | SPACE_SEPARATOR, |
---|
344 | LINE_SEPARATOR, |
---|
345 | PARAGRAPH_SEPARATOR, |
---|
346 | CONTROL, |
---|
347 | FORMAT, |
---|
348 | PRIVATE_USE, |
---|
349 | SURROGATE, |
---|
350 | CONNECTOR_PUNCTUATION, |
---|
351 | DASH_PUNCTUATION, |
---|
352 | OPEN_PUNCTUATION, |
---|
353 | CLOSE_PUNCTUATION, |
---|
354 | INITIAL_QUOTE_PUNCTUATION, |
---|
355 | FINAL_QUOTE_PUNCTUATION, |
---|
356 | OTHER_PUNCTUATION, |
---|
357 | MATH_SYMBOL, |
---|
358 | CURRENCY_SYMBOL, |
---|
359 | MODIFIER_SYMBOL, |
---|
360 | OTHER_SYMBOL |
---|
361 | }; |
---|
362 | |
---|
363 | /* |
---|
364 | * The following macros extract the fields of the character info. The |
---|
365 | * GetDelta() macro is complicated because we can't rely on the C compiler |
---|
366 | * to do sign extension on right shifts. |
---|
367 | */ |
---|
368 | |
---|
369 | #define GetCaseType(info) (((info) & 0xE0) >> 5) |
---|
370 | #define GetCategory(info) ((info) & 0x1F) |
---|
371 | #define GetDelta(info) (((info) > 0) ? ((info) >> 22) : (~(~((info)) >> 22))) |
---|
372 | |
---|
373 | /* |
---|
374 | * This macro extracts the information about a character from the |
---|
375 | * Unicode character tables. |
---|
376 | */ |
---|
377 | |
---|
378 | #define GetUniCharInfo(ch) (groups\[groupMap\[(pageMap\[(((int)(ch)) & 0xffff) >> OFFSET_BITS\] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))\]\]) |
---|
379 | " |
---|
380 | |
---|
381 | close $f |
---|
382 | } |
---|
383 | |
---|
384 | uni::main |
---|
385 | |
---|
386 | return |
---|