1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
| with interfaces;
with ada.strings;
with ada.strings.fixed;
with ada.streams.stream_io;
with ada.characters.latin_1;
with aada_character_categories;
with aada_compiler_ucs4_string_normalized_uppercase;
-- we need many operators so use is required
use interfaces;
use ada.streams;
package body aada_compiler_character_package is
procedure init_character(c: out compiler_character; status: character_status) is
begin
c.status := status;
c.category := unknown;
c.ucs4 := 16#FEFF#; -- endian mark
c.utf8_length := 0;
c.utf8 := (others => 16#20#);
end init_character;
-- 's' must be long enough to accept the transliteration, if not
-- an overflow error will be raised; to support all languages,
-- it should be 5 x the size of 'u'
--
-- note that proper transformation from lower to uppercase
-- requires the entire entity (identifier) instead of just one
-- character at a time
--
-- Unicode algorithms used to compute the uppercase letters are:
-- 1. Normalization expension,
-- 2. Uppercasing,
-- 3. Normalized compression.
--
-- When the input string is only composed of ISO-8859-1 characters
-- then we do the uppercasing right here. If any other character
-- appears in the string, then the full Unicode transformation is used.
-- Note that since 99.9% of the identifiers are using ASCII (especially
-- all keywords are ASCII only,) this will take care of most of the
-- strings.
--
-- Unicode string normalization (Chapter 5.6 as of 6.0.0)
-- http://www.unicode.org/reports/tr15/
--
-- Uppercasing is defined in Chapter 5.8 as:
-- http://www.unicode.org/versions/Unicode6.0.0/ch05.pdf#G21180
-- Q(X) = NFC(toUppercase(NFD(X)))
--
-- So we use the normalization to expand X, compute uppercase,
-- then apply NFC which shrink X as much as possible.
--
-- See also node #44 in regard to case mapping, the Simple case mapping
-- can be used when no special case mapping exists for the transformation
-- http://www.unicode.org/reports/tr44/#Casemapping
--
procedure get_uppercase_string(u: in compiler_character_string;
s: out compiler_character_string) is
-- modular definitions used for the and operation
type ucs4_modular is mod 16#7FFFFFFF#;
-- ISO-10646:2003 does not define all the pages, but we want to support
-- them all in the compiler; we accept unknown character as is
-- the page are the higher 23 bits (bit 31 unused)
type character_page is range 0 .. 16#7FFFFF#;
page: character_page;
i: positive := u'first;
j: positive := s'first;
begin
-- reset the output string with nil characters
-- this would just be a waste of time; we just terminate the string that way
-- (see end of function)
whole_string:
while i <= u'last
loop
-- determine the page number
page := character_page(shift_right(unsigned_32(u(i).ucs4), 8));
-- special case since 99% of the program content will be in 7bits ASCII
if page /= 0 then
-- the string includes characters that are outside the
-- ISO-8859-1 page, so we have to call the UCS-4 uppercase
-- function to make sure the transliteration is complete
-- as expected
declare
temp: ucs4_string(1 .. u'length);
vtemp: ucs4_vstring;
begin
for k in positive range u'range
loop
temp(ucs4_string_length(k - u'first + 1)) := u(k).ucs4;
end loop;
vstr(temp, vtemp);
-- normalized uppercase transliterations
aada_compiler_ucs4_string_normalized_uppercase
.ucs4_string_to_normalized_uppercase(vtemp);
j := s'first;
for k in ucs4_string_length range 1 .. length(vtemp)
loop
init_character(s(j), valid);
s(j).ucs4 := get(vtemp, k);
j := j + 1;
end loop;
end;
exit whole_string;
end if;
-- ISO 8859-1 (Latin 1) uppercase transliterations done manually
-- (this is a lot faster than calling the full UCS-4 uppercase
-- implementation and useful 99.9% of the time)
case u(i).ucs4 is
when 16#61# .. 16#7A# -- 'a' .. 'z'
| 16#E0# .. 16#F6# -- accentuated A, E, I, and O; AE ligature;
-- C cedilla; EDH; N tilde
| 16#F8# .. 16#FE# -- accentuated U, Y; ETH
=> init_character(s(j), valid);
s(j).ucs4 := ucs4_character(
ucs4_modular(u(i).ucs4) and 16#DF#);
j := j + 1;
when 16#B5# -- Micro Sign
=> init_character(s(j), valid);
s(j).ucs4 := 16#3BC#;
j := j + 1;
when 16#DF# -- German Sharp S
=> init_character(s(j), valid);
init_character(s(j + 1), valid);
s(j).ucs4 := 16#53#;
s(j + 1).ucs4 := 16#53#;
j := j + 2;
when 16#FF# -- Y with dialesis
=> init_character(s(j), valid);
s(j).ucs4 := 16#178#;
j := j + 1;
when 16#AD# -- format (small dash), ignore
=> null;
when others -- other characters do not require any transformation
=> s(j) := u(i);
j := j + 1;
end case;
i := i + 1;
end loop whole_string;
-- end the string with a NIL character
if j <= s'last then
init_character(s(j), nil);
end if;
end get_uppercase_string;
procedure utf8_buffer_put_byte(u: in out utf8_buffer; b: in utf8_byte) is
begin
if u.index = utf8_index'last then
raise utf8_buffer_full_error;
end if;
u.index := utf8_index'succ(u.index);
u.buffer(u.index) := b;
end utf8_buffer_put_byte;
function utf8_buffer_is_empty(u: in utf8_buffer) return boolean is
begin
return u.index = utf8_index'first;
end utf8_buffer_is_empty;
function utf8_buffer_is_full(u: in utf8_buffer) return boolean is
begin
return u.index = utf8_index'last;
end utf8_buffer_is_full;
procedure set_utf8_string(c: out compiler_character; u: in out utf8_buffer) is
-- declare the necessary constant to check the largest character
-- based on the length of the UTF-8 character for proper normalization
-- note that the last value of ucs4_character is expected to be 16#7FFFFFFD#
type char_size is array(1 .. 6) of ucs4_character;
max_for_size: constant char_size := (16#7F#, 16#7FF#, 16#FFFF#,
16#1FFFFF#, 16#3FFFFFF#, 16#7FFFFFFD#);
-- local variable
b: utf8_byte; -- first byte of the UTF-8 sequence
r: unsigned_32; -- should be ucs4_character, but it may be 0 at some point
n: utf8_string_length; -- number of bytes in UTF-8 sequence
s: character_status; -- valid or unnormalized
ib: constant := utf8_string_buffer'first; -- the first index in u
begin
if utf8_buffer_is_empty(u) then
-- an empty UTF-8 string is equivalent to a null character
init_character(c, nil);
return;
end if;
b := u.buffer(ib);
if b < 16#80# then
-- a "stand-alone" character (more or less, a 7 bits ASCII character)
init_character(c, valid);
r := unsigned_32(b);
n := 1;
elsif b < 16#C0# then
-- invalid size? (codes 16#80# to 16#BF# are not valid UTF-8 introducers)
init_character(c, error);
raise byte_sequence_error;
elsif b < 16#E0# then -- two bytes character?
r := unsigned_32(b) and 16#1F#;
n := 2;
elsif b < 16#F0# then -- three bytes character?
r := unsigned_32(b) and 16#0F#;
n := 3;
elsif b < 16#F8# then -- four bytes character?
r := unsigned_32(b) and 16#07#;
n := 4;
elsif b < 16#FC# then -- five bytes character?
r := unsigned_32(b) and 16#03#;
n := 5;
elsif b <= 16#FD# then -- six bytes character? (1 bit + 5 x 6 bits = 31 bits)
r := unsigned_32(b) and 16#01#;
n := 6;
else
-- unless the definition of the UTF-8 byte is incorrect this cannot occur
raise program_error;
end if;
-- we expect up to 6 bytes, but the input could be much smaller
if n > u.index then
init_character(c, error);
raise sequence_too_short_error;
end if;
-- verify that all the following bytes are valid
-- and as we're at it, generate the final UCS-4 value
if n > 1 then
-- (ib + 1) because we already read (ib)
for i in utf8_index_buffer_range range ib + 1 .. ib + n - 1
loop
if u.buffer(i) not in 16#80# .. 16#BF# then
init_character(c, error);
raise byte_sequence_error;
end if;
r := shift_left(r, 6) or (unsigned_32(u.buffer(i)) and 16#3F#);
end loop;
-- verify that the least significant 16 bits are valid
-- (note no need to check the 7 bits characters here)
if r = 0 or else (r and 16#FFFF#) in 16#FFFE# .. 16#FFFF# then
init_character(c, illegal);
raise illegal_character_error;
end if;
end if;
-- check whether the character is normalized
-- when it is 1 byte it is always valid
-- max size is: shift_right(16#7FFFFFFF#, (6 - n) * 5)
-- except when n = 1 in which case it's 16#7F#
s := valid;
if n > 1 then
if ucs4_character(r) <= max_for_size(integer(n - 1)) then
s := unnormalized;
end if;
end if;
-- set the character
init_character(c, s);
c.ucs4 := ucs4_character(r);
c.utf8_length := n;
c.utf8(1 .. n) := u.buffer(ib .. ib + n - 1); -- copy the slice we just eat
-- (note that this copy is not always normalized)
-- we just eat n bytes
u.index := u.index - n;
-- eat the slice we just used, it's saved in the character now
if u.index > 0 then
u.buffer(ib .. ib + u.index - 1) := u.buffer(ib + n .. ib + n + u.index - 1);
end if;
-- note: the category is left unknown, if queried and the character
-- is valid, then we compute it;
-- similarly, we do not define the upper case letter just yet
end set_utf8_string;
procedure set_ucs4_character(c: out compiler_character; u: in ucs4_character) is
begin
-- this one is very easy, but we still check that it is valid
if (unsigned_32(u) and 16#FFFF#) in 16#FFFE# .. 16#FFFF# then
init_character(c, illegal);
raise illegal_character_error;
end if;
-- the UTF-8 string will be computed only if necessary and so is the category
init_character(c, valid);
c.ucs4 := u;
end set_ucs4_character;
procedure get_special_character(c: out compiler_character;
status: in special_character_status) is
begin
init_character(c, status);
end get_special_character;
pragma inline(get_status);
function get_status(c: in compiler_character) return character_status is
begin
return c.status;
end get_status;
pragma inline(is_ascii);
function is_ascii(c: in compiler_character) return boolean is
begin
if c.status not in valid .. unnormalized then
raise illegal_character_error;
end if;
return c.ucs4 < 16#7F#;
end is_ascii;
procedure get_category(c: in out compiler_character;
category: out character_category) is
begin
if c.status not in valid .. unnormalized then
raise illegal_character_error;
end if;
-- if still unknown we want to determine the category now
if c.category = unknown then
-- get the actual character category
c.category := aada_character_categories.character_to_category(c.ucs4);
end if;
category := c.category;
end get_category;
-- 'output' should be at least get_utf8_length() characters starting at 1
-- the output is a normalized UTF-8 string; this is used to generate the
-- vstring as defined in get_utf8_string() and get_uppercase()
procedure retrieve_utf8_string(s: in compiler_character_string;
output: out string) is
p: integer := 1;
u: utf8_string(utf8_index_buffer_range'range);
l: utf8_string_length;
begin
for i in s'first .. s'last
loop
exit when s(i).status = nil;
get_normalized_utf8_code(s(i), u, l);
for j in utf8_string_length range 1 .. l
loop
-- transform UTF-8 to regular Latin 1 characters for easy handling
output(p) := character'val(u(j));
p := p + 1;
end loop;
end loop;
end retrieve_utf8_string;
procedure get_utf8_string(s: in compiler_character_string;
u: in out aada_vstrings.vstring) is
l: utf8_string_length;
begin
l := get_utf8_length(s);
declare
output: string(1 .. integer(l));
begin
retrieve_utf8_string(s, output);
aada_vstrings.vstr(output, u);
end;
end get_utf8_string;
function get_utf8_length(s: in compiler_character_string)
return utf8_string_length is
length: utf8_string_length := 0;
begin
for i in s'first .. s'last
loop
-- upper case buffer ends with a NIL character
exit when s(i).status = nil;
length := length + get_normalized_utf8_code_length(s(i));
end loop;
return length;
end get_utf8_length;
procedure get_uppercase(s: in compiler_character_string;
u: in out aada_vstrings.vstring) is
-- any one character can be quintupled when transformed to upper case
-- i.e. German Sharp S (1 byte) becomes SS (2 bytes)
-- but some characters may use more space if they are followed by
-- modifiers (i.e. accents and other diacritics)
upper: compiler_character_string(1 .. s'length * 5 + 1);
l: utf8_string_length;
begin
-- the get_uppercase_string() ends the upper string with a NIL character
get_uppercase_string(s, upper);
l := get_utf8_length(upper);
declare
output: string(1 .. integer(l));
begin
retrieve_utf8_string(upper, output);
aada_vstrings.vstr(output, u);
end;
end get_uppercase;
pragma inline(eof);
function eof(c: in compiler_character) return boolean is
begin
return c.status = eof;
end eof;
pragma inline(is_valid);
function is_valid(c: in compiler_character) return boolean is
begin
return c.status not in invalid .. error;
end is_valid;
pragma inline(get_ucs4_code);
function get_ucs4_code(c: in compiler_character) return ucs4_character is
begin
if c.status not in valid .. unnormalized then
raise illegal_character_error;
end if;
return c.ucs4;
end get_ucs4_code;
procedure get_utf8_code(c: in out compiler_character;
u: out utf8_string;
l: out utf8_string_length) is
ib: utf8_string_length; -- first character index in u
begin
if c.status not in valid .. unnormalized then
raise illegal_character_error;
end if;
ib := u'first;
if c.utf8_length = 0 then
-- get the normalized code and bufferize it
get_normalized_utf8_code(c, u, l);
c.utf8_length := l;
for i in utf8_index_buffer_range range utf8_string_buffer'first .. l
loop
c.utf8(i) := u(ib + i - 1);
end loop;
else
l := c.utf8_length;
u(ib .. ib + l - 1) := utf8_string(c.utf8(1 .. l));
end if;
end get_utf8_code;
function get_normalized_utf8_code_length(c: in compiler_character)
return utf8_string_length is
begin
if c.status not in valid .. unnormalized then
raise illegal_character_error;
end if;
if c.ucs4 <= 16#7F# then
-- special case of the 7 bit characters
return 1;
elsif c.ucs4 <= 16#7FF# then
return 2;
elsif c.ucs4 <= 16#FFFF# then
return 3;
elsif c.ucs4 <= 16#1FFFFF# then
return 4;
elsif c.ucs4 <= 16#3FFFFFF# then
return 5;
else -- if c.ucs4 <= 16#7FFFFFFF# then
return 6;
end if;
end get_normalized_utf8_code_length;
procedure get_normalized_utf8_code(c: in compiler_character;
u: out utf8_string;
l: out utf8_string_length) is
r: unsigned_32;
begin
l := 0;
u := (others => 16#20#);
if c.status not in valid .. unnormalized then
raise illegal_character_error;
end if;
if c.ucs4 <= 16#7F# then
-- special case of the 7 bit characters
l := 1;
u(1) := utf8_byte(c.ucs4);
return;
elsif c.ucs4 <= 16#7FF# then
l := 2;
elsif c.ucs4 <= 16#FFFF# then
l := 3;
elsif c.ucs4 <= 16#1FFFFF# then
l := 4;
elsif c.ucs4 <= 16#3FFFFFF# then
l := 5;
else -- if c.ucs4 <= 16#7FFFFFFF# then
l := 6;
end if;
-- convert UCS-4 to UTF-8
r := unsigned_32(c.ucs4);
for i in reverse utf8_string_length range 2 .. l
loop
u(i) := utf8_byte((r and 16#3F#) or 16#80#);
r := shift_right(r, 6);
end loop;
u(1) := utf8_byte((r or shift_right(16#FF00#, integer(l))) and 16#FF#);
end get_normalized_utf8_code;
end aada_compiler_character_package;
-- vim: ts=2 sw=2 et syntax=ada |