AAda Compiler Character Package

27
Dec
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
-- This package handles the compiler's characters.
-- The compiler supports ISO-10646:2003.
-- The internal representation is UCS-4 (in the host's native byte order).
-- This library knows how to handle all characters for
-- the compiler (i.e. it knows their types).
with aada_compiler_ucs4_character;
with aada_character_categories;
with aada_vstrings;
 
use aada_compiler_ucs4_character;
use aada_character_categories;
 
package aada_compiler_character_package is
  -- Exceptions declared by this package.  The subprogram bodies are not
  -- part of this specification, so which operation raises which exception
  -- has to be confirmed against the package body.

  -- invalid string length versus user specified length
  user_length_error: exception;
  -- incompatible byte found in the UTF-8 sequence of bytes
  byte_sequence_error: exception;
  -- the byte sequence does not represent a full UTF-8 character definition
  sequence_too_short_error: exception;
  -- illegal characters generate an exception
  illegal_character_error: exception;
  -- the loaded file is not a valid character page file
  page_file_magic_error: exception;
  -- the loaded file is not supported
  page_file_not_supported_error: exception;
  -- the loaded file contains invalid data
  page_file_corrupted_error: exception;
  -- the UTF-8 buffer is full and put_byte did not save the character
  utf8_buffer_full_error: exception;
 
  -- the character is a private record (see the private part for its layout)
  type compiler_character is private;
  -- an unconstrained string of compiler characters, indexed from 1 upward
  type compiler_character_string is array(positive range <>)
                                              of compiler_character;
 
  -- a single UTF-8 byte; the declared range is 1 .. 16#FD# (1 to 253),
  -- so the values 0, 16#FE# and 16#FF# cannot be stored in this type
  type utf8_byte is range 1 .. 16#FD#;
  -- a UTF-8 character string range, any range should be a sub-range of this one
  type utf8_string_length is range 0 .. 16#7FFF_FFFF#;
  -- a UTF-8 character uses an array of utf8 bytes
  type utf8_string is array(utf8_string_length range <>) of utf8_byte;
  -- a UTF-8 buffer handled by this package; being limited private, clients
  -- can neither assign nor compare it and must go through the
  -- utf8_buffer_... subprograms declared below
  type utf8_buffer is limited private;
 
  -- the character status
  --  nil -- character was not set yet
  --  valid -- the UTF-8 input sequence is 100% valid
  --  unnormalized -- the UTF-8 is longer than necessary
  --  eol -- this character represents the end of a line
  --  eop -- this character represents the end of a page
  --  eof -- this character represents the end of a file
  --  invalid -- the UTF-8 string is not valid (invalid bits set)
  --  illegal -- this character is not legal (0, 16#FFFE#, 16#FFFF#)
  --  error -- NOTE(review): not described by the original comments;
  --           confirm its meaning against the package body
  type character_status is (nil, valid, unnormalized, eol, eop, eof,
                            invalid, illegal, error);
  -- the statuses that mark a special character: eol, eop and eof
  subtype special_character_status is character_status range eol .. eof;
 
  -- add a byte in a UTF-8 buffer
  -- (per the comment on utf8_buffer_full_error, that exception signals that
  -- the buffer was full and the byte was not saved)
  procedure utf8_buffer_put_byte(u: in out utf8_buffer; b: in utf8_byte);
 
  -- check whether the buffer is empty
  function utf8_buffer_is_empty(u: in utf8_buffer) return boolean;
 
  -- check whether the buffer is full
  function utf8_buffer_is_full(u: in utf8_buffer) return boolean;
 
  -- set a character to a UTF-8 string
  -- the bytes used from u are automatically moved to the beginning of the buffer
  -- NOTE(review): presumably the consumed bytes are dropped and any
  -- remaining bytes are shifted to the start of u -- confirm in the body
  procedure set_utf8_string(c: out compiler_character;
                            u: in out utf8_buffer);
 
  -- set a character to a UCS-4 character
  procedure set_ucs4_character(c: out compiler_character; u: in ucs4_character);
 
  -- create a special character; the subtype of status restricts the
  -- request to eol, eop or eof
  procedure get_special_character(c: out compiler_character;
                                  status: in special_character_status);
 
  -- get the status of this character
  function get_status(c: in compiler_character) return character_status;
 
  -- determine whether this character is just ASCII or International
  function is_ascii(c: in compiler_character) return boolean;
 
  -- get the category of this character
  -- NOTE(review): c is "in out"; presumably the category is computed on
  -- demand and stored back in c -- confirm in the body
  procedure get_category(c: in out compiler_character;
                         category: out character_category);
 
  -- get aada_vstrings (utf8 strings really) from compiler character strings
  -- get_utf8_length returns a length for s expressed as a utf8_string_length
  function get_utf8_length(s: in compiler_character_string)
                                                     return utf8_string_length;
  -- get_utf8_string delivers its result through the vstring u
  procedure get_utf8_string(s: in compiler_character_string;
                            u: in out aada_vstrings.vstring);
  -- NOTE(review): undocumented in the original; presumably delivers an
  -- upper-cased form of s through the vstring u -- confirm in the body
  procedure get_uppercase(s: in compiler_character_string;
                          u: in out aada_vstrings.vstring);
 
 
  -- check whether the character represents the end of a file
  function eof(c: in compiler_character) return boolean;
 
  -- check whether the character is valid (i.e. not nil, invalid or illegal)
  function is_valid(c: in compiler_character) return boolean;
 
  -- get the UCS-4 code of this character
  function get_ucs4_code(c: in compiler_character) return ucs4_character;
 
  -- get the UTF-8 code of this character; the string must be
  -- long enough for this character (i.e. 7 to support all characters)
  -- NOTE(review): the private utf8_string_buffer below holds at most
  -- 6 bytes (index range 1 .. 6); confirm whether 7 is really intended
  -- u receives the bytes and l a length (presumably the number of bytes
  -- written to u -- confirm in the body)
  procedure get_utf8_code(c: in out compiler_character;
                          u: out utf8_string; l: out utf8_string_length);
 
  -- the normalized version returns the UCS-4 converted to proper UTF-8
  function get_normalized_utf8_code_length(c: in compiler_character)
                                                     return utf8_string_length;
  procedure get_normalized_utf8_code(c: in compiler_character;
                                     u: out utf8_string;
                                     l: out utf8_string_length);
 
private
  -- a count of bytes held for one character: 0 .. 6
  subtype utf8_index is utf8_string_length
          range utf8_string_length'first .. utf8_string_length'first + 6;
  -- the valid positions inside a per-character buffer: 1 .. 6
  subtype utf8_index_buffer_range is utf8_index
          range utf8_index'first + 1 .. utf8_index'last;
  -- fixed-size storage for the bytes of one UTF-8 sequence (6 bytes)
  type utf8_string_buffer is array(utf8_index_buffer_range'range) of utf8_byte;
 
  -- full view of the UTF-8 buffer: index starts at utf8_index'first (0)
  -- and every byte of the storage defaults to 16#20#
  -- NOTE(review): index presumably counts the bytes currently stored --
  -- confirm in the body
  type utf8_buffer is
    record
      index: utf8_index := utf8_index'first;
      buffer: utf8_string_buffer := (others => 16#20#);
    end record;
 
  -- full view of a compiler character
  type compiler_character is
    record
      -- the character status
      status: character_status := nil;
 
      -- the character category as determined by the compiler
      -- (for a pure Ada 2005 compiler, it is based on ISO-10646:2003)
      category: character_category := aada_character_categories.unknown;
 
      -- the actual character in UCS-4 format
      ucs4: ucs4_character := 16#FEFF#;  -- endian mark
 
      -- the exact UTF-8 input string (allows unnormalized chars)
      -- note that utf8 has no default value here; only utf8_length
      -- (initially 0) says how many of its bytes are meaningful
      utf8_length: utf8_index := 0;  -- empty
      utf8: utf8_string_buffer;
    end record;
 
end aada_compiler_character_package;
-- vim: ts=2 sw=2 et syntax=ada
Project aada v1.0-338 (Project id #3)
Process Done (Last compiled on 2012/01/13 01:21:26)
Description Alexis Ada Compiler, written in Ada (my first attempt was in C++, which is not appropriate for an Ada compiler).