1 | /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 |
---|
2 | // utf8_codecvt_facet.cpp |
---|
3 | |
---|
4 | // Copyright © 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu) |
---|
5 | // Andrew Lumsdaine, Indiana University (lums@osl.iu.edu). |
---|
6 | // Use, modification and distribution is subject to the Boost Software |
---|
7 | // License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at |
---|
8 | // http://www.boost.org/LICENSE_1_0.txt) |
---|
9 | |
---|
10 | // Please see the comments in <boost/detail/utf8_codecvt_facet.hpp> to |
---|
11 | // learn how this file should be used. |
---|
12 | |
---|
13 | #include <boost/detail/utf8_codecvt_facet.hpp> |
---|
14 | |
---|
15 | #include <cstdlib> // for multi-byte conversion routines |
---|
16 | #include <cassert> |
---|
17 | |
---|
18 | #include <boost/limits.hpp> |
---|
19 | #include <boost/config.hpp> |
---|
20 | |
---|
21 | // If we don't have wstring, then Unicode support |
---|
22 | // is not available anyway, so we don't need to even |
---|
23 | // compile this file. This also fixes the problem |
---|
24 | // with mingw, which can compile this file, but will |
---|
25 | // generate link error when building DLL. |
---|
26 | #ifndef BOOST_NO_STD_WSTRING |
---|
27 | |
---|
28 | BOOST_UTF8_BEGIN_NAMESPACE |
---|
29 | |
---|
30 | /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8 |
---|
31 | // implementation for wchar_t |
---|
32 | |
---|
33 | // Translate incoming UTF-8 into UCS-4 |
---|
34 | std::codecvt_base::result utf8_codecvt_facet::do_in( |
---|
35 | std::mbstate_t& state, |
---|
36 | const char * from, |
---|
37 | const char * from_end, |
---|
38 | const char * & from_next, |
---|
39 | wchar_t * to, |
---|
40 | wchar_t * to_end, |
---|
41 | wchar_t * & to_next |
---|
42 | ) const { |
---|
43 | // Basic algorithm: The first octet determines how many |
---|
44 | // octets total make up the UCS-4 character. The remaining |
---|
45 | // "continuing octets" all begin with "10". To convert, subtract |
---|
46 | // the amount that specifies the number of octets from the first |
---|
47 | // octet. Subtract 0x80 (1000 0000) from each continuing octet, |
---|
48 | // then mash the whole lot together. Note that each continuing |
---|
49 | // octet only uses 6 bits as unique values, so only shift by |
---|
50 | // multiples of 6 to combine. |
---|
51 | while (from != from_end && to != to_end) { |
---|
52 | |
---|
53 | // Error checking on the first octet |
---|
54 | if (invalid_leading_octet(*from)){ |
---|
55 | from_next = from; |
---|
56 | to_next = to; |
---|
57 | return std::codecvt_base::error; |
---|
58 | } |
---|
59 | |
---|
60 | // The first octet is adjusted by a value dependent upon |
---|
61 | // the number of "continuing octets" encoding the character |
---|
62 | const int cont_octet_count = get_cont_octet_count(*from); |
---|
63 | const wchar_t octet1_modifier_table[] = { |
---|
64 | 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc |
---|
65 | }; |
---|
66 | |
---|
67 | // The unsigned char conversion is necessary in case char is |
---|
68 | // signed (I learned this the hard way) |
---|
69 | wchar_t ucs_result = |
---|
70 | (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count]; |
---|
71 | |
---|
72 | // Invariants : |
---|
73 | // 1) At the start of the loop, 'i' continuing characters have been |
---|
74 | // processed |
---|
75 | // 2) *from points to the next continuing character to be processed. |
---|
76 | int i = 0; |
---|
77 | while(i != cont_octet_count && from != from_end) { |
---|
78 | |
---|
79 | // Error checking on continuing characters |
---|
80 | if (invalid_continuing_octet(*from)) { |
---|
81 | from_next = from; |
---|
82 | to_next = to; |
---|
83 | return std::codecvt_base::error; |
---|
84 | } |
---|
85 | |
---|
86 | ucs_result *= (1 << 6); |
---|
87 | |
---|
88 | // each continuing character has an extra (10xxxxxx)b attached to |
---|
89 | // it that must be removed. |
---|
90 | ucs_result += (unsigned char)(*from++) - 0x80; |
---|
91 | ++i; |
---|
92 | } |
---|
93 | |
---|
94 | // If the buffer ends with an incomplete unicode character... |
---|
95 | if (from == from_end && i != cont_octet_count) { |
---|
96 | // rewind "from" to before the current character translation |
---|
97 | from_next = from - (i+1); |
---|
98 | to_next = to; |
---|
99 | return std::codecvt_base::partial; |
---|
100 | } |
---|
101 | *to++ = ucs_result; |
---|
102 | } |
---|
103 | from_next = from; |
---|
104 | to_next = to; |
---|
105 | |
---|
106 | // Were we done converting or did we run out of destination space? |
---|
107 | if(from == from_end) return std::codecvt_base::ok; |
---|
108 | else return std::codecvt_base::partial; |
---|
109 | } |
---|
110 | |
---|
111 | std::codecvt_base::result utf8_codecvt_facet::do_out( |
---|
112 | std::mbstate_t& state, |
---|
113 | const wchar_t * from, |
---|
114 | const wchar_t * from_end, |
---|
115 | const wchar_t * & from_next, |
---|
116 | char * to, |
---|
117 | char * to_end, |
---|
118 | char * & to_next |
---|
119 | ) const |
---|
120 | { |
---|
121 | // RG - consider merging this table with the other one |
---|
122 | const wchar_t octet1_modifier_table[] = { |
---|
123 | 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc |
---|
124 | }; |
---|
125 | |
---|
126 | wchar_t max_wchar = (std::numeric_limits<wchar_t>::max)(); |
---|
127 | while (from != from_end && to != to_end) { |
---|
128 | |
---|
129 | // Check for invalid UCS-4 character |
---|
130 | if (*from > max_wchar) { |
---|
131 | from_next = from; |
---|
132 | to_next = to; |
---|
133 | return std::codecvt_base::error; |
---|
134 | } |
---|
135 | |
---|
136 | int cont_octet_count = get_cont_octet_out_count(*from); |
---|
137 | |
---|
138 | // RG - comment this formula better |
---|
139 | int shift_exponent = (cont_octet_count) * 6; |
---|
140 | |
---|
141 | // Process the first character |
---|
142 | *to++ = octet1_modifier_table[cont_octet_count] + |
---|
143 | (unsigned char)(*from / (1 << shift_exponent)); |
---|
144 | |
---|
145 | // Process the continuation characters |
---|
146 | // Invariants: At the start of the loop: |
---|
147 | // 1) 'i' continuing octets have been generated |
---|
148 | // 2) '*to' points to the next location to place an octet |
---|
149 | // 3) shift_exponent is 6 more than needed for the next octet |
---|
150 | int i = 0; |
---|
151 | while (i != cont_octet_count && to != to_end) { |
---|
152 | shift_exponent -= 6; |
---|
153 | *to++ = 0x80 + ((*from / (1 << shift_exponent)) % (1 << 6)); |
---|
154 | ++i; |
---|
155 | } |
---|
156 | // If we filled up the out buffer before encoding the character |
---|
157 | if(to == to_end && i != cont_octet_count) { |
---|
158 | from_next = from; |
---|
159 | to_next = to - (i+1); |
---|
160 | return std::codecvt_base::partial; |
---|
161 | } |
---|
162 | *from++; |
---|
163 | } |
---|
164 | from_next = from; |
---|
165 | to_next = to; |
---|
166 | // Were we done or did we run out of destination space |
---|
167 | if(from == from_end) return std::codecvt_base::ok; |
---|
168 | else return std::codecvt_base::partial; |
---|
169 | } |
---|
170 | |
---|
171 | // How many char objects can I process to get <= max_limit |
---|
172 | // wchar_t objects? |
---|
173 | int utf8_codecvt_facet::do_length( |
---|
174 | BOOST_CODECVT_DO_LENGTH_CONST std::mbstate_t &, |
---|
175 | const char * from, |
---|
176 | const char * from_end, |
---|
177 | std::size_t max_limit |
---|
178 | #if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600)) |
---|
179 | ) const throw() |
---|
180 | #else |
---|
181 | ) const |
---|
182 | #endif |
---|
183 | { |
---|
184 | // RG - this code is confusing! I need a better way to express it. |
---|
185 | // and test cases. |
---|
186 | |
---|
187 | // Invariants: |
---|
188 | // 1) last_octet_count has the size of the last measured character |
---|
189 | // 2) char_count holds the number of characters shown to fit |
---|
190 | // within the bounds so far (no greater than max_limit) |
---|
191 | // 3) from_next points to the octet 'last_octet_count' before the |
---|
192 | // last measured character. |
---|
193 | int last_octet_count=0; |
---|
194 | std::size_t char_count = 0; |
---|
195 | const char* from_next = from; |
---|
196 | // Use "<" because the buffer may represent incomplete characters |
---|
197 | while (from_next+last_octet_count <= from_end && char_count <= max_limit) { |
---|
198 | from_next += last_octet_count; |
---|
199 | last_octet_count = (get_octet_count(*from_next)); |
---|
200 | ++char_count; |
---|
201 | } |
---|
202 | return from_next-from_end; |
---|
203 | } |
---|
204 | |
---|
205 | unsigned int utf8_codecvt_facet::get_octet_count( |
---|
206 | unsigned char lead_octet |
---|
207 | ){ |
---|
208 | // if the 0-bit (MSB) is 0, then 1 character |
---|
209 | if (lead_octet <= 0x7f) return 1; |
---|
210 | |
---|
211 | // Otherwise the count number of consecutive 1 bits starting at MSB |
---|
212 | // assert(0xc0 <= lead_octet && lead_octet <= 0xfd); |
---|
213 | |
---|
214 | if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2; |
---|
215 | else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3; |
---|
216 | else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4; |
---|
217 | else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5; |
---|
218 | else return 6; |
---|
219 | } |
---|
220 | BOOST_UTF8_END_NAMESPACE |
---|
221 | |
---|
namespace {

// Number of continuing octets needed to encode 'word'. This generic
// version never reports more than 2; it is used for every wchar_t size
// other than 4 bytes, which has its own specialization below.
template<std::size_t s>
int get_cont_octet_out_count_impl(wchar_t word){
    if (!(word < 0x800)) {
        return 2;
    }
    return (word < 0x80) ? 0 : 1;
}

// Version selected when sizeof(wchar_t) == 4: the value may need up to
// 5 continuing octets.
template<>
int get_cont_octet_out_count_impl<4>(wchar_t word){
    // exclusive_limit[k] is the smallest value that no longer fits into
    // k continuing octets.
    const long exclusive_limit[] = {
        0x80, 0x800, 0x10000, 0x200000, 0x4000000
    };
    const int limit_count = 5;

    int cont_octets = 0;
    while (cont_octets != limit_count &&
           !(word < exclusive_limit[cont_octets])) {
        ++cont_octets;
    }
    return cont_octets;
}

} // namespace anonymous
---|
258 | |
---|
259 | BOOST_UTF8_BEGIN_NAMESPACE |
---|
260 | // How many "continuing octets" will be needed for this word |
---|
261 | // == total octets - 1. |
---|
262 | int utf8_codecvt_facet::get_cont_octet_out_count( |
---|
263 | wchar_t word |
---|
264 | ) const { |
---|
265 | return get_cont_octet_out_count_impl<sizeof(wchar_t)>(word); |
---|
266 | } |
---|
267 | BOOST_UTF8_END_NAMESPACE |
---|
268 | |
---|
269 | #endif |
---|