BadVPN – Blame information for rev 1

Subversion Repositories:
Rev:
Rev Author Line No. Line
1 office 1 /**
2 * @file unicode_funcs.h
3 * @author Ambroz Bizjak <ambrop7@gmail.com>
4 *
5 * @section LICENSE
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. Neither the name of the author nor the
15 * names of its contributors may be used to endorse or promote products
16 * derived from this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
22 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
25 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29  
30 #ifndef BADVPN_UNICODE_FUNCS_H
31 #define BADVPN_UNICODE_FUNCS_H
32  
33 #include <misc/expstring.h>
34 #include <misc/bsize.h>
35 #include <misc/Utf8Encoder.h>
36 #include <misc/Utf8Decoder.h>
37 #include <misc/Utf16Encoder.h>
38 #include <misc/Utf16Decoder.h>
39  
/**
 * Decodes UTF-16 data as bytes into an allocated null-terminated UTF-8 string.
 *
 * The input is consumed two bytes at a time, high byte first. Decoding does
 * not stop at the first problem: everything that can be decoded is converted,
 * and problems are only reported through out_is_error.
 *
 * @param data UTF-16 data, in big endian
 * @param data_len size of data in bytes. If this is odd, the trailing byte
 *                 is not decoded and the input is reported as erroneous.
 * @param out_is_error if not NULL and the function returns a string,
 *                     *out_is_error will be set to 0 or 1, indicating
 *                     whether there have been errors decoding the input.
 *                     The input is considered erroneous unless re-encoding
 *                     the decoded characters reproduces it exactly.
 *                     A null decoded character is treated as an error and
 *                     is left out of the result.
 *                     If the function returns NULL, *out_is_error is not
 *                     modified.
 * @return A UTF-8 null-terminated string which can be freed with free(),
 *         or NULL if out of memory.
 */
static char * unicode_decode_utf16_to_utf8 (const uint8_t *data, size_t data_len, int *out_is_error);
53  
/**
 * Decodes UTF-8 data into UTF-16 data as bytes.
 *
 * Each UTF-16 code unit is written as two bytes, high byte first (big endian).
 * The whole input is always processed, even after the output buffer is full,
 * so that the required output size can be reported through out_len.
 *
 * @param data UTF-8 data
 * @param data_len size of data in bytes
 * @param out output buffer. It is not accessed when out_avail is 0.
 * @param out_avail number of bytes available in output buffer. If this is
 *                  less than the required size, the output is truncated to
 *                  out_avail bytes; truncation may fall in the middle of a
 *                  code unit.
 * @param out_len if not NULL, *out_len will contain the number of bytes
 *                required to store the resulting data (or overflow)
 * @param out_is_error if not NULL, *out_is_error will contain 0 or 1,
 *                     indicating whether there have been errors decoding
 *                     the input. The input is considered erroneous unless
 *                     re-encoding the decoded characters reproduces it
 *                     exactly.
 */
static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error);
68  
69 static char * unicode_decode_utf16_to_utf8 (const uint8_t *data, size_t data_len, int *out_is_error)
70 {
71 // will build the resulting UTF-8 string by appending to ExpString
72 ExpString str;
73 if (!ExpString_Init(&str)) {
74 goto fail0;
75 }
76  
77 // init UTF-16 decoder
78 Utf16Decoder decoder;
79 Utf16Decoder_Init(&decoder);
80  
81 // set initial input and input matching positions
82 size_t i_in = 0;
83 size_t i_ch = 0;
84  
85 int error = 0;
86  
87 while (i_in < data_len) {
88 // read two input bytes from the input position
89 uint8_t x = data[i_in++];
90 if (i_in == data_len) {
91 break;
92 }
93 uint8_t y = data[i_in++];
94  
95 // combine them into a 16-bit value
96 uint16_t xy = (((uint16_t)x << 8) | (uint16_t)y);
97  
98 // give the 16-bit value to the UTF-16 decoder and maybe
99 // receive a Unicode character back
100 uint32_t ch;
101 if (!Utf16Decoder_Input(&decoder, xy, &ch)) {
102 continue;
103 }
104  
105 if (!error) {
106 // encode the Unicode character back into UTF-16
107 uint16_t chenc[2];
108 int chenc_n = Utf16Encoder_EncodeCharacter(ch, chenc);
109 ASSERT(chenc_n > 0)
110  
111 // match the result with input
112 for (int chenc_i = 0; chenc_i < chenc_n; chenc_i++) {
113 uint8_t cx = (chenc[chenc_i] >> 8);
114 uint8_t cy = (chenc[chenc_i] & 0xFF);
115  
116 if (i_ch >= data_len || data[i_ch] != cx) {
117 error = 1;
118 break;
119 }
120 i_ch++;
121  
122 if (i_ch >= data_len || data[i_ch] != cy) {
123 error = 1;
124 break;
125 }
126 i_ch++;
127 }
128 }
129  
130 // we don't like null Unicode characters because we're building a
131 // null-terminated UTF-8 string
132 if (ch == 0) {
133 error = 1;
134 continue;
135 }
136  
137 // encode the Unicode character into UTF-8
138 uint8_t enc[5];
139 int enc_n = Utf8Encoder_EncodeCharacter(ch, enc);
140 ASSERT(enc_n > 0)
141  
142 // append the resulting UTF-8 bytes to the result string
143 enc[enc_n] = 0;
144 if (!ExpString_Append(&str, enc)) {
145 goto fail1;
146 }
147 }
148  
149 // check if we matched the whole input string when encoding back
150 if (i_ch < data_len) {
151 error = 1;
152 }
153  
154 if (out_is_error) {
155 *out_is_error = error;
156 }
157 return ExpString_Get(&str);
158  
159 fail1:
160 ExpString_Free(&str);
161 fail0:
162 return NULL;
163 }
164  
165 static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error)
166 {
167 Utf8Decoder decoder;
168 Utf8Decoder_Init(&decoder);
169  
170 size_t i_in = 0;
171 size_t i_ch = 0;
172  
173 bsize_t len = bsize_fromsize(0);
174  
175 int error = 0;
176  
177 while (i_in < data_len) {
178 uint8_t x = data[i_in++];
179  
180 uint32_t ch;
181 if (!Utf8Decoder_Input(&decoder, x, &ch)) {
182 continue;
183 }
184  
185 if (!error) {
186 uint8_t chenc[4];
187 int chenc_n = Utf8Encoder_EncodeCharacter(ch, chenc);
188 ASSERT(chenc_n > 0)
189  
190 for (int chenc_i = 0; chenc_i < chenc_n; chenc_i++) {
191 if (i_ch >= data_len || data[i_ch] != chenc[chenc_i]) {
192 error = 1;
193 break;
194 }
195 i_ch++;
196 }
197 }
198  
199 uint16_t enc[2];
200 int enc_n = Utf16Encoder_EncodeCharacter(ch, enc);
201 ASSERT(enc_n > 0)
202  
203 len = bsize_add(len, bsize_fromsize(2 * enc_n));
204  
205 for (int enc_i = 0; enc_i < enc_n; enc_i++) {
206 if (out_avail == 0) {
207 break;
208 }
209 *(out++) = (enc[enc_i] >> 8);
210 out_avail--;
211  
212 if (out_avail == 0) {
213 break;
214 }
215 *(out++) = (enc[enc_i] & 0xFF);
216 out_avail--;
217 }
218 }
219  
220 if (i_ch < data_len) {
221 error = 1;
222 }
223  
224 if (out_len) {
225 *out_len = len;
226 }
227 if (out_is_error) {
228 *out_is_error = error;
229 }
230 }
231  
232 #endif