BadVPN – Blame information for rev 1
?pathlinks?
Rev | Author | Line No. | Line |
---|---|---|---|
1 | office | 1 | /** |
2 | * @file unicode_funcs.h |
||
3 | * @author Ambroz Bizjak <ambrop7@gmail.com> |
||
4 | * |
||
5 | * @section LICENSE |
||
6 | * |
||
7 | * Redistribution and use in source and binary forms, with or without |
||
8 | * modification, are permitted provided that the following conditions are met: |
||
9 | * 1. Redistributions of source code must retain the above copyright |
||
10 | * notice, this list of conditions and the following disclaimer. |
||
11 | * 2. Redistributions in binary form must reproduce the above copyright |
||
12 | * notice, this list of conditions and the following disclaimer in the |
||
13 | * documentation and/or other materials provided with the distribution. |
||
14 | * 3. Neither the name of the author nor the |
||
15 | * names of its contributors may be used to endorse or promote products |
||
16 | * derived from this software without specific prior written permission. |
||
17 | * |
||
18 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
||
19 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
||
20 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
||
21 | * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
||
22 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
||
23 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
||
24 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
||
25 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
||
26 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
||
27 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
||
28 | */ |
||
29 | |||
30 | #ifndef BADVPN_UNICODE_FUNCS_H |
||
31 | #define BADVPN_UNICODE_FUNCS_H |
||
32 | |||
33 | #include <misc/expstring.h> |
||
34 | #include <misc/bsize.h> |
||
35 | #include <misc/Utf8Encoder.h> |
||
36 | #include <misc/Utf8Decoder.h> |
||
37 | #include <misc/Utf16Encoder.h> |
||
38 | #include <misc/Utf16Decoder.h> |
||
39 | |||
40 | /** |
||
41 | * Decodes UTF-16 data as bytes into an allocated null-terminated UTF-8 string. |
||
42 | * |
||
43 | * @param data UTF-16 data, in big endian |
||
44 | * @param data_len size of data in bytes |
||
45 | * @param out_is_error if not NULL and the function returns a string, |
||
46 | * *out_is_error will be set to 0 or 1, indicating |
||
47 | * whether there have been errors decoding the input. |
||
48 | * A null decoded character is treated as an error. |
||
49 | * @return A null-terminated UTF-8 string which can be freed with free(), |
||
50 | * or NULL if out of memory. |
||
51 | */ |
||
52 | static char * unicode_decode_utf16_to_utf8 (const uint8_t *data, size_t data_len, int *out_is_error); |
||
53 | |||
54 | /** |
||
55 | * Converts UTF-8 data into big-endian UTF-16 data as bytes. |
||
56 | * |
||
57 | * @param data UTF-8 data |
||
58 | * @param data_len size of data in bytes |
||
59 | * @param out output buffer |
||
60 | * @param out_avail number of bytes available in output buffer |
||
61 | * @param out_len if not NULL, *out_len will contain the number of bytes |
||
62 | * required to store the resulting data (or overflow) |
||
63 | * @param out_is_error if not NULL, *out_is_error will contain 0 or 1, |
||
64 | * indicating whether there have been errors decoding |
||
65 | * the input |
||
66 | */ |
||
67 | static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error); |
||
68 | |||
69 | static char * unicode_decode_utf16_to_utf8 (const uint8_t *data, size_t data_len, int *out_is_error) |
||
70 | { |
||
71 | // will build the resulting UTF-8 string by appending to ExpString |
||
72 | ExpString str; |
||
73 | if (!ExpString_Init(&str)) { |
||
74 | goto fail0; |
||
75 | } |
||
76 | |||
77 | // init UTF-16 decoder |
||
78 | Utf16Decoder decoder; |
||
79 | Utf16Decoder_Init(&decoder); |
||
80 | |||
81 | // set initial input and input matching positions |
||
82 | size_t i_in = 0; |
||
83 | size_t i_ch = 0; |
||
84 | |||
85 | int error = 0; |
||
86 | |||
87 | while (i_in < data_len) { |
||
88 | // read two input bytes from the input position |
||
89 | uint8_t x = data[i_in++]; |
||
90 | if (i_in == data_len) { |
||
91 | break; |
||
92 | } |
||
93 | uint8_t y = data[i_in++]; |
||
94 | |||
95 | // combine them into a 16-bit value |
||
96 | uint16_t xy = (((uint16_t)x << 8) | (uint16_t)y); |
||
97 | |||
98 | // give the 16-bit value to the UTF-16 decoder and maybe |
||
99 | // receive a Unicode character back |
||
100 | uint32_t ch; |
||
101 | if (!Utf16Decoder_Input(&decoder, xy, &ch)) { |
||
102 | continue; |
||
103 | } |
||
104 | |||
105 | if (!error) { |
||
106 | // encode the Unicode character back into UTF-16 |
||
107 | uint16_t chenc[2]; |
||
108 | int chenc_n = Utf16Encoder_EncodeCharacter(ch, chenc); |
||
109 | ASSERT(chenc_n > 0) |
||
110 | |||
111 | // match the result with input |
||
112 | for (int chenc_i = 0; chenc_i < chenc_n; chenc_i++) { |
||
113 | uint8_t cx = (chenc[chenc_i] >> 8); |
||
114 | uint8_t cy = (chenc[chenc_i] & 0xFF); |
||
115 | |||
116 | if (i_ch >= data_len || data[i_ch] != cx) { |
||
117 | error = 1; |
||
118 | break; |
||
119 | } |
||
120 | i_ch++; |
||
121 | |||
122 | if (i_ch >= data_len || data[i_ch] != cy) { |
||
123 | error = 1; |
||
124 | break; |
||
125 | } |
||
126 | i_ch++; |
||
127 | } |
||
128 | } |
||
129 | |||
130 | // we don't like null Unicode characters because we're building a |
||
131 | // null-terminated UTF-8 string |
||
132 | if (ch == 0) { |
||
133 | error = 1; |
||
134 | continue; |
||
135 | } |
||
136 | |||
137 | // encode the Unicode character into UTF-8 |
||
138 | uint8_t enc[5]; |
||
139 | int enc_n = Utf8Encoder_EncodeCharacter(ch, enc); |
||
140 | ASSERT(enc_n > 0) |
||
141 | |||
142 | // append the resulting UTF-8 bytes to the result string |
||
143 | enc[enc_n] = 0; |
||
144 | if (!ExpString_Append(&str, enc)) { |
||
145 | goto fail1; |
||
146 | } |
||
147 | } |
||
148 | |||
149 | // check if we matched the whole input string when encoding back |
||
150 | if (i_ch < data_len) { |
||
151 | error = 1; |
||
152 | } |
||
153 | |||
154 | if (out_is_error) { |
||
155 | *out_is_error = error; |
||
156 | } |
||
157 | return ExpString_Get(&str); |
||
158 | |||
159 | fail1: |
||
160 | ExpString_Free(&str); |
||
161 | fail0: |
||
162 | return NULL; |
||
163 | } |
||
164 | |||
165 | static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error) |
||
166 | { |
||
167 | Utf8Decoder decoder; |
||
168 | Utf8Decoder_Init(&decoder); |
||
169 | |||
170 | size_t i_in = 0; |
||
171 | size_t i_ch = 0; |
||
172 | |||
173 | bsize_t len = bsize_fromsize(0); |
||
174 | |||
175 | int error = 0; |
||
176 | |||
177 | while (i_in < data_len) { |
||
178 | uint8_t x = data[i_in++]; |
||
179 | |||
180 | uint32_t ch; |
||
181 | if (!Utf8Decoder_Input(&decoder, x, &ch)) { |
||
182 | continue; |
||
183 | } |
||
184 | |||
185 | if (!error) { |
||
186 | uint8_t chenc[4]; |
||
187 | int chenc_n = Utf8Encoder_EncodeCharacter(ch, chenc); |
||
188 | ASSERT(chenc_n > 0) |
||
189 | |||
190 | for (int chenc_i = 0; chenc_i < chenc_n; chenc_i++) { |
||
191 | if (i_ch >= data_len || data[i_ch] != chenc[chenc_i]) { |
||
192 | error = 1; |
||
193 | break; |
||
194 | } |
||
195 | i_ch++; |
||
196 | } |
||
197 | } |
||
198 | |||
199 | uint16_t enc[2]; |
||
200 | int enc_n = Utf16Encoder_EncodeCharacter(ch, enc); |
||
201 | ASSERT(enc_n > 0) |
||
202 | |||
203 | len = bsize_add(len, bsize_fromsize(2 * enc_n)); |
||
204 | |||
205 | for (int enc_i = 0; enc_i < enc_n; enc_i++) { |
||
206 | if (out_avail == 0) { |
||
207 | break; |
||
208 | } |
||
209 | *(out++) = (enc[enc_i] >> 8); |
||
210 | out_avail--; |
||
211 | |||
212 | if (out_avail == 0) { |
||
213 | break; |
||
214 | } |
||
215 | *(out++) = (enc[enc_i] & 0xFF); |
||
216 | out_avail--; |
||
217 | } |
||
218 | } |
||
219 | |||
220 | if (i_ch < data_len) { |
||
221 | error = 1; |
||
222 | } |
||
223 | |||
224 | if (out_len) { |
||
225 | *out_len = len; |
||
226 | } |
||
227 | if (out_is_error) { |
||
228 | *out_is_error = error; |
||
229 | } |
||
230 | } |
||
231 | |||
232 | #endif |