BadVPN – Blame information for rev 1
?pathlinks?
Rev | Author | Line No. | Line |
---|---|---|---|
1 | office | 1 | /** |
2 | * @file Utf8Decoder.h |
||
3 | * @author Ambroz Bizjak <ambrop7@gmail.com> |
||
4 | * |
||
5 | * @section LICENSE |
||
6 | * |
||
7 | * Redistribution and use in source and binary forms, with or without |
||
8 | * modification, are permitted provided that the following conditions are met: |
||
9 | * 1. Redistributions of source code must retain the above copyright |
||
10 | * notice, this list of conditions and the following disclaimer. |
||
11 | * 2. Redistributions in binary form must reproduce the above copyright |
||
12 | * notice, this list of conditions and the following disclaimer in the |
||
13 | * documentation and/or other materials provided with the distribution. |
||
14 | * 3. Neither the name of the author nor the |
||
15 | * names of its contributors may be used to endorse or promote products |
||
16 | * derived from this software without specific prior written permission. |
||
17 | * |
||
18 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
||
19 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
||
20 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
||
21 | * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY |
||
22 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
||
23 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
||
24 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
||
25 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
||
26 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
||
27 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
||
28 | */ |
||
29 | |||
30 | #ifndef BADVPN_UTF8DECODER_H |
||
31 | #define BADVPN_UTF8DECODER_H |
||
32 | |||
33 | #include <stdint.h> |
||
34 | |||
35 | #include <misc/debug.h> |
||
36 | |||
/**
 * State of an incremental UTF-8 decoder that turns a byte stream into
 * Unicode characters, one input byte at a time.
 *
 * The object carries the partially decoded multi-byte sequence between
 * calls; when no sequence is in progress, only the bytes field is meaningful.
 */
typedef struct {
    // Total length in bytes (2, 3 or 4) of the multi-byte sequence being
    // decoded, or 0 when no sequence is in progress.
    int bytes;

    // Zero-based index, within the current sequence, of the next byte
    // expected. Set to 1 right after the lead byte has been consumed.
    int pos;

    // Code point bits collected so far from the current sequence.
    uint32_t ch;
} Utf8Decoder;
||
45 | |||
46 | /** |
||
47 | * Initializes the UTF-8 decoder. |
||
48 | * |
||
49 | * @param o the object |
||
50 | */ |
||
51 | static void Utf8Decoder_Init (Utf8Decoder *o); |
||
52 | |||
53 | /** |
||
54 | * Inputs a byte to the decoder. |
||
55 | * |
||
56 | * @param o the object |
||
57 | * @param b byte to input |
||
58 | * @param out_ch will receive a Unicode character if this function returns 1. |
||
59 | * If written, the character will be in the range 0 - 0x10FFFF, |
||
60 | * excluding the surrogate range 0xD800 - 0xDFFF. |
||
61 | * @return 1 if a Unicode character has been written to *out_ch, 0 if not |
||
62 | */ |
||
63 | static int Utf8Decoder_Input (Utf8Decoder *o, uint8_t b, uint32_t *out_ch); |
||
64 | |||
65 | void Utf8Decoder_Init (Utf8Decoder *o) |
||
66 | { |
||
67 | o->bytes = 0; |
||
68 | } |
||
69 | |||
70 | int Utf8Decoder_Input (Utf8Decoder *o, uint8_t b, uint32_t *out_ch) |
||
71 | { |
||
72 | // one-byte character |
||
73 | if ((b & 128) == 0) { |
||
74 | o->bytes = 0; |
||
75 | *out_ch = b; |
||
76 | return 1; |
||
77 | } |
||
78 | |||
79 | // start of two-byte character |
||
80 | if ((b & 224) == 192) { |
||
81 | o->bytes = 2; |
||
82 | o->pos = 1; |
||
83 | o->ch = (uint32_t)(b & 31) << 6; |
||
84 | return 0; |
||
85 | } |
||
86 | |||
87 | // start of three-byte character |
||
88 | if ((b & 240) == 224) { |
||
89 | o->bytes = 3; |
||
90 | o->pos = 1; |
||
91 | o->ch = (uint32_t)(b & 15) << 12; |
||
92 | return 0; |
||
93 | } |
||
94 | |||
95 | // start of four-byte character |
||
96 | if ((b & 248) == 240) { |
||
97 | o->bytes = 4; |
||
98 | o->pos = 1; |
||
99 | o->ch = (uint32_t)(b & 7) << 18; |
||
100 | return 0; |
||
101 | } |
||
102 | |||
103 | // continuation of multi-byte character |
||
104 | if ((b & 192) == 128 && o->bytes > 0) { |
||
105 | ASSERT(o->bytes <= 4) |
||
106 | ASSERT(o->pos > 0) |
||
107 | ASSERT(o->pos < o->bytes) |
||
108 | |||
109 | // add bits from this byte |
||
110 | o->ch |= (uint32_t)(b & 63) << (6 * (o->bytes - o->pos - 1)); |
||
111 | |||
112 | // end of multi-byte character? |
||
113 | if (o->pos == o->bytes - 1) { |
||
114 | // reset state |
||
115 | o->bytes = 0; |
||
116 | |||
117 | // don't report out-of-range characters |
||
118 | if (o->ch > UINT32_C(0x10FFFF)) { |
||
119 | return 0; |
||
120 | } |
||
121 | |||
122 | // don't report surrogates |
||
123 | if (o->ch >= UINT32_C(0xD800) && o->ch <= UINT32_C(0xDFFF)) { |
||
124 | return 0; |
||
125 | } |
||
126 | |||
127 | *out_ch = o->ch; |
||
128 | return 1; |
||
129 | } |
||
130 | |||
131 | // increment byte index |
||
132 | o->pos++; |
||
133 | |||
134 | return 0; |
||
135 | } |
||
136 | |||
137 | // error, reset state |
||
138 | o->bytes = 0; |
||
139 | |||
140 | return 0; |
||
141 | } |
||
142 | |||
143 | #endif |