BadVPN – Blame information for rev 1

Subversion Repositories:
Rev:
Rev Author Line No. Line
1 office 1 /**
2 * @file Utf8Decoder.h
3 * @author Ambroz Bizjak <ambrop7@gmail.com>
4 *
5 * @section LICENSE
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 * 3. Neither the name of the author nor the
15 * names of its contributors may be used to endorse or promote products
16 * derived from this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
22 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
25 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29  
30 #ifndef BADVPN_UTF8DECODER_H
31 #define BADVPN_UTF8DECODER_H
32  
33 #include <stdint.h>
34  
35 #include <misc/debug.h>
36  
/**
 * State of an incremental UTF-8 decoder which turns a byte stream into
 * Unicode characters, one input byte at a time.
 */
typedef struct {
    // Total length in bytes (2, 3 or 4) of the multi-byte sequence being
    // decoded, or 0 when no sequence is in progress.
    int bytes;
    // Index within the sequence of the next expected byte (the lead byte is
    // index 0). Only meaningful while bytes > 0.
    int pos;
    // Character bits accumulated so far. Only meaningful while bytes > 0.
    uint32_t ch;
} Utf8Decoder;
45  
46 /**
47 * Initializes the UTF-8 decoder.
48 *
49 * @param o the object
50 */
51 static void Utf8Decoder_Init (Utf8Decoder *o);
52  
53 /**
54 * Inputs a byte to the decoder.
55 *
56 * @param o the object
57 * @param b byte to input
58 * @param out_ch will receive a Unicode character if this function returns 1.
59 * If written, the character will be in the range 0 - 0x10FFFF,
60 * excluding the surrogate range 0xD800 - 0xDFFF.
61 * @return 1 if a Unicode character has been written to *out_ch, 0 if not
62 */
63 static int Utf8Decoder_Input (Utf8Decoder *o, uint8_t b, uint32_t *out_ch);
64  
65 void Utf8Decoder_Init (Utf8Decoder *o)
66 {
67 o->bytes = 0;
68 }
69  
70 int Utf8Decoder_Input (Utf8Decoder *o, uint8_t b, uint32_t *out_ch)
71 {
72 // one-byte character
73 if ((b & 128) == 0) {
74 o->bytes = 0;
75 *out_ch = b;
76 return 1;
77 }
78  
79 // start of two-byte character
80 if ((b & 224) == 192) {
81 o->bytes = 2;
82 o->pos = 1;
83 o->ch = (uint32_t)(b & 31) << 6;
84 return 0;
85 }
86  
87 // start of three-byte character
88 if ((b & 240) == 224) {
89 o->bytes = 3;
90 o->pos = 1;
91 o->ch = (uint32_t)(b & 15) << 12;
92 return 0;
93 }
94  
95 // start of four-byte character
96 if ((b & 248) == 240) {
97 o->bytes = 4;
98 o->pos = 1;
99 o->ch = (uint32_t)(b & 7) << 18;
100 return 0;
101 }
102  
103 // continuation of multi-byte character
104 if ((b & 192) == 128 && o->bytes > 0) {
105 ASSERT(o->bytes <= 4)
106 ASSERT(o->pos > 0)
107 ASSERT(o->pos < o->bytes)
108  
109 // add bits from this byte
110 o->ch |= (uint32_t)(b & 63) << (6 * (o->bytes - o->pos - 1));
111  
112 // end of multi-byte character?
113 if (o->pos == o->bytes - 1) {
114 // reset state
115 o->bytes = 0;
116  
117 // don't report out-of-range characters
118 if (o->ch > UINT32_C(0x10FFFF)) {
119 return 0;
120 }
121  
122 // don't report surrogates
123 if (o->ch >= UINT32_C(0xD800) && o->ch <= UINT32_C(0xDFFF)) {
124 return 0;
125 }
126  
127 *out_ch = o->ch;
128 return 1;
129 }
130  
131 // increment byte index
132 o->pos++;
133  
134 return 0;
135 }
136  
137 // error, reset state
138 o->bytes = 0;
139  
140 return 0;
141 }
142  
143 #endif