WebSVN – BadVPN – Blame – Rev 1

1

office

1

/**
 * @file Utf8Decoder.h
 * @author Ambroz Bizjak <ambrop7@gmail.com>
 * 
 * @section LICENSE
 * 
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the author nor the
 *    names of its contributors may be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 * 
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

29

30

#ifndef BADVPN_UTF8DECODER_H

31

#define BADVPN_UTF8DECODER_H

32

33

#include <stdint.h>

34

35

#include <misc/debug.h>

36

37

/**
 * Incremental decoder state for turning a stream of UTF-8 bytes into
 * Unicode characters, one input byte at a time.
 */
typedef struct {
    /**
     * Total length in bytes of the multi-byte sequence currently being
     * decoded, or 0 when no sequence is in progress.
     */
    int bytes;
    
    /**
     * Index of the next byte expected within the current sequence
     * (the lead byte is index 0). Only meaningful while bytes > 0.
     */
    int pos;
    
    /**
     * Code point bits accumulated so far for the current sequence.
     * Only meaningful while bytes > 0.
     */
    uint32_t ch;
} Utf8Decoder;

/**

* Initializes the UTF-8 decoder.

48

*

49

* @param o the object

50

*/

51

static void Utf8Decoder_Init (Utf8Decoder *o);

52

53

/**

54

* Inputs a byte to the decoder.

55

*

56

* @param o the object

57

* @param b byte to input

58

* @param out_ch will receive a Unicode character if this function returns 1.

59

* If written, the character will be in the range 0 - 0x10FFFF,

60

* excluding the surrogate range 0xD800 - 0xDFFF.

61

* @return 1 if a Unicode character has been written to *out_ch, 0 if not

62

*/

63

static int Utf8Decoder_Input (Utf8Decoder *o, uint8_t b, uint32_t *out_ch);

64

65

void Utf8Decoder_Init (Utf8Decoder *o)
{
    // A byte count of zero marks "no multi-byte sequence in progress".
    // pos and ch are deliberately left unset: Utf8Decoder_Input assigns
    // both whenever it sees the lead byte of a multi-byte sequence.
    o->bytes = 0;
}

// Feeds one byte into the decoder state machine.
//
// Returns 1 and writes *out_ch when the byte completes a valid character,
// otherwise returns 0 and leaves *out_ch untouched. A completed sequence is
// discarded (returns 0) if it is an overlong encoding, if it is greater than
// 0x10FFFF, or if it lies in the surrogate range 0xD800 - 0xDFFF.
// A lead byte received in the middle of a sequence abandons that sequence
// and starts a new one.
int Utf8Decoder_Input (Utf8Decoder *o, uint8_t b, uint32_t *out_ch)
{
    // one-byte character; any sequence in progress is abandoned
    if ((b & 128) == 0) {
        o->bytes = 0;
        *out_ch = b;
        return 1;
    }
    
    // start of two-byte character (110xxxxx)
    if ((b & 224) == 192) {
        o->bytes = 2;
        o->pos = 1;
        o->ch = (uint32_t)(b & 31) << 6;
        return 0;
    }
    
    // start of three-byte character (1110xxxx)
    if ((b & 240) == 224) {
        o->bytes = 3;
        o->pos = 1;
        o->ch = (uint32_t)(b & 15) << 12;
        return 0;
    }
    
    // start of four-byte character (11110xxx)
    if ((b & 248) == 240) {
        o->bytes = 4;
        o->pos = 1;
        o->ch = (uint32_t)(b & 7) << 18;
        return 0;
    }
    
    // continuation of multi-byte character (10xxxxxx)
    if ((b & 192) == 128 && o->bytes > 0) {
        ASSERT(o->bytes <= 4)
        ASSERT(o->pos > 0)
        ASSERT(o->pos < o->bytes)
        
        // add bits from this byte; later bytes carry less significant bits
        o->ch |= (uint32_t)(b & 63) << (6 * (o->bytes - o->pos - 1));
        
        // end of multi-byte character?
        if (o->pos == o->bytes - 1) {
            // smallest code point that actually requires a sequence of
            // this length; anything below it is an overlong encoding
            uint32_t min_ch;
            if (o->bytes == 2) {
                min_ch = UINT32_C(0x80);
            } else if (o->bytes == 3) {
                min_ch = UINT32_C(0x800);
            } else {
                min_ch = UINT32_C(0x10000);
            }
            
            // reset state
            o->bytes = 0;
            
            // don't report overlong encodings (e.g. C0 80 for U+0000);
            // UTF-8 requires the shortest form
            if (o->ch < min_ch) {
                return 0;
            }
            
            // don't report out-of-range characters
            if (o->ch > UINT32_C(0x10FFFF)) {
                return 0;
            }
            
            // don't report surrogates
            if (o->ch >= UINT32_C(0xD800) && o->ch <= UINT32_C(0xDFFF)) {
                return 0;
            }
            
            *out_ch = o->ch;
            return 1;
        }
        
        // increment byte index
        o->pos++;
        
        return 0;
    }
    
    // error (stray continuation byte or invalid lead byte), reset state
    o->bytes = 0;
    
    return 0;
}

#endif

BadVPN – Blame information for rev 1