WebSVN – BadVPN – Blame – Rev 1 – /misc/unicode

1

office

1

/**

2

* @file unicode_funcs.h

3

* @author Ambroz Bizjak <ambrop7@gmail.com>

*

* @section LICENSE

*

* Redistribution and use in source and binary forms, with or without

8

* modification, are permitted provided that the following conditions are met:

9

* 1. Redistributions of source code must retain the above copyright

10

* notice, this list of conditions and the following disclaimer.

11

* 2. Redistributions in binary form must reproduce the above copyright

12

* notice, this list of conditions and the following disclaimer in the

13

* documentation and/or other materials provided with the distribution.

14

* 3. Neither the name of the author nor the

15

* names of its contributors may be used to endorse or promote products

16

* derived from this software without specific prior written permission.

17

*

18

* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND

19

* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED

20

* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE

21

* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY

22

* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES

23

* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

24

* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND

25

* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

26

* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

27

* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

28

*/

29

30

#ifndef BADVPN_UNICODE_FUNCS_H

31

#define BADVPN_UNICODE_FUNCS_H

32

33

#include <misc/expstring.h>

34

#include <misc/bsize.h>

35

#include <misc/Utf8Encoder.h>

36

#include <misc/Utf8Decoder.h>

37

#include <misc/Utf16Encoder.h>

38

#include <misc/Utf16Decoder.h>

39

40

/**

41

* Decodes UTF-16 data as bytes into an allocated null-terminated UTF-8 string.

42

*

43

* @param data UTF-16 data, in big-endian byte order

44

* @param data_len size of data in bytes

45

* @param out_is_error if not NULL and the function returns a string,

46

* *out_is_error will be set to 0 or 1, indicating

47

* whether there have been errors decoding the input.

48

* A null decoded character is treated as an error.

49

* @return A null-terminated UTF-8 string which can be freed with free(),

50

* or NULL if out of memory.

51

*/

52

static char * unicode_decode_utf16_to_utf8 (const uint8_t *data, size_t data_len, int *out_is_error);

53

54

/**

55

* Decodes UTF-8 data into UTF-16 data as bytes, in big-endian byte order.

56

*

57

* @param data UTF-8 data

58

* @param data_len size of data in bytes

59

* @param out output buffer

60

* @param out_avail number of bytes available in output buffer

61

* @param out_len if not NULL, *out_len will contain the number of bytes

62

* required to store the resulting data (or overflow)

63

* @param out_is_error if not NULL, *out_is_error will contain 0 or 1,

64

* indicating whether there have been errors decoding

65

* the input

66

*/

67

static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error);

68

69

static char * unicode_decode_utf16_to_utf8 (const uint8_t *data, size_t data_len, int *out_is_error)

70

{

71

// will build the resulting UTF-8 string by appending to ExpString

72

ExpString str;

73

if (!ExpString_Init(&str)) {

goto fail0;

}

// init UTF-16 decoder

78

Utf16Decoder decoder;

79

Utf16Decoder_Init(&decoder);

80

81

// set initial input and input matching positions

size_t i_in = 0;

size_t i_ch = 0;

int error = 0;

while (i_in < data_len) {

88

// read two input bytes from the input position

89

uint8_t x = data[i_in++];

90

if (i_in == data_len) {

91

break;

92

}

93

uint8_t y = data[i_in++];

94

95

// combine them into a 16-bit value

96

uint16_t xy = (((uint16_t)x << 8) | (uint16_t)y);

97

98

// give the 16-bit value to the UTF-16 decoder and maybe

99

// receive a Unicode character back

100

uint32_t ch;

101

if (!Utf16Decoder_Input(&decoder, xy, &ch)) {

continue;

}

if (!error) {

// encode the Unicode character back into UTF-16

107

uint16_t chenc[2];

108

int chenc_n = Utf16Encoder_EncodeCharacter(ch, chenc);

109

ASSERT(chenc_n > 0)

110

111

// match the result with input

112

for (int chenc_i = 0; chenc_i < chenc_n; chenc_i++) {

113

uint8_t cx = (chenc[chenc_i] >> 8);

114

uint8_t cy = (chenc[chenc_i] & 0xFF);

115

116

if (i_ch >= data_len || data[i_ch] != cx) {

error = 1;

break;

}

i_ch++;

if (i_ch >= data_len || data[i_ch] != cy) {

error = 1;

break;

}

i_ch++;

}

}

// we don't like null Unicode characters because we're building a

131

// null-terminated UTF-8 string

if (ch == 0) {

error = 1;

continue;

}

// encode the Unicode character into UTF-8

138

uint8_t enc[5];

139

int enc_n = Utf8Encoder_EncodeCharacter(ch, enc);

140

ASSERT(enc_n > 0)

141

142

// append the resulting UTF-8 bytes to the result string

143

enc[enc_n] = 0;

144

if (!ExpString_Append(&str, enc)) {

goto fail1;

}

}

// check if we matched the whole input string when encoding back

150

if (i_ch < data_len) {

error = 1;

}

if (out_is_error) {

155

*out_is_error = error;

156

}

157

return ExpString_Get(&str);

158

159

fail1:

160

ExpString_Free(&str);

fail0:

return NULL;

}

// Converts UTF-8 bytes into UTF-16 code units written as bytes, high byte
// first. At most out_avail bytes are stored in out, but the full length of
// the conversion is still accumulated and reported through *out_len (when
// out_len is not NULL). Every decoded character is re-encoded to UTF-8 and
// compared with the input; a mismatch or leftover unmatched input is
// reported through *out_is_error (when out_is_error is not NULL).
static void unicode_decode_utf8_to_utf16 (const uint8_t *data, size_t data_len, uint8_t *out, size_t out_avail, bsize_t *out_len, int *out_is_error)
{
    Utf8Decoder utf8_dec;
    Utf8Decoder_Init(&utf8_dec);
    
    // verify_pos is the offset up to which the input has been confirmed to
    // equal the re-encoding of the characters decoded so far
    size_t verify_pos = 0;
    bsize_t total = bsize_fromsize(0);
    int had_error = 0;
    
    for (size_t read_pos = 0; read_pos < data_len; read_pos++) {
        // feed one input byte to the decoder; go on to the next byte unless
        // a character came out
        uint32_t ch;
        if (!Utf8Decoder_Input(&utf8_dec, data[read_pos], &ch)) {
            continue;
        }
        
        // round-trip check, skipped once an error has already been seen
        if (!had_error) {
            uint8_t reenc[4];
            int reenc_n = Utf8Encoder_EncodeCharacter(ch, reenc);
            ASSERT(reenc_n > 0)
            
            for (int k = 0; k < reenc_n; k++) {
                if (verify_pos >= data_len || data[verify_pos] != reenc[k]) {
                    had_error = 1;
                    break;
                }
                verify_pos++;
            }
        }
        
        // encode the character as UTF-16 code units
        uint16_t units[2];
        int units_n = Utf16Encoder_EncodeCharacter(ch, units);
        ASSERT(units_n > 0)
        
        // the reported length counts every unit, even those that do not fit
        total = bsize_add(total, bsize_fromsize(2 * units_n));
        
        // store as many bytes as still fit; output may stop in the middle of
        // a unit when the buffer runs out
        for (int u = 0; u < units_n && out_avail > 0; u++) {
            const uint8_t pair[2] = { (uint8_t)(units[u] >> 8), (uint8_t)(units[u] & 0xFF) };
            for (int b = 0; b < 2 && out_avail > 0; b++) {
                *(out++) = pair[b];
                out_avail--;
            }
        }
    }
    
    // input left unmatched by the round-trip check counts as an error
    if (verify_pos < data_len) {
        had_error = 1;
    }
    
    if (out_len) {
        *out_len = total;
    }
    
    if (out_is_error) {
        *out_is_error = had_error;
    }
}

#endif

BadVPN – Blame information for rev 1