WebSVN – nexmon – Blame – Rev 1 – /utilities/glib/glib/gutf8.c

1

office

1

/* gutf8.c - Operations on UTF-8 strings.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

19

20

#include "config.h"

21

22

#include <stdlib.h>

23

#ifdef HAVE_CODESET

24

#include <langinfo.h>

25

#endif

26

#include <string.h>

27

28

#ifdef G_PLATFORM_WIN32

29

#include <stdio.h>

30

#define STRICT

31

#include <windows.h>

#undef STRICT

#endif

#include "gconvert.h"

36

#include "ghash.h"

37

#include "gstrfuncs.h"

38

#include "gtestutils.h"

39

#include "gtypes.h"

40

#include "gthread.h"

41

#include "glibintl.h"

42

43

/*
 * UTF8_COMPUTE:
 * @Char: the first byte of a UTF-8 sequence, as an unsigned value
 * @Mask: lvalue that receives the mask selecting the payload bits of @Char
 * @Len: lvalue that receives the sequence length in bytes (1 to 6),
 *   or -1 when @Char is not a valid lead byte
 *
 * Classifies a lead byte by its high bits.  @Mask is left unmodified when
 * @Len is set to -1.  @Char is evaluated several times, so it must not
 * have side effects.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and can be used safely under an unbraced if/else;
 * invoke it with a trailing semicolon.
 */
#define UTF8_COMPUTE(Char, Mask, Len)                                         \
  do                                                                          \
    {                                                                         \
      if ((Char) < 128)                                                       \
        {                                                                     \
          (Len) = 1;                                                          \
          (Mask) = 0x7f;                                                      \
        }                                                                     \
      else if (((Char) & 0xe0) == 0xc0)                                       \
        {                                                                     \
          (Len) = 2;                                                          \
          (Mask) = 0x1f;                                                      \
        }                                                                     \
      else if (((Char) & 0xf0) == 0xe0)                                       \
        {                                                                     \
          (Len) = 3;                                                          \
          (Mask) = 0x0f;                                                      \
        }                                                                     \
      else if (((Char) & 0xf8) == 0xf0)                                       \
        {                                                                     \
          (Len) = 4;                                                          \
          (Mask) = 0x07;                                                      \
        }                                                                     \
      else if (((Char) & 0xfc) == 0xf8)                                       \
        {                                                                     \
          (Len) = 5;                                                          \
          (Mask) = 0x03;                                                      \
        }                                                                     \
      else if (((Char) & 0xfe) == 0xfc)                                       \
        {                                                                     \
          (Len) = 6;                                                          \
          (Mask) = 0x01;                                                      \
        }                                                                     \
      else                                                                    \
        (Len) = -1;                                                           \
    }                                                                         \
  while (0)

/*
 * UTF8_LENGTH:
 * @Char: a character code
 *
 * Evaluates to the number of bytes (1 to 6) used to encode @Char in
 * (extended) UTF-8.  @Char is evaluated several times, so it must not
 * have side effects.
 */
#define UTF8_LENGTH(Char)          \
  ((Char) < 0x80 ? 1 :             \
   (Char) < 0x800 ? 2 :            \
   (Char) < 0x10000 ? 3 :          \
   (Char) < 0x200000 ? 4 :         \
   (Char) < 0x4000000 ? 5 : 6)

83

84

85

/*
 * UTF8_GET:
 * @Result: lvalue that receives the decoded character, or -1 if one of
 *   the trailing bytes is not a continuation byte (10xxxxxx)
 * @Chars: pointer to the first byte of the sequence
 * @Count: integer lvalue used as the loop index; on exit it holds the
 *   index of the offending byte, or @Len when the loop ran to completion
 * @Mask: payload mask for the first byte
 * @Len: number of bytes in the sequence
 *
 * Decodes @Len bytes starting at @Chars.  No overlong or range checks
 * are made.
 *
 * The body is wrapped in do { } while (0) so that the assignment and the
 * loop form one statement; without the wrapper only the first assignment
 * would be controlled by an enclosing unbraced if.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)                             \
  do                                                                          \
    {                                                                         \
      (Result) = (Chars)[0] & (Mask);                                         \
      for ((Count) = 1; (Count) < (Len); ++(Count))                           \
        {                                                                     \
          if (((Chars)[(Count)] & 0xc0) != 0x80)                              \
            {                                                                 \
              (Result) = -1;                                                  \
              break;                                                          \
            }                                                                 \
          (Result) <<= 6;                                                     \
          (Result) |= ((Chars)[(Count)] & 0x3f);                              \
        }                                                                     \
    }                                                                         \
  while (0)

/*
 * Check whether a Unicode (5.2) char is in a valid range.
 *
 * The first check comes from the Unicode guarantee to never encode
 * a point above 0x0010ffff, since UTF-16 couldn't represent it.
 *
 * The second check covers surrogate pairs (category Cs): masking off the
 * low 11 bits leaves 0xD800 exactly for the range 0xD800..0xDFFF.
 *
 * @param Char the character (evaluated up to twice)
 */
#define UNICODE_VALID(Char)                   \
  ((Char) < 0x110000 &&                       \
   (((Char) & 0xFFFFF800) != 0xD800))

111

112

113

/* Sequence length implied by each possible first byte.  Bytes that cannot
 * start a sequence (continuation bytes 0x80-0xBF, and 0xFE/0xFF) map to 1
 * so that stepping through a string always makes progress. */
static const gchar utf8_skip_data[256] = {
  /* 0x00 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0x10 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0x20 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0x30 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0x40 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0x50 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0x60 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0x70 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0x80 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0x90 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0xA0 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0xB0 */ 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
  /* 0xC0 */ 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
  /* 0xD0 */ 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
  /* 0xE0 */ 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3,
  /* 0xF0 */ 4,4,4,4, 4,4,4,4, 5,5,5,5, 6,6,1,1
};

/* Public view of the skip table. */
const gchar * const g_utf8_skip = utf8_skip_data;

125

126

/**

127

* g_utf8_find_prev_char:

128

* @str: pointer to the beginning of a UTF-8 encoded string

129

* @p: pointer to some position within @str

130

*

131

* Given a position @p with a UTF-8 encoded string @str, find the start

132

* of the previous UTF-8 character starting before @p. Returns %NULL if no

133

* UTF-8 characters are present in @str before @p.

134

*

135

* @p does not have to be at the beginning of a UTF-8 character. No check

136

* is made to see if the character found is actually valid other than

137

* it starts with an appropriate byte.

138

*

139

* Returns: a pointer to the found character or %NULL.

140

*/

141

gchar *

142

g_utf8_find_prev_char (const char *str,

143

const char *p)

144

{

145

for (--p; p >= str; --p)

146

{

147

if ((*p & 0xc0) != 0x80)

return (gchar *)p;

}

return NULL;

}

/**

* g_utf8_find_next_char:

155

* @p: a pointer to a position within a UTF-8 encoded string

156

* @end: (nullable): a pointer to the byte following the end of the string,

157

* or %NULL to indicate that the string is nul-terminated

158

*

159

* Finds the start of the next UTF-8 character in the string after @p.

160

*

161

* @p does not have to be at the beginning of a UTF-8 character. No check

162

* is made to see if the character found is actually valid other than

163

* it starts with an appropriate byte.

164

*

165

* Returns: a pointer to the found character or %NULL

166

*/

167

gchar *

168

g_utf8_find_next_char (const gchar *p,

const gchar *end)

{

if (*p)

{

if (end)

for (++p; p < end && (*p & 0xc0) == 0x80; ++p)

175

;

176

else

177

for (++p; (*p & 0xc0) == 0x80; ++p)

178

;

179

}

180

return (p == end) ? NULL : (gchar *)p;

}

/**

* g_utf8_prev_char:

185

* @p: a pointer to a position within a UTF-8 encoded string

186

*

187

* Finds the previous UTF-8 character in the string before @p.

188

*

189

* @p does not have to be at the beginning of a UTF-8 character. No check

190

* is made to see if the character found is actually valid other than

191

* it starts with an appropriate byte. If @p might be the first

192

* character of the string, you must use g_utf8_find_prev_char() instead.

193

*

194

* Returns: a pointer to the found character

195

*/

196

gchar *

197

g_utf8_prev_char (const gchar *p)

{

while (TRUE)

{

p--;

if ((*p & 0xc0) != 0x80)

return (gchar *)p;

}

}

/**

* g_utf8_strlen:

* @p: pointer to the start of a UTF-8 encoded string

210

* @max: the maximum number of bytes to examine. If @max

211

* is less than 0, then the string is assumed to be

212

* nul-terminated. If @max is 0, @p will not be examined and

213

* may be %NULL. If @max is greater than 0, up to @max

214

* bytes are examined

215

*

216

* Computes the length of the string in characters, not including

217

* the terminating nul character. If the @max'th byte falls in the

218

* middle of a character, the last (partial) character is not counted.

219

*

220

* Returns: the length of the string in characters

221

*/

222

glong

223

g_utf8_strlen (const gchar *p,

gssize max)

{

glong len = 0;

const gchar *start = p;

228

g_return_val_if_fail (p != NULL || max == 0, 0);

if (max < 0)

{

while (*p)

{

p = g_utf8_next_char (p);

++len;

}

}

else

{

if (max == 0 || !*p)

241

return 0;

242

243

p = g_utf8_next_char (p);

244

245

while (p - start < max && *p)

246

{

247

++len;

248

p = g_utf8_next_char (p);

249

}

250

251

/* only do the last len increment if we got a complete

252

* char (don't count partial chars)

253

*/

254

if (p - start <= max)

++len;

}

return len;

}

/**
 * g_utf8_substring:
 * @str: a UTF-8 encoded string
 * @start_pos: a character offset within @str
 * @end_pos: another character offset within @str
 *
 * Copies a substring out of a UTF-8 encoded string.
 * The substring will contain @end_pos - @start_pos characters.
 *
 * No validation of the offsets is performed here: both must lie within
 * @str, and @end_pos is expected to be no smaller than @start_pos.
 *
 * Returns: a newly allocated copy of the requested
 *     substring. Free with g_free() when no longer needed.
 *
 * Since: 2.30
 */
gchar *
g_utf8_substring (const gchar *str,
                  glong        start_pos,
                  glong        end_pos)
{
  gchar *first, *past_last, *copy;

  first = g_utf8_offset_to_pointer (str, start_pos);
  /* Resume from @first rather than re-walking from the head of @str. */
  past_last = g_utf8_offset_to_pointer (first, end_pos - start_pos);

  copy = g_malloc (past_last - first + 1);
  memcpy (copy, first, past_last - first);
  copy[past_last - first] = 0;

  return copy;
}

/**

* g_utf8_get_char:

* @p: a pointer to Unicode character encoded as UTF-8

295

*

296

* Converts a sequence of bytes encoded as UTF-8 to a Unicode character.

297

*

298

* If @p does not point to a valid UTF-8 encoded character, results

299

* are undefined. If you are not sure that the bytes are complete

300

* valid Unicode characters, you should use g_utf8_get_char_validated()

301

* instead.

302

*

303

* Returns: the resulting character

304

*/

305

gunichar

306

g_utf8_get_char (const gchar *p)

307

{

308

int i, mask = 0, len;

309

gunichar result;

310

unsigned char c = (unsigned char) *p;

311

312

UTF8_COMPUTE (c, mask, len);

313

if (len == -1)

314

return (gunichar)-1;

315

UTF8_GET (result, p, i, mask, len);

return result;

}

/**

* g_utf8_offset_to_pointer:

322

* @str: a UTF-8 encoded string

323

* @offset: a character offset within @str

324

*

325

* Converts from an integer character offset to a pointer to a position

326

* within the string.

327

*

328

* Since 2.10, this function allows to pass a negative @offset to

329

* step backwards. It is usually worth stepping backwards from the end

330

* instead of forwards if @offset is in the last fourth of the string,

331

* since moving forward is about 3 times faster than moving backward.

332

*

333

* Note that this function doesn't abort when reaching the end of @str.

334

* Therefore you should be sure that @offset is within string boundaries

335

* before calling that function. Call g_utf8_strlen() when unsure.

336

* This limitation exists as this function is called frequently during

337

* text rendering and therefore has to be as fast as possible.

338

*

339

* Returns: the resulting pointer

340

*/

341

gchar *

342

g_utf8_offset_to_pointer (const gchar *str,

343

glong offset)

344

{

345

const gchar *s = str;

if (offset > 0)

while (offset--)

s = g_utf8_next_char (s);

else

{

const char *s1;

/* This nice technique for fast backwards stepping

355

* through a UTF-8 string was dubbed "stutter stepping"

356

* by its inventor, Larry Ewing.

*/

while (offset)

{

s1 = s;

s += offset;

while ((*s & 0xc0) == 0x80)

363

s--;

364

365

offset += g_utf8_pointer_to_offset (s, s1);

}

}

return (gchar *)s;

}

/**

* g_utf8_pointer_to_offset:

374

* @str: a UTF-8 encoded string

375

* @pos: a pointer to a position within @str

376

*

377

* Converts from a pointer to position within a string to a integer

378

* character offset.

379

*

380

* Since 2.10, this function allows @pos to be before @str, and returns

381

* a negative offset in this case.

382

*

383

* Returns: the resulting character offset

384

*/

385

glong

386

g_utf8_pointer_to_offset (const gchar *str,

387

const gchar *pos)

388

{

389

const gchar *s = str;

glong offset = 0;

if (pos < str)

offset = - g_utf8_pointer_to_offset (pos, str);

else

while (s < pos)

{

s = g_utf8_next_char (s);

offset++;

}

return offset;

}

/**
 * g_utf8_strncpy:
 * @dest: buffer to fill with characters from @src
 * @src: UTF-8 encoded string
 * @n: character count
 *
 * Like the standard C strncpy() function, but copies a given number
 * of characters instead of a given number of bytes. The @src string
 * must be valid UTF-8 encoded text. (Use g_utf8_validate() on all
 * text before trying to use UTF-8 utility functions with it.)
 *
 * A terminating nul byte is always written after the copied bytes, so
 * @dest must have room for the copied characters plus one byte.
 *
 * Returns: @dest
 */
gchar *
g_utf8_strncpy (gchar       *dest,
                const gchar *src,
                gsize        n)
{
  const gchar *stop;
  gsize n_bytes;

  /* Find the byte position just past the @n-th character (or the nul). */
  for (stop = src; n != 0 && *stop != '\0'; n--)
    stop = g_utf8_next_char (stop);

  n_bytes = stop - src;
  strncpy (dest, src, n_bytes);
  dest[n_bytes] = 0;

  return dest;
}

/* unicode_strchr */

435

436

/**

437

* g_unichar_to_utf8:

438

* @c: a Unicode character code

439

* @outbuf: (out caller-allocates) (optional): output buffer, must have at

440

* least 6 bytes of space. If %NULL, the length will be computed and

441

* returned and nothing will be written to @outbuf.

442

*

443

* Converts a single character to UTF-8.

444

*

445

* Returns: number of bytes written

446

*/

447

int

448

g_unichar_to_utf8 (gunichar c,

449

gchar *outbuf)

450

{

451

/* If this gets modified, also update the copy in g_string_insert_unichar() */

guint len = 0;

int first;

int i;

if (c < 0x80)

{

first = 0;

len = 1;

}

else if (c < 0x800)

{

first = 0xc0;

len = 2;

}

else if (c < 0x10000)

{

first = 0xe0;

len = 3;

}

else if (c < 0x200000)

{

first = 0xf0;

len = 4;

}

else if (c < 0x4000000)

{

first = 0xf8;

len = 5;

}

else

{

first = 0xfc;

len = 6;

}

if (outbuf)

{

for (i = len - 1; i > 0; --i)

490

{

491

outbuf[i] = (c & 0x3f) | 0x80;

492

c >>= 6;

493

}

494

outbuf[0] = c | first;

}

return len;

}

/**

* g_utf8_strchr:

* @p: a nul-terminated UTF-8 encoded string

503

* @len: the maximum length of @p

504

* @c: a Unicode character

505

*

506

* Finds the leftmost occurrence of the given Unicode character

507

* in a UTF-8 encoded string, while limiting the search to @len bytes.

508

* If @len is -1, allow unbounded search.

509

*

510

* Returns: %NULL if the string does not contain the character,

511

* otherwise, a pointer to the start of the leftmost occurrence

512

* of the character in the string.

513

*/

514

gchar *

515

g_utf8_strchr (const char *p,

gssize len,

gunichar c)

{

gchar ch[10];

gint charlen = g_unichar_to_utf8 (c, ch);

522

ch[charlen] = '\0';

523

524

return g_strstr_len (p, len, ch);

}

/**

* g_utf8_strrchr:

* @p: a nul-terminated UTF-8 encoded string

531

* @len: the maximum length of @p

532

* @c: a Unicode character

533

*

534

* Find the rightmost occurrence of the given Unicode character

535

* in a UTF-8 encoded string, while limiting the search to @len bytes.

536

* If @len is -1, allow unbounded search.

537

*

538

* Returns: %NULL if the string does not contain the character,

539

* otherwise, a pointer to the start of the rightmost occurrence

540

* of the character in the string.

541

*/

542

gchar *

543

g_utf8_strrchr (const char *p,

gssize len,

gunichar c)

{

gchar ch[10];

gint charlen = g_unichar_to_utf8 (c, ch);

550

ch[charlen] = '\0';

551

552

return g_strrstr_len (p, len, ch);

}

/* Like g_utf8_get_char, but take a maximum length

557

* and return (gunichar)-2 on incomplete trailing character;

558

* also check for malformed or overlong sequences

559

* and return (gunichar)-1 in this case.

560

*/

561

static inline gunichar

562

g_utf8_get_char_extended (const gchar *p,

gssize max_len)

{

guint i, len;

gunichar min_code;

gunichar wc = (guchar) *p;

if (wc < 0x80)

{

return wc;

}

else if (G_UNLIKELY (wc < 0xc0))

574

{

575

return (gunichar)-1;

576

}

577

else if (wc < 0xe0)

{

len = 2;

wc &= 0x1f;

min_code = 1 << 7;

}

else if (wc < 0xf0)

{

len = 3;

wc &= 0x0f;

min_code = 1 << 11;

588

}

589

else if (wc < 0xf8)

{

len = 4;

wc &= 0x07;

min_code = 1 << 16;

594

}

595

else if (wc < 0xfc)

{

len = 5;

wc &= 0x03;

min_code = 1 << 21;

600

}

601

else if (wc < 0xfe)

{

len = 6;

wc &= 0x01;

min_code = 1 << 26;

}

else

{

return (gunichar)-1;

610

}

611

612

if (G_UNLIKELY (max_len >= 0 && len > max_len))

613

{

614

for (i = 1; i < max_len; i++)

615

{

616

if ((((guchar *)p)[i] & 0xc0) != 0x80)

617

return (gunichar)-1;

618

}

619

return (gunichar)-2;

620

}

621

622

for (i = 1; i < len; ++i)

623

{

624

gunichar ch = ((guchar *)p)[i];

625

626

if (G_UNLIKELY ((ch & 0xc0) != 0x80))

627

{

628

if (ch)

629

return (gunichar)-1;

630

else

631

return (gunichar)-2;

}

wc <<= 6;

wc |= (ch & 0x3f);

}

if (G_UNLIKELY (wc < min_code))

639

return (gunichar)-1;

return wc;

}

/**

* g_utf8_get_char_validated:

646

* @p: a pointer to Unicode character encoded as UTF-8

647

* @max_len: the maximum number of bytes to read, or -1, for no maximum or

648

* if @p is nul-terminated

649

*

650

* Convert a sequence of bytes encoded as UTF-8 to a Unicode character.

651

* This function checks for incomplete characters, for invalid characters

652

* such as characters that are out of the range of Unicode, and for

653

* overlong encodings of valid characters.

654

*

655

* Returns: the resulting character. If @p points to a partial

656

* sequence at the end of a string that could begin a valid

657

* character (or if @max_len is zero), returns (gunichar)-2;

658

* otherwise, if @p does not point to a valid UTF-8 encoded

659

* Unicode character, returns (gunichar)-1.

660

*/

661

gunichar

662

g_utf8_get_char_validated (const gchar *p,

gssize max_len)

{

gunichar result;

if (max_len == 0)

return (gunichar)-2;

669

670

result = g_utf8_get_char_extended (p, max_len);

671

672

if (result & 0x80000000)

673

return result;

674

else if (!UNICODE_VALID (result))

675

return (gunichar)-1;

else

return result;

}

/* Reads the byte at @p, advances @p by one, and evaluates to the low six
 * (payload) bits of that byte.  No check is made that the byte really is
 * a continuation byte.  @p must be a modifiable pointer lvalue; it is
 * parenthesized so that expressions such as *pp advance the intended
 * pointer. */
#define CONT_BYTE_FAST(p) ((guchar) *(p)++ & 0x3f)

681

682

/**

683

* g_utf8_to_ucs4_fast:

684

* @str: a UTF-8 encoded string

685

* @len: the maximum length of @str to use, in bytes. If @len < 0,

686

* then the string is nul-terminated.

687

* @items_written: (out caller-allocates) (optional): location to store the

688

* number of characters in the result, or %NULL.

689

*

690

* Convert a string from UTF-8 to a 32-bit fixed width

691

* representation as UCS-4, assuming valid UTF-8 input.

692

* This function is roughly twice as fast as g_utf8_to_ucs4()

693

* but does no error checking on the input. A trailing 0 character

694

* will be added to the string after the converted text.

695

*

696

* Returns: a pointer to a newly allocated UCS-4 string.

697

* This value must be freed with g_free().

698

*/

699

gunichar *

700

g_utf8_to_ucs4_fast (const gchar *str,

701

glong len,

702

glong *items_written)

{

gunichar *result;

gint n_chars, i;

const gchar *p;

g_return_val_if_fail (str != NULL, NULL);

p = str;

n_chars = 0;

if (len < 0)

{

while (*p)

{

p = g_utf8_next_char (p);

++n_chars;

}

}

else

{

while (p < str + len && *p)

723

{

724

p = g_utf8_next_char (p);

++n_chars;

}

}

result = g_new (gunichar, n_chars + 1);

730

731

p = str;

732

for (i=0; i < n_chars; i++)

733

{

734

guchar first = (guchar)*p++;

gunichar wc;

if (first < 0xc0)

{

/* We really hope first < 0x80, but we don't want to test an

740

* extra branch for invalid input, which this function

741

* does not care about. Handling unexpected continuation bytes

742

* here will do the least damage. */

wc = first;

}

else

{

gunichar c1 = CONT_BYTE_FAST(p);

748

if (first < 0xe0)

749

{

750

wc = ((first & 0x1f) << 6) | c1;

}

else

{

gunichar c2 = CONT_BYTE_FAST(p);

755

if (first < 0xf0)

756

{

757

wc = ((first & 0x0f) << 12) | (c1 << 6) | c2;

}

else

{

gunichar c3 = CONT_BYTE_FAST(p);

762

wc = ((first & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3;

763

if (G_UNLIKELY (first >= 0xf8))

764

{

765

/* This can't be valid UTF-8, but g_utf8_next_char()

766

* and company allow out-of-range sequences */

767

gunichar mask = 1 << 20;

768

while ((wc & mask) != 0)

769

{

770

wc <<= 6;

771

wc |= CONT_BYTE_FAST(p);

mask <<= 5;

}

wc &= mask - 1;

}

}

}

}

result[i] = wc;

}

result[i] = 0;

if (items_written)

*items_written = i;

return result;

}

/* Allocates n_blocks * n_block_bytes bytes with g_try_malloc_n().  When
 * that returns NULL, sets @error to G_CONVERT_ERROR_NO_MEMORY and returns
 * NULL; otherwise returns the allocation. */
static gpointer
try_malloc_n (gsize n_blocks, gsize n_block_bytes, GError **error)
{
  gpointer mem = g_try_malloc_n (n_blocks, n_block_bytes);

  if (mem != NULL)
    return mem;

  g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_MEMORY,
                       _("Failed to allocate memory"));
  return NULL;
}

/**

* g_utf8_to_ucs4:

* @str: a UTF-8 encoded string

802

* @len: the maximum length of @str to use, in bytes. If @len < 0,

803

* then the string is nul-terminated.

804

* @items_read: (out caller-allocates) (optional): location to store number of

805

* bytes read, or %NULL.

806

* If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be

807

* returned in case @str contains a trailing partial

808

* character. If an error occurs then the index of the

809

* invalid input is stored here.

810

* @items_written: (out caller-allocates) (optional): location to store number

811

* of characters written or %NULL. The value here stored does not include

812

* the trailing 0 character.

813

* @error: location to store the error occurring, or %NULL to ignore

814

* errors. Any of the errors in #GConvertError other than

815

* %G_CONVERT_ERROR_NO_CONVERSION may occur.

816

*

817

* Convert a string from UTF-8 to a 32-bit fixed width

818

* representation as UCS-4. A trailing 0 character will be added to the

819

* string after the converted text.

820

*

821

* Returns: a pointer to a newly allocated UCS-4 string.

822

* This value must be freed with g_free(). If an error occurs,

823

* %NULL will be returned and @error set.

824

*/

825

gunichar *

826

g_utf8_to_ucs4 (const gchar *str,

827

glong len,

828

glong *items_read,

829

glong *items_written,

830

GError **error)

831

{

832

gunichar *result = NULL;

gint n_chars, i;

const gchar *in;

in = str;

n_chars = 0;

while ((len < 0 || str + len - in > 0) && *in)

839

{

840

gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in);

841

if (wc & 0x80000000)

842

{

843

if (wc == (gunichar)-2)

{

if (items_read)

break;

else

g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,

849

_("Partial character sequence at end of input"));

850

}

851

else

852

g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

853

_("Invalid byte sequence in conversion input"));

goto err_out;

}

n_chars++;

in = g_utf8_next_char (in);

861

}

862

863

result = try_malloc_n (n_chars + 1, sizeof (gunichar), error);

864

if (result == NULL)

goto err_out;

in = str;

for (i=0; i < n_chars; i++)

869

{

870

result[i] = g_utf8_get_char (in);

871

in = g_utf8_next_char (in);

}

result[i] = 0;

if (items_written)

*items_written = n_chars;

err_out:

if (items_read)

*items_read = in - str;

return result;

}

/**

* g_ucs4_to_utf8:

* @str: a UCS-4 encoded string

888

* @len: the maximum length (number of characters) of @str to use.

889

* If @len < 0, then the string is nul-terminated.

890

* @items_read: (out caller-allocates) (optional): location to store number of

891

* characters read, or %NULL.

892

* @items_written: (out caller-allocates) (optional): location to store number

893

* of bytes written or %NULL. The value here stored does not include the

894

* trailing 0 byte.

895

* @error: location to store the error occurring, or %NULL to ignore

896

* errors. Any of the errors in #GConvertError other than

897

* %G_CONVERT_ERROR_NO_CONVERSION may occur.

898

*

899

* Convert a string from a 32-bit fixed width representation as UCS-4.

900

* to UTF-8. The result will be terminated with a 0 byte.

901

*

902

* Returns: a pointer to a newly allocated UTF-8 string.

903

* This value must be freed with g_free(). If an error occurs,

904

* %NULL will be returned and @error set. In that case, @items_read

905

* will be set to the position of the first invalid input character.

906

*/

907

gchar *

908

g_ucs4_to_utf8 (const gunichar *str,

909

glong len,

910

glong *items_read,

911

glong *items_written,

912

GError **error)

913

{

914

gint result_length;

915

gchar *result = NULL;

gchar *p;

gint i;

result_length = 0;

for (i = 0; len < 0 || i < len ; i++)

{

if (!str[i])

break;

if (str[i] >= 0x80000000)

926

{

927

g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

928

_("Character out of range for UTF-8"));

goto err_out;

}

result_length += UTF8_LENGTH (str[i]);

933

}

934

935

result = try_malloc_n (result_length + 1, 1, error);

936

if (result == NULL)

goto err_out;

p = result;

i = 0;

while (p < result + result_length)

943

p += g_unichar_to_utf8 (str[i++], p);

*p = '\0';

if (items_written)

*items_written = p - result;

err_out:

if (items_read)

*items_read = i;

return result;

}

/* Combines a UTF-16 high surrogate @h (0xd800..0xdbff) and low surrogate
 * @l (0xdc00..0xdfff) into the code point they encode: the high surrogate
 * contributes the upper ten bits, the low surrogate the lower ten, and
 * the result is offset by 0x10000. */
#define SURROGATE_VALUE(h,l) (0x10000 + ((h) - 0xd800) * 0x400 + ((l) - 0xdc00))

/**

* g_utf16_to_utf8:

* @str: a UTF-16 encoded string

962

* @len: the maximum length (number of #gunichar2) of @str to use.

963

* If @len < 0, then the string is nul-terminated.

964

* @items_read: (out caller-allocates) (optional): location to store number of

965

* words read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will

966

* be returned in case @str contains a trailing partial character. If

967

* an error occurs then the index of the invalid input is stored here.

968

* @items_written: (out caller-allocates) (optional): location to store number

969

* of bytes written, or %NULL. The value stored here does not include the

970

* trailing 0 byte.

971

* @error: location to store the error occurring, or %NULL to ignore

972

* errors. Any of the errors in #GConvertError other than

973

* %G_CONVERT_ERROR_NO_CONVERSION may occur.

974

*

975

* Convert a string from UTF-16 to UTF-8. The result will be

976

* terminated with a 0 byte.

977

*

978

* Note that the input is expected to be already in native endianness,

979

* an initial byte-order-mark character is not handled specially.

980

* g_convert() can be used to convert a byte buffer of UTF-16 data of

981

* ambiguous endianess.

982

*

983

* Further note that this function does not validate the result

984

* string; it may e.g. include embedded NUL characters. The only

985

* validation done by this function is to ensure that the input can

986

* be correctly interpreted as UTF-16, i.e. it doesn't contain

987

* things unpaired surrogates.

988

*

989

* Returns: a pointer to a newly allocated UTF-8 string.

990

* This value must be freed with g_free(). If an error occurs,

991

* %NULL will be returned and @error set.

992

**/

993

gchar *

994

g_utf16_to_utf8 (const gunichar2 *str,

995

glong len,

996

glong *items_read,

997

glong *items_written,

998

GError **error)

999

{

1000

/* This function and g_utf16_to_ucs4 are almost exactly identical -

1001

* The lines that differ are marked.

1002

*/

1003

const gunichar2 *in;

1004

gchar *out;

1005

gchar *result = NULL;

1006

gint n_bytes;

1007

gunichar high_surrogate;

1008

1009

g_return_val_if_fail (str != NULL, NULL);

n_bytes = 0;

in = str;

high_surrogate = 0;

1014

while ((len < 0 || in - str < len) && *in)

{

gunichar2 c = *in;

gunichar wc;

if (c >= 0xdc00 && c < 0xe000) /* low surrogate */

1020

{

1021

if (high_surrogate)

1022

{

1023

wc = SURROGATE_VALUE (high_surrogate, c);

1024

high_surrogate = 0;

}

else

{

g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

1029

_("Invalid sequence in conversion input"));

goto err_out;

}

}

else

{

if (high_surrogate)

1036

{

1037

g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

1038

_("Invalid sequence in conversion input"));

goto err_out;

}

if (c >= 0xd800 && c < 0xdc00) /* high surrogate */

1043

{

1044

high_surrogate = c;

goto next1;

}

else

wc = c;

}

/********** DIFFERENT for UTF8/UCS4 **********/

1052

n_bytes += UTF8_LENGTH (wc);

next1:

in++;

}

if (high_surrogate && !items_read)

1059

{

1060

g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,

1061

_("Partial character sequence at end of input"));

goto err_out;

}

/* At this point, everything is valid, and we just need to convert

1066

*/

1067

/********** DIFFERENT for UTF8/UCS4 **********/

1068

result = try_malloc_n (n_bytes + 1, 1, error);

1069

if (result == NULL)

1070

goto err_out;

1071

1072

high_surrogate = 0;

1073

out = result;

1074

in = str;

1075

while (out < result + n_bytes)

{

gunichar2 c = *in;

gunichar wc;

if (c >= 0xdc00 && c < 0xe000) /* low surrogate */

1081

{

1082

wc = SURROGATE_VALUE (high_surrogate, c);

1083

high_surrogate = 0;

1084

}

1085

else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */

1086

{

1087

high_surrogate = c;

goto next2;

}

else

wc = c;

/********** DIFFERENT for UTF8/UCS4 **********/

1094

out += g_unichar_to_utf8 (wc, out);

next2:

in++;

}

/********** DIFFERENT for UTF8/UCS4 **********/

*out = '\0';

if (items_written)

/********** DIFFERENT for UTF8/UCS4 **********/

1105

*items_written = out - result;

err_out:

if (items_read)

*items_read = in - str;

return result;

}

/**

* g_utf16_to_ucs4:

* @str: a UTF-16 encoded string

1117

* @len: the maximum length (number of #gunichar2) of @str to use.

1118

* If @len < 0, then the string is nul-terminated.

1119

* @items_read: (out caller-allocates) (optional): location to store number of

1120

* words read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will

1121

* be returned in case @str contains a trailing partial character. If

1122

* an error occurs then the index of the invalid input is stored here.

1123

* @items_written: (out caller-allocates) (optional): location to store number

1124

* of characters written, or %NULL. The value stored here does not include

1125

* the trailing 0 character.

1126

* @error: location to store the error occurring, or %NULL to ignore

1127

* errors. Any of the errors in #GConvertError other than

1128

* %G_CONVERT_ERROR_NO_CONVERSION may occur.

1129

*

1130

* Convert a string from UTF-16 to UCS-4. The result will be

1131

* nul-terminated.

1132

*

1133

* Returns: a pointer to a newly allocated UCS-4 string.

1134

* This value must be freed with g_free(). If an error occurs,

1135

* %NULL will be returned and @error set.

1136

*/

1137

gunichar *

1138

g_utf16_to_ucs4 (const gunichar2 *str,

1139

glong len,

1140

glong *items_read,

1141

glong *items_written,

1142

GError **error)

1143

{

1144

const gunichar2 *in;

1145

gchar *out;

1146

gchar *result = NULL;

1147

gint n_bytes;

1148

gunichar high_surrogate;

1149

1150

g_return_val_if_fail (str != NULL, NULL);

n_bytes = 0;

in = str;

high_surrogate = 0;

1155

while ((len < 0 || in - str < len) && *in)

{

gunichar2 c = *in;

if (c >= 0xdc00 && c < 0xe000) /* low surrogate */

1160

{

1161

if (high_surrogate)

1162

{

1163

high_surrogate = 0;

}

else

{

g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

1168

_("Invalid sequence in conversion input"));

goto err_out;

}

}

else

{

if (high_surrogate)

1175

{

1176

g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

1177

_("Invalid sequence in conversion input"));

goto err_out;

}

if (c >= 0xd800 && c < 0xdc00) /* high surrogate */

1182

{

1183

high_surrogate = c;

goto next1;

}

}

/********** DIFFERENT for UTF8/UCS4 **********/

1189

n_bytes += sizeof (gunichar);

next1:

in++;

}

if (high_surrogate && !items_read)

1196

{

1197

g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,

1198

_("Partial character sequence at end of input"));

goto err_out;

}

/* At this point, everything is valid, and we just need to convert

1203

*/

1204

/********** DIFFERENT for UTF8/UCS4 **********/

1205

result = try_malloc_n (n_bytes + 4, 1, error);

1206

if (result == NULL)

1207

goto err_out;

1208

1209

high_surrogate = 0;

1210

out = result;

1211

in = str;

1212

while (out < result + n_bytes)

{

gunichar2 c = *in;

gunichar wc;

if (c >= 0xdc00 && c < 0xe000) /* low surrogate */

1218

{

1219

wc = SURROGATE_VALUE (high_surrogate, c);

1220

high_surrogate = 0;

1221

}

1222

else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */

1223

{

1224

high_surrogate = c;

goto next2;

}

else

wc = c;

/********** DIFFERENT for UTF8/UCS4 **********/

1231

*(gunichar *)out = wc;

1232

out += sizeof (gunichar);

next2:

in++;

}

/********** DIFFERENT for UTF8/UCS4 **********/

1239

*(gunichar *)out = 0;

1240

1241

if (items_written)

1242

/********** DIFFERENT for UTF8/UCS4 **********/

1243

*items_written = (out - result) / sizeof (gunichar);

err_out:

if (items_read)

*items_read = in - str;

1248

1249

return (gunichar *)result;

}

/**

* g_utf8_to_utf16:

* @str: a UTF-8 encoded string

1255

* @len: the maximum length (number of bytes) of @str to use.

1256

* If @len < 0, then the string is nul-terminated.

1257

* @items_read: (out caller-allocates) (optional): location to store number of

1258

* bytes read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will

1259

* be returned in case @str contains a trailing partial character. If

1260

* an error occurs then the index of the invalid input is stored here.

1261

* @items_written: (out caller-allocates) (optional): location to store number

1262

* of #gunichar2 written, or %NULL. The value stored here does not include

1263

* the trailing 0.

1264

* @error: location to store the error occurring, or %NULL to ignore

1265

* errors. Any of the errors in #GConvertError other than

1266

* %G_CONVERT_ERROR_NO_CONVERSION may occur.

1267

*

1268

* Convert a string from UTF-8 to UTF-16. A 0 character will be

1269

* added to the result after the converted text.

1270

*

1271

* Returns: a pointer to a newly allocated UTF-16 string.

1272

* This value must be freed with g_free(). If an error occurs,

1273

* %NULL will be returned and @error set.

1274

*/

1275

gunichar2 *

1276

g_utf8_to_utf16 (const gchar *str,

1277

glong len,

1278

glong *items_read,

1279

glong *items_written,

1280

GError **error)

1281

{

1282

gunichar2 *result = NULL;

gint n16;

const gchar *in;

gint i;

g_return_val_if_fail (str != NULL, NULL);

in = str;

n16 = 0;

while ((len < 0 || str + len - in > 0) && *in)

1292

{

1293

gunichar wc = g_utf8_get_char_extended (in, len < 0 ? 6 : str + len - in);

1294

if (wc & 0x80000000)

1295

{

1296

if (wc == (gunichar)-2)

{

if (items_read)

break;

else

g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,

1302

_("Partial character sequence at end of input"));

1303

}

1304

else

1305

g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

1306

_("Invalid byte sequence in conversion input"));

goto err_out;

}

if (wc < 0xd800)

n16 += 1;

else if (wc < 0xe000)

1314

{

1315

g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

1316

_("Invalid sequence in conversion input"));

goto err_out;

}

else if (wc < 0x10000)

1321

n16 += 1;

1322

else if (wc < 0x110000)

n16 += 2;

else

{

g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

1327

_("Character out of range for UTF-16"));

goto err_out;

}

in = g_utf8_next_char (in);

1333

}

1334

1335

result = try_malloc_n (n16 + 1, sizeof (gunichar2), error);

1336

if (result == NULL)

goto err_out;

in = str;

for (i = 0; i < n16;)

1341

{

1342

gunichar wc = g_utf8_get_char (in);

if (wc < 0x10000)

{

result[i++] = wc;

}

else

{

result[i++] = (wc - 0x10000) / 0x400 + 0xd800;

1351

result[i++] = (wc - 0x10000) % 0x400 + 0xdc00;

1352

}

1353

1354

in = g_utf8_next_char (in);

}

result[i] = 0;

if (items_written)

*items_written = n16;

err_out:

if (items_read)

*items_read = in - str;

return result;

}

/**

* g_ucs4_to_utf16:

* @str: a UCS-4 encoded string

1372

* @len: the maximum length (number of characters) of @str to use.

1373

* If @len < 0, then the string is nul-terminated.

1374

* @items_read: (out caller-allocates) (optional): location to store number of

1375

* bytes read, or %NULL. If an error occurs then the index of the invalid

1376

* input is stored here.

1377

* @items_written: (out caller-allocates) (optional): location to store number

1378

* of #gunichar2 written, or %NULL. The value stored here does not include

1379

* the trailing 0.

1380

* @error: location to store the error occurring, or %NULL to ignore

1381

* errors. Any of the errors in #GConvertError other than

1382

* %G_CONVERT_ERROR_NO_CONVERSION may occur.

1383

*

1384

* Convert a string from UCS-4 to UTF-16. A 0 character will be

1385

* added to the result after the converted text.

1386

*

1387

* Returns: a pointer to a newly allocated UTF-16 string.

1388

* This value must be freed with g_free(). If an error occurs,

1389

* %NULL will be returned and @error set.

1390

*/

1391

gunichar2 *

1392

g_ucs4_to_utf16 (const gunichar *str,

1393

glong len,

1394

glong *items_read,

1395

glong *items_written,

1396

GError **error)

1397

{

1398

gunichar2 *result = NULL;

gint n16;

gint i, j;

n16 = 0;

i = 0;

while ((len < 0 || i < len) && str[i])

1405

{

1406

gunichar wc = str[i];

if (wc < 0xd800)

n16 += 1;

else if (wc < 0xe000)

1411

{

1412

g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

1413

_("Invalid sequence in conversion input"));

goto err_out;

}

else if (wc < 0x10000)

1418

n16 += 1;

1419

else if (wc < 0x110000)

n16 += 2;

else

{

g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

1424

_("Character out of range for UTF-16"));

goto err_out;

}

i++;

}

result = try_malloc_n (n16 + 1, sizeof (gunichar2), error);

1433

if (result == NULL)

1434

goto err_out;

1435

1436

for (i = 0, j = 0; j < n16; i++)

1437

{

1438

gunichar wc = str[i];

if (wc < 0x10000)

{

result[j++] = wc;

}

else

{

result[j++] = (wc - 0x10000) / 0x400 + 0xd800;

1447

result[j++] = (wc - 0x10000) % 0x400 + 0xdc00;

}

}

result[j] = 0;

if (items_written)

*items_written = n16;

err_out:

if (items_read)

*items_read = i;

return result;

}

/* VALIDATE_BYTE:
 * Tests the byte under the cursor against @mask / @expect and bails out
 * when (byte & mask) != expect.
 *
 * The expansion relies on its surroundings: a byte pointer named `p` and
 * a label named `error` must both be in scope where the macro is used.
 */
#define VALIDATE_BYTE(mask, expect)                                     \
  G_STMT_START {                                                        \
    if (G_UNLIKELY ((*(const guchar *) p & (mask)) != (expect)))        \
      goto error;                                                       \
  } G_STMT_END

/* see IETF RFC 3629 Section 4 */

1469

1470

static const gchar *

1471

fast_validate (const char *str)

{

const gchar *p;

for (p = str; *p; p++)

1477

{

1478

if (*(guchar *)p < 128)

/* done */;

else

{

const gchar *last;

last = p;

if (*(guchar *)p < 0xe0) /* 110xxxxx */

1486

{

1487

if (G_UNLIKELY (*(guchar *)p < 0xc2))

goto error;

}

else

{

if (*(guchar *)p < 0xf0) /* 1110xxxx */

1493

{

1494

switch (*(guchar *)p++ & 0x0f)

1495

{

1496

case 0:

1497

VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */

1498

break;

1499

case 0x0d:

1500

VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */

1501

break;

1502

default:

1503

VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */

1504

}

1505

}

1506

else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */

1507

{

1508

switch (*(guchar *)p++ & 0x07)

1509

{

1510

case 0:

1511

VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */

1512

if (G_UNLIKELY((*(guchar *)p & 0x30) == 0))

goto error;

break;

case 4:

VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */

1517

break;

1518

default:

1519

VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */

1520

}

1521

p++;

1522

VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */

}

else

goto error;

}

p++;

VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */

continue;

error:

return last;

}

}

return p;

}

static const gchar *

1542

fast_validate_len (const char *str,

gssize max_len)

{

const gchar *p;

g_assert (max_len >= 0);

1549

1550

for (p = str; ((p - str) < max_len) && *p; p++)

1551

{

1552

if (*(guchar *)p < 128)

/* done */;

else

{

const gchar *last;

last = p;

if (*(guchar *)p < 0xe0) /* 110xxxxx */

1560

{

1561

if (G_UNLIKELY (max_len - (p - str) < 2))

1562

goto error;

1563

1564

if (G_UNLIKELY (*(guchar *)p < 0xc2))

goto error;

}

else

{

if (*(guchar *)p < 0xf0) /* 1110xxxx */

1570

{

1571

if (G_UNLIKELY (max_len - (p - str) < 3))

1572

goto error;

1573

1574

switch (*(guchar *)p++ & 0x0f)

1575

{

1576

case 0:

1577

VALIDATE_BYTE(0xe0, 0xa0); /* 0xa0 ... 0xbf */

1578

break;

1579

case 0x0d:

1580

VALIDATE_BYTE(0xe0, 0x80); /* 0x80 ... 0x9f */

1581

break;

1582

default:

1583

VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */

1584

}

1585

}

1586

else if (*(guchar *)p < 0xf5) /* 11110xxx excluding out-of-range */

1587

{

1588

if (G_UNLIKELY (max_len - (p - str) < 4))

1589

goto error;

1590

1591

switch (*(guchar *)p++ & 0x07)

1592

{

1593

case 0:

1594

VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */

1595

if (G_UNLIKELY((*(guchar *)p & 0x30) == 0))

goto error;

break;

case 4:

VALIDATE_BYTE(0xf0, 0x80); /* 0x80 ... 0x8f */

1600

break;

1601

default:

1602

VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */

1603

}

1604

p++;

1605

VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */

}

else

goto error;

}

p++;

VALIDATE_BYTE(0xc0, 0x80); /* 10xxxxxx */

continue;

error:

return last;

}

}

return p;

}

/**

* g_utf8_validate:

* @str: (array length=max_len) (element-type guint8): a pointer to character data

1627

* @max_len: max bytes to validate, or -1 to go until NUL

1628

* @end: (allow-none) (out) (transfer none): return location for end of valid data

1629

*

1630

* Validates UTF-8 encoded text. @str is the text to validate;

1631

* if @str is nul-terminated, then @max_len can be -1, otherwise

1632

* @max_len should be the number of bytes to validate.

1633

* If @end is non-%NULL, then the end of the valid range

1634

* will be stored there (i.e. the start of the first invalid

1635

* character if some bytes were invalid, or the end of the text

1636

* being validated otherwise).

1637

*

1638

* Note that g_utf8_validate() returns %FALSE if @max_len is

1639

* positive and any of the @max_len bytes are nul.

1640

*

1641

* Returns %TRUE if all of @str was valid. Many GLib and GTK+

1642

* routines require valid UTF-8 as input; so data read from a file

1643

* or the network should be checked with g_utf8_validate() before

1644

* doing anything else with it.

1645

*

1646

* Returns: %TRUE if the text was valid UTF-8

1647

*/

1648

gboolean

1649

g_utf8_validate (const char *str,

gssize max_len,

const gchar **end)

{

const gchar *p;

if (max_len < 0)

p = fast_validate (str);

1658

else

1659

p = fast_validate_len (str, max_len);

if (end)

*end = p;

if ((max_len >= 0 && p != str + max_len) ||

1665

(max_len < 0 && *p != '\0'))

return FALSE;

else

return TRUE;

}

/**

* g_unichar_validate:

1673

* @ch: a Unicode character

1674

*

1675

* Checks whether @ch is a valid Unicode character. Some possible

1676

* integer values of @ch will not be valid. 0 is considered a valid

1677

* character, though it's normally a string terminator.

1678

*

1679

* Returns: %TRUE if @ch is a valid Unicode character

1680

**/

1681

gboolean

1682

g_unichar_validate (gunichar ch)

1683

{

1684

return UNICODE_VALID (ch);

}

/**
 * g_utf8_strreverse:
 * @str: a UTF-8 encoded string
 * @len: the maximum length of @str to use, in bytes. If @len < 0,
 *     then the string is nul-terminated.
 *
 * Reverses a UTF-8 string. @str must be valid UTF-8 encoded text.
 * (Use g_utf8_validate() on all text before trying to use UTF-8
 * utility functions with it.)
 *
 * This function is intended for programmatic uses of reversed strings.
 * It pays no attention to decomposed characters, combining marks, byte
 * order marks, directional indicators (LRM, LRO, etc) and similar
 * characters which might need special handling when reversing a string
 * for display purposes.
 *
 * Note that unlike g_strreverse(), this function returns
 * newly-allocated memory, which should be freed with g_free() when
 * no longer needed.
 *
 * If the contract is violated and the final character does not fit in
 * the @len bytes (because @len cuts a multi-byte character in two, or
 * the input is not valid UTF-8), only the bytes that fit are copied.
 * Nothing is written outside the result buffer and nothing is read
 * beyond @len bytes of @str.
 *
 * Returns: a newly-allocated string which is the reverse of @str
 *
 * Since: 2.2
 */
gchar *
g_utf8_strreverse (const gchar *str,
                   gssize       len)
{
  gchar *r, *result;
  const gchar *p;

  if (len < 0)
    len = strlen (str);

  result = g_new (gchar, len + 1);

  /* Characters are read front to back from @str and written back to
   * front into the result; r marks the start of the last one written. */
  r = result + len;
  p = str;
  while (r > result)
    {
      gchar *m;
      gssize skip = g_utf8_skip[*(const guchar *) p];

      /* Never step below the start of the buffer: a lead byte may
       * announce more bytes than are left when the input is truncated
       * or malformed. */
      if (skip > r - result)
        skip = r - result;

      r -= skip;
      for (m = r; skip; skip--)
        *m++ = *p++;
    }
  result[len] = 0;

  return result;
}

gchar *

_g_utf8_make_valid (const gchar *name)

1739

{

1740

GString *string;

1741

const gchar *remainder, *invalid;

1742

gint remaining_bytes, valid_bytes;

1743

1744

g_return_val_if_fail (name != NULL, NULL);

string = NULL;

remainder = name;

remaining_bytes = strlen (name);

1749

1750

while (remaining_bytes != 0)

1751

{

1752

if (g_utf8_validate (remainder, remaining_bytes, &invalid))

1753

break;

1754

valid_bytes = invalid - remainder;

1755

1756

if (string == NULL)

1757

string = g_string_sized_new (remaining_bytes);

1758

1759

g_string_append_len (string, remainder, valid_bytes);

1760

/* append U+FFFD REPLACEMENT CHARACTER */

1761

g_string_append (string, "\357\277\275");

1762

1763

remaining_bytes -= valid_bytes + 1;

1764

remainder = invalid + 1;

1765

}

1766

1767

if (string == NULL)

1768

return g_strdup (name);

1769

1770

g_string_append (string, remainder);

1771

1772

g_assert (g_utf8_validate (string->str, -1, NULL));

1773

1774

return g_string_free (string, FALSE);

1775

}

nexmon – Blame information for rev 1