WebSVN – nexmon – Blame – Rev 1 – /utilities/glib/glib/gconvert.c

1

office

1

/* GLIB - Library of useful routines for C programming

2

*

3

* gconvert.c: Convert between character sets using iconv

4

* Copyright Red Hat Inc., 2000

5

* Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com>

6

*

7

* This library is free software; you can redistribute it and/or

8

* modify it under the terms of the GNU Lesser General Public

9

* License as published by the Free Software Foundation; either

10

* version 2 of the License, or (at your option) any later version.

11

*

12

* This library is distributed in the hope that it will be useful,

13

* but WITHOUT ANY WARRANTY; without even the implied warranty of

14

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

15

* Lesser General Public License for more details.

16

*

17

* You should have received a copy of the GNU Lesser General Public

18

* License along with this library; if not, see <http://www.gnu.org/licenses/>.

19

*/

20

21

#include "config.h"

22

#include "glibconfig.h"

#ifndef G_OS_WIN32

#include <iconv.h>

#endif

#include <errno.h>

#include <stdio.h>

#include <string.h>

30

#include <stdlib.h>

31

32

#ifdef G_OS_WIN32

33

#include "win_iconv.c"

34

#endif

35

36

#ifdef G_PLATFORM_WIN32

37

#define STRICT

38

#include <windows.h>

#undef STRICT

#endif

#include "gconvert.h"

43

44

#include "gcharsetprivate.h"

45

#include "gslist.h"

46

#include "gstrfuncs.h"

47

#include "gtestutils.h"

48

#include "gthread.h"

49

#include "gunicode.h"

50

#include "gfileutils.h"

51

52

#include "glibintl.h"

53

54

#if defined(USE_LIBICONV_GNU) && !defined (_LIBICONV_H)

55

#error GNU libiconv in use but included iconv.h not from libiconv

56

#endif

57

#if !defined(USE_LIBICONV_GNU) && defined (_LIBICONV_H) \

58

&& !defined (__APPLE_CC__) && !defined (__LP_64__)

59

#error GNU libiconv not in use but included iconv.h is from libiconv

#endif

/**

* SECTION:conversions

65

* @title: Character Set Conversion

66

* @short_description: convert strings between different character sets

67

*

68

* The g_convert() family of functions wraps the functionality of iconv().

69

* In addition to pure character set conversions, GLib has functions to

70

* deal with the extra complications of encodings for file names.

71

*

72

* ## File Name Encodings

73

*

74

* Historically, UNIX has not had a defined encoding for file names:

75

* a file name is valid as long as it does not have path separators

76

* in it ("/"). However, displaying file names may require conversion:

77

* from the character set in which they were created, to the character

78

* set in which the application operates. Consider the Spanish file name

79

* "Presentación.sxi". If the application which created it uses

80

* ISO-8859-1 for its encoding,

81

* |[

82

* Character: P r e s e n t a c i ó n . s x i

83

* Hex code: 50 72 65 73 65 6e 74 61 63 69 f3 6e 2e 73 78 69

84

* ]|

85

* However, if the application uses UTF-8, the actual file name on

86

* disk would look like this:

87

* |[

88

* Character: P r e s e n t a c i ó n . s x i

89

* Hex code: 50 72 65 73 65 6e 74 61 63 69 c3 b3 6e 2e 73 78 69

90

* ]|

91

* Glib uses UTF-8 for its strings, and GUI toolkits like GTK+ that use

92

* Glib do the same thing. If you get a file name from the file system,

93

* for example, from readdir() or from g_dir_read_name(), and you wish

94

* to display the file name to the user, you will need to convert it

95

* into UTF-8. The opposite case is when the user types the name of a

96

* file he wishes to save: the toolkit will give you that string in

97

* UTF-8 encoding, and you will need to convert it to the character

98

* set used for file names before you can create the file with open()

99

* or fopen().

100

*

101

* By default, Glib assumes that file names on disk are in UTF-8

102

* encoding. This is a valid assumption for file systems which

103

* were created relatively recently: most applications use UTF-8

104

* encoding for their strings, and that is also what they use for

105

* the file names they create. However, older file systems may

106

* still contain file names created in "older" encodings, such as

107

* ISO-8859-1. In this case, for compatibility reasons, you may want

108

* to instruct Glib to use that particular encoding for file names

109

* rather than UTF-8. You can do this by specifying the encoding for

110

* file names in the [`G_FILENAME_ENCODING`][G_FILENAME_ENCODING]

111

* environment variable. For example, if your installation uses

112

* ISO-8859-1 for file names, you can put this in your `~/.profile`

113

* |[

114

* export G_FILENAME_ENCODING=ISO-8859-1

115

* ]|

116

* Glib provides the functions g_filename_to_utf8() and

117

* g_filename_from_utf8() to perform the necessary conversions.

118

* These functions convert file names from the encoding specified

119

* in `G_FILENAME_ENCODING` to UTF-8 and vice-versa. This

120

* [diagram][file-name-encodings-diagram] illustrates how

121

* these functions are used to convert between UTF-8 and the

122

* encoding for file names in the file system.

123

*

124

* ## Conversion between file name encodings # {#file-name-encodings-diagram}

125

*

126

* ![](file-name-encodings.png)

127

*

128

* ## Checklist for Application Writers

129

*

130

* This section is a practical summary of the detailed

131

132

* things to do to make sure your applications process file

133

* name encodings correctly.

134

*

135

* 1. If you get a file name from the file system from a function

136

* such as readdir() or gtk_file_chooser_get_filename(), you do

137

* not need to do any conversion to pass that file name to

138

* functions like open(), rename(), or fopen() -- those are "raw"

139

* file names which the file system understands.

140

*

141

* 2. If you need to display a file name, convert it to UTF-8 first

142

* by using g_filename_to_utf8(). If conversion fails, display a

143

* string like "Unknown file name". Do not convert this string back

144

* into the encoding used for file names if you wish to pass it to

145

* the file system; use the original file name instead.

146

*

147

* For example, the document window of a word processor could display

148

* "Unknown file name" in its title bar but still let the user save

149

* the file, as it would keep the raw file name internally. This

150

* can happen if the user has not set the `G_FILENAME_ENCODING`

151

* environment variable even though he has files whose names are

152

* not encoded in UTF-8.

153

*

154

* 3. If your user interface lets the user type a file name for saving

155

* or renaming, convert it to the encoding used for file names in

156

* the file system by using g_filename_from_utf8(). Pass the converted

157

* file name to functions like fopen(). If conversion fails, ask the

158

* user to enter a different file name. This can happen if the user

159

* types Japanese characters when `G_FILENAME_ENCODING` is set to

160

* `ISO-8859-1`, for example.

161

*/

162

163

/* We try to terminate strings in unknown charsets with this many zero bytes

164

* to ensure that multibyte strings really are nul-terminated when we return

165

* them from g_convert() and friends.

166

*/

167

#define NUL_TERMINATOR_LENGTH 4

168

169

/* NOTE(review): presumably this generates the quark accessor that backs the
 * G_CONVERT_ERROR domain passed to g_set_error() throughout this file --
 * confirm against gconvert.h.
 */
G_DEFINE_QUARK (g_convert_error, g_convert_error)

170

171

static gboolean

172

try_conversion (const char *to_codeset,

173

const char *from_codeset,

174

iconv_t *cd)

175

{

176

*cd = iconv_open (to_codeset, from_codeset);

177

178

if (*cd == (iconv_t)-1 && errno == EINVAL)

return FALSE;

else

return TRUE;

}

static gboolean

try_to_aliases (const char **to_aliases,

186

const char *from_codeset,

iconv_t *cd)

{

if (to_aliases)

{

const char **p = to_aliases;

192

while (*p)

193

{

194

if (try_conversion (*p, from_codeset, cd))

return TRUE;

p++;

}

}

return FALSE;

}

/**

* g_iconv_open:

* @to_codeset: destination codeset

207

* @from_codeset: source codeset

208

*

209

* Same as the standard UNIX routine iconv_open(), but

210

* may be implemented via libiconv on UNIX flavors that lack

211

* a native implementation.

212

*

213

* GLib provides g_convert() and g_locale_to_utf8() which are likely

214

* more convenient than the raw iconv wrappers.

215

*

216

* Returns: a "conversion descriptor", or (GIConv)-1 if

217

* opening the converter failed.

218

**/

219

GIConv

220

g_iconv_open (const gchar *to_codeset,

221

const gchar *from_codeset)

{

iconv_t cd;

if (!try_conversion (to_codeset, from_codeset, &cd))

226

{

227

const char **to_aliases = _g_charset_get_aliases (to_codeset);

228

const char **from_aliases = _g_charset_get_aliases (from_codeset);

if (from_aliases)

{

const char **p = from_aliases;

233

while (*p)

234

{

235

if (try_conversion (to_codeset, *p, &cd))

236

goto out;

237

238

if (try_to_aliases (to_aliases, *p, &cd))

goto out;

p++;

}

}

if (try_to_aliases (to_aliases, from_codeset, &cd))

goto out;

}

out:

return (cd == (iconv_t)-1) ? (GIConv)-1 : (GIConv)cd;

}

/**

* g_iconv:

* @converter: conversion descriptor from g_iconv_open()

256

* @inbuf: bytes to convert

257

* @inbytes_left: inout parameter, bytes remaining to convert in @inbuf

258

* @outbuf: converted output bytes

259

* @outbytes_left: inout parameter, bytes available to fill in @outbuf

260

*

261

* Same as the standard UNIX routine iconv(), but

262

* may be implemented via libiconv on UNIX flavors that lack

263

* a native implementation.

264

*

265

* GLib provides g_convert() and g_locale_to_utf8() which are likely

266

* more convenient than the raw iconv wrappers.

267

*

268

* Returns: count of non-reversible conversions, or -1 on error

269

**/

270

gsize

271

g_iconv (GIConv converter,

272

gchar **inbuf,

273

gsize *inbytes_left,

274

gchar **outbuf,

275

gsize *outbytes_left)

276

{

277

iconv_t cd = (iconv_t)converter;

278

279

return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left);

}

/**

* g_iconv_close:

* @converter: a conversion descriptor from g_iconv_open()

285

*

286

* Same as the standard UNIX routine iconv_close(), but

287

* may be implemented via libiconv on UNIX flavors that lack

288

* a native implementation. Should be called to clean up

289

* the conversion descriptor from g_iconv_open() when

290

* you are done converting things.

291

*

292

* GLib provides g_convert() and g_locale_to_utf8() which are likely

293

* more convenient than the raw iconv wrappers.

294

*

295

* Returns: -1 on error, 0 on success

296

**/

297

gint

298

g_iconv_close (GIConv converter)

299

{

300

iconv_t cd = (iconv_t)converter;

301

302

return iconv_close (cd);

}

/* Opens a converter with g_iconv_open() and, on failure, fills in @error.
 *
 * errno == EINVAL is reported as G_CONVERT_ERROR_NO_CONVERSION; every other
 * failure is reported as G_CONVERT_ERROR_FAILED.
 *
 * Returns the descriptor from g_iconv_open(), i.e. (GIConv) -1 on failure.
 */
static GIConv
open_converter (const gchar *to_codeset,
                const gchar *from_codeset,
                GError     **error)
{
  GIConv cd = g_iconv_open (to_codeset, from_codeset);

  /* Nothing to report on success, or when the caller ignores errors. */
  if (cd != (GIConv) -1 || error == NULL)
    return cd;

  if (errno == EINVAL)
    {
      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION,
                   _("Conversion from character set '%s' to '%s' is not supported"),
                   from_codeset, to_codeset);
    }
  else
    {
      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,
                   _("Could not open converter from '%s' to '%s'"),
                   from_codeset, to_codeset);
    }

  return cd;
}

/* Closes @cd with g_iconv_close(), treating the invalid descriptor
 * (GIConv) -1 as already closed (returns 0 without calling anything).
 */
static int
close_converter (GIConv cd)
{
  return (cd == (GIConv) -1) ? 0 : g_iconv_close (cd);
}

/**

* g_convert_with_iconv:

344

* @str: the string to convert

345

* @len: the length of the string in bytes, or -1 if the string is

346

* nul-terminated (Note that some encodings may allow nul

347

* bytes to occur inside strings. In that case, using -1

348

* for the @len parameter is unsafe)

349

* @converter: conversion descriptor from g_iconv_open()

350

* @bytes_read: location to store the number of bytes in the

351

* input string that were successfully converted, or %NULL.

352

* Even if the conversion was successful, this may be

353

* less than @len if there were partial characters

354

* at the end of the input. If the error

355

* #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value

356

* stored will be the byte offset after the last valid

357

* input sequence.

358

* @bytes_written: the number of bytes stored in the output buffer (not

359

* including the terminating nul).

360

* @error: location to store the error occurring, or %NULL to ignore

361

* errors. Any of the errors in #GConvertError may occur.

362

*

363

* Converts a string from one character set to another.

364

*

365

* Note that you should use g_iconv() for streaming conversions.

366

* Despite the fact that @bytes_read can return information about partial

367

* characters, the g_convert_... functions are not generally suitable

368

* for streaming. If the underlying converter maintains internal state,

369

* then this won't be preserved across successive calls to g_convert(),

370

* g_convert_with_iconv() or g_convert_with_fallback(). (An example of

371

* this is the GNU C converter for CP1255 which does not emit a base

372

* character until it knows that the next character is not a mark that

373

* could combine with the base character.)

374

*

375

* Returns: If the conversion was successful, a newly allocated

376

* nul-terminated string, which must be freed with

377

* g_free(). Otherwise %NULL and @error will be set.

378

**/

379

gchar*

380

g_convert_with_iconv (const gchar *str,

gssize len,

GIConv converter,

gsize *bytes_read,

gsize *bytes_written,

GError **error)

{

gchar *dest;

gchar *outp;

const gchar *p;

gsize inbytes_remaining;

391

gsize outbytes_remaining;

392

gsize err;

393

gsize outbuf_size;

394

gboolean have_error = FALSE;

395

gboolean done = FALSE;

396

gboolean reset = FALSE;

397

398

g_return_val_if_fail (converter != (GIConv) -1, NULL);

399

400

if (len < 0)

401

len = strlen (str);

402

403

p = str;

404

inbytes_remaining = len;

405

outbuf_size = len + NUL_TERMINATOR_LENGTH;

406

407

outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;

408

outp = dest = g_malloc (outbuf_size);

409

410

while (!done && !have_error)

411

{

412

if (reset)

413

err = g_iconv (converter, NULL, &inbytes_remaining, &outp, &outbytes_remaining);

414

else

415

err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining);

416

417

if (err == (gsize) -1)

{

switch (errno)

{

case EINVAL:

/* Incomplete text, do not report an error */

done = TRUE;

break;

case E2BIG:

{

gsize used = outp - dest;

428

429

outbuf_size *= 2;

430

dest = g_realloc (dest, outbuf_size);

431

432

outp = dest + used;

433

outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;

}

break;

case EILSEQ:

g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

438

_("Invalid byte sequence in conversion input"));

have_error = TRUE;

break;

default:

{

int errsv = errno;

g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,

446

_("Error during conversion: %s"),

447

g_strerror (errsv));

}

have_error = TRUE;

break;

}

}

else

{

if (!reset)

{

/* call g_iconv with NULL inbuf to cleanup shift state */

458

reset = TRUE;

459

inbytes_remaining = 0;

}

else

done = TRUE;

}

}

memset (outp, 0, NUL_TERMINATOR_LENGTH);

467

468

if (bytes_read)

469

*bytes_read = p - str;

470

else

471

{

472

if ((p - str) != len)

{

if (!have_error)

{

g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,

477

_("Partial character sequence at end of input"));

have_error = TRUE;

}

}

}

if (bytes_written)

*bytes_written = outp - dest; /* Doesn't include '\0' */

if (have_error)

{

g_free (dest);

return NULL;

}

else

return dest;

}

/**

* g_convert:

* @str: the string to convert

498

* @len: the length of the string in bytes, or -1 if the string is

499

* nul-terminated (Note that some encodings may allow nul

500

* bytes to occur inside strings. In that case, using -1

501

* for the @len parameter is unsafe)

502

* @to_codeset: name of character set into which to convert @str

503

* @from_codeset: character set of @str.

504

* @bytes_read: (out): location to store the number of bytes in the

505

* input string that were successfully converted, or %NULL.

506

* Even if the conversion was successful, this may be

507

* less than @len if there were partial characters

508

* at the end of the input. If the error

509

* #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value

510

* stored will be the byte offset after the last valid

511

* input sequence.

512

* @bytes_written: (out): the number of bytes stored in the output buffer (not

513

* including the terminating nul).

514

* @error: location to store the error occurring, or %NULL to ignore

515

* errors. Any of the errors in #GConvertError may occur.

516

*

517

* Converts a string from one character set to another.

518

*

519

* Note that you should use g_iconv() for streaming conversions.

520

* Despite the fact that @bytes_read can return information about partial

521

* characters, the g_convert_... functions are not generally suitable

522

* for streaming. If the underlying converter maintains internal state,

523

* then this won't be preserved across successive calls to g_convert(),

524

* g_convert_with_iconv() or g_convert_with_fallback(). (An example of

525

* this is the GNU C converter for CP1255 which does not emit a base

526

* character until it knows that the next character is not a mark that

527

* could combine with the base character.)

528

*

529

* Using extensions such as "//TRANSLIT" may not work (or may not work

530

* well) on many platforms. Consider using g_str_to_ascii() instead.

531

*

532

* Returns: If the conversion was successful, a newly allocated

533

* nul-terminated string, which must be freed with

534

* g_free(). Otherwise %NULL and @error will be set.

535

**/

536

gchar*

537

g_convert (const gchar *str,

538

gssize len,

539

const gchar *to_codeset,

540

const gchar *from_codeset,

541

gsize *bytes_read,

542

gsize *bytes_written,

GError **error)

{

gchar *res;

GIConv cd;

g_return_val_if_fail (str != NULL, NULL);

549

g_return_val_if_fail (to_codeset != NULL, NULL);

550

g_return_val_if_fail (from_codeset != NULL, NULL);

551

552

cd = open_converter (to_codeset, from_codeset, error);

553

554

if (cd == (GIConv) -1)

{

if (bytes_read)

*bytes_read = 0;

if (bytes_written)

*bytes_written = 0;

return NULL;

}

res = g_convert_with_iconv (str, len, cd,

566

bytes_read, bytes_written,

567

error);

568

569

close_converter (cd);

return res;

}

/**

* g_convert_with_fallback:

576

* @str: the string to convert

577

* @len: the length of the string in bytes, or -1 if the string is

578

* nul-terminated (Note that some encodings may allow nul

579

* bytes to occur inside strings. In that case, using -1

580

* for the @len parameter is unsafe)

581

* @to_codeset: name of character set into which to convert @str

582

* @from_codeset: character set of @str.

583

* @fallback: UTF-8 string to use in place of character not

584

* present in the target encoding. (The string must be

585

* representable in the target encoding).

586

* If %NULL, characters not in the target encoding will
* be represented as Unicode escapes \uxxxx or \Uxxxxyyyy.

588

* @bytes_read: location to store the number of bytes in the

589

* input string that were successfully converted, or %NULL.

590

* Even if the conversion was successful, this may be

591

* less than @len if there were partial characters

592

* at the end of the input.

593

* @bytes_written: the number of bytes stored in the output buffer (not

594

* including the terminating nul).

595

* @error: location to store the error occurring, or %NULL to ignore

596

* errors. Any of the errors in #GConvertError may occur.

597

*

598

* Converts a string from one character set to another, possibly

599

* including fallback sequences for characters not representable

600

* in the output. Note that it is not guaranteed that the specification

601

* for the fallback sequences in @fallback will be honored. Some

602

* systems may do an approximate conversion from @from_codeset

603

* to @to_codeset in their iconv() functions,

604

* in which case GLib will simply return that approximate conversion.

605

*

606

* Note that you should use g_iconv() for streaming conversions.

607

* Despite the fact that @bytes_read can return information about partial

608

* characters, the g_convert_... functions are not generally suitable

609

* for streaming. If the underlying converter maintains internal state,

610

* then this won't be preserved across successive calls to g_convert(),

611

* g_convert_with_iconv() or g_convert_with_fallback(). (An example of

612

* this is the GNU C converter for CP1255 which does not emit a base

613

* character until it knows that the next character is not a mark that

614

* could combine with the base character.)

615

*

616

* Returns: If the conversion was successful, a newly allocated

617

* nul-terminated string, which must be freed with

618

* g_free(). Otherwise %NULL and @error will be set.

619

**/

620

gchar*

621

g_convert_with_fallback (const gchar *str,

622

gssize len,

623

const gchar *to_codeset,

624

const gchar *from_codeset,

625

const gchar *fallback,

626

gsize *bytes_read,

627

gsize *bytes_written,

GError **error)

{

gchar *utf8;

gchar *dest;

gchar *outp;

const gchar *insert_str = NULL;

634

const gchar *p;

635

gsize inbytes_remaining;

636

const gchar *save_p = NULL;

637

gsize save_inbytes = 0;

638

gsize outbytes_remaining;

gsize err;

GIConv cd;

gsize outbuf_size;

gboolean have_error = FALSE;

643

gboolean done = FALSE;

644

645

GError *local_error = NULL;

646

647

g_return_val_if_fail (str != NULL, NULL);

648

g_return_val_if_fail (to_codeset != NULL, NULL);

649

g_return_val_if_fail (from_codeset != NULL, NULL);

650

651

if (len < 0)

652

len = strlen (str);

653

654

/* Try an exact conversion; we only proceed if this fails

655

* due to an illegal sequence in the input string.

656

*/

657

dest = g_convert (str, len, to_codeset, from_codeset,

658

bytes_read, bytes_written, &local_error);

if (!local_error)

return dest;

if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE))

663

{

664

g_propagate_error (error, local_error);

return NULL;

}

else

g_error_free (local_error);

669

670

local_error = NULL;

671

672

/* No go; to proceed, we need a converter from "UTF-8" to

673

* to_codeset, and the string as UTF-8.

674

*/

675

cd = open_converter (to_codeset, "UTF-8", error);

676

if (cd == (GIConv) -1)

{

if (bytes_read)

*bytes_read = 0;

if (bytes_written)

*bytes_written = 0;

return NULL;

}

utf8 = g_convert (str, len, "UTF-8", from_codeset,

688

bytes_read, &inbytes_remaining, error);

689

if (!utf8)

690

{

691

close_converter (cd);

692

if (bytes_written)

693

*bytes_written = 0;

return NULL;

}

/* Now the heart of the code. We loop through the UTF-8 string, and

698

* whenever we hit an offending character, we form fallback, convert

699

* the fallback to the target codeset, and then go back to

700

* converting the original string after finishing with the fallback.

701

*

702

* The variables save_p and save_inbytes store the input state

703

* for the original string while we are converting the fallback

*/

p = utf8;

outbuf_size = len + NUL_TERMINATOR_LENGTH;

708

outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;

709

outp = dest = g_malloc (outbuf_size);

710

711

while (!done && !have_error)

712

{

713

gsize inbytes_tmp = inbytes_remaining;

714

err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining);

715

inbytes_remaining = inbytes_tmp;

716

717

if (err == (gsize) -1)

{

switch (errno)

{

case EINVAL:

g_assert_not_reached();

break;

case E2BIG:

{

gsize used = outp - dest;

727

728

outbuf_size *= 2;

729

dest = g_realloc (dest, outbuf_size);

730

731

outp = dest + used;

732

outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;

break;

}

case EILSEQ:

if (save_p)

{

/* Error converting fallback string - fatal

740

*/

741

g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

742

_("Cannot convert fallback '%s' to codeset '%s'"),

743

insert_str, to_codeset);

have_error = TRUE;

break;

}

else if (p)

{

if (!fallback)

{

gunichar ch = g_utf8_get_char (p);

752

insert_str = g_strdup_printf (ch < 0x10000 ? "\\u%04x" : "\\U%08x",

ch);

}

else

insert_str = fallback;

757

758

save_p = g_utf8_next_char (p);

759

save_inbytes = inbytes_remaining - (save_p - p);

760

p = insert_str;

761

inbytes_remaining = strlen (p);

762

break;

763

}

764

/* fall thru if p is NULL */

default:

{

int errsv = errno;

g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED,

770

_("Error during conversion: %s"),

771

g_strerror (errsv));

}

have_error = TRUE;

break;

}

}

else

{

if (save_p)

{

if (!fallback)

g_free ((gchar *)insert_str);

784

p = save_p;

785

inbytes_remaining = save_inbytes;

save_p = NULL;

}

else if (p)

{

/* call g_iconv with NULL inbuf to cleanup shift state */

791

p = NULL;

792

inbytes_remaining = 0;

}

else

done = TRUE;

}

}

/* Cleanup

*/

memset (outp, 0, NUL_TERMINATOR_LENGTH);

802

803

close_converter (cd);

804

805

if (bytes_written)

806

*bytes_written = outp - dest; /* Doesn't include '\0' */

g_free (utf8);

if (have_error)

{

if (save_p && !fallback)

813

g_free ((gchar *)insert_str);

g_free (dest);

return NULL;

}

else

return dest;

}

/*

* g_locale_to_utf8

*

*

*/

static gchar *

strdup_len (const gchar *string,

829

gssize len,

830

gsize *bytes_written,

gsize *bytes_read,

GError **error)

{

gsize real_len;

if (!g_utf8_validate (string, len, NULL))

{

if (bytes_read)

*bytes_read = 0;

if (bytes_written)

*bytes_written = 0;

843

844

g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

845

_("Invalid byte sequence in conversion input"));

return NULL;

}

if (len < 0)

real_len = strlen (string);

else

{

real_len = 0;

while (real_len < len && string[real_len])

real_len++;

}

if (bytes_read)

*bytes_read = real_len;

861

if (bytes_written)

862

*bytes_written = real_len;

863

864

return g_strndup (string, real_len);

}

/**

* g_locale_to_utf8:

869

* @opsysstring: a string in the encoding of the current locale. On Windows

870

* this means the system codepage.

871

* @len: the length of the string, or -1 if the string is

872

* nul-terminated (Note that some encodings may allow nul

873

* bytes to occur inside strings. In that case, using -1

874

* for the @len parameter is unsafe)

875

* @bytes_read: (out) (optional): location to store the number of bytes in the

876

* input string that were successfully converted, or %NULL.

877

* Even if the conversion was successful, this may be

878

* less than @len if there were partial characters

879

* at the end of the input. If the error

880

* #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value

881

* stored will be the byte offset after the last valid

882

* input sequence.

883

* @bytes_written: (out) (optional): the number of bytes stored in the output

884

* buffer (not including the terminating nul).

885

* @error: location to store the error occurring, or %NULL to ignore

886

* errors. Any of the errors in #GConvertError may occur.

887

*

888

* Converts a string which is in the encoding used for strings by

889

* the C runtime (usually the same as that used by the operating

890

* system) in the [current locale][setlocale] into a UTF-8 string.

891

*

892

* Returns: A newly-allocated buffer containing the converted string,

893

* or %NULL on an error, and error will be set.

894

**/

895

gchar *

896

g_locale_to_utf8 (const gchar *opsysstring,

897

gssize len,

898

gsize *bytes_read,

899

gsize *bytes_written,

900

GError **error)

901

{

902

const char *charset;

903

904

if (g_get_charset (&charset))

905

return strdup_len (opsysstring, len, bytes_read, bytes_written, error);

906

else

907

return g_convert (opsysstring, len,

908

"UTF-8", charset, bytes_read, bytes_written, error);

}

/**

* g_locale_from_utf8:

913

* @utf8string: a UTF-8 encoded string

914

* @len: the length of the string, or -1 if the string is

915

* nul-terminated (Note that some encodings may allow nul

916

* bytes to occur inside strings. In that case, using -1

917

* for the @len parameter is unsafe)

918

* @bytes_read: (out) (optional): location to store the number of bytes in the

919

* input string that were successfully converted, or %NULL.

920

* Even if the conversion was successful, this may be

921

* less than @len if there were partial characters

922

* at the end of the input. If the error

923

* #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value

924

* stored will be the byte offset after the last valid

925

* input sequence.

926

* @bytes_written: (out) (optional): the number of bytes stored in the output

927

* buffer (not including the terminating nul).

928

* @error: location to store the error occurring, or %NULL to ignore

929

* errors. Any of the errors in #GConvertError may occur.

930

*

931

* Converts a string from UTF-8 to the encoding used for strings by

932

* the C runtime (usually the same as that used by the operating

933

* system) in the [current locale][setlocale]. On Windows this means

934

* the system codepage.

935

*

936

* Returns: A newly-allocated buffer containing the converted string,

937

* or %NULL on an error, and error will be set.

938

**/

939

gchar *

940

g_locale_from_utf8 (const gchar *utf8string,

941

gssize len,

942

gsize *bytes_read,

943

gsize *bytes_written,

944

GError **error)

945

{

946

const gchar *charset;

947

948

if (g_get_charset (&charset))

949

return strdup_len (utf8string, len, bytes_read, bytes_written, error);

950

else

951

return g_convert (utf8string, len,

952

charset, "UTF-8", bytes_read, bytes_written, error);

953

}

954

955

#ifndef G_PLATFORM_WIN32

956

957

typedef struct _GFilenameCharsetCache GFilenameCharsetCache;

958

959

struct _GFilenameCharsetCache {

960

gboolean is_utf8;

961

gchar *charset;

962

gchar **filename_charsets;

};

/* Releases a GFilenameCharsetCache together with the strings it owns.
 * Registered as the destroy callback of the GPrivate used by
 * g_get_filename_charsets().
 */
static void
filename_charset_cache_free (gpointer data)
{
  GFilenameCharsetCache *cache = data;

  g_strfreev (cache->filename_charsets);
  g_free (cache->charset);
  g_free (cache);
}

/**

* g_get_filename_charsets:

976

* @filename_charsets: return location for the %NULL-terminated list of encoding names

977

*

978

* Determines the preferred character sets used for filenames.

979

* The first character set from the @filename_charsets is the filename encoding, the

980

* subsequent character sets are used when trying to generate a displayable

981

* representation of a filename, see g_filename_display_name().

982

*

983

* On Unix, the character sets are determined by consulting the

984

* environment variables `G_FILENAME_ENCODING` and `G_BROKEN_FILENAMES`.

985

* On Windows, the character set used in the GLib API is always UTF-8

986

* and said environment variables have no effect.

987

*

988

* `G_FILENAME_ENCODING` may be set to a comma-separated list of

989

* character set names. The special token "\@locale" is taken

990

* to mean the character set for the [current locale][setlocale].

991

* If `G_FILENAME_ENCODING` is not set, but `G_BROKEN_FILENAMES` is,

992

* the character set of the current locale is taken as the filename

993

* encoding. If neither environment variable is set, UTF-8 is taken

994

* as the filename encoding, but the character set of the current locale

995

* is also put in the list of encodings.

996

*

997

* The returned @filename_charsets belong to GLib and must not be freed.

998

*

999

* Note that on Unix, regardless of the locale character set or

1000

* `G_FILENAME_ENCODING` value, the actual file names present

1001

* on a system might be in any random encoding or just gibberish.

1002

*

1003

* Returns: %TRUE if the filename encoding is UTF-8.

*

* Since: 2.6

*/

gboolean

g_get_filename_charsets (const gchar ***filename_charsets)

1009

{

1010

static GPrivate cache_private = G_PRIVATE_INIT (filename_charset_cache_free);

1011

GFilenameCharsetCache *cache = g_private_get (&cache_private);

1012

const gchar *charset;

if (!cache)

{

cache = g_new0 (GFilenameCharsetCache, 1);

1017

g_private_set (&cache_private, cache);

1018

}

1019

1020

g_get_charset (&charset);

1021

1022

if (!(cache->charset && strcmp (cache->charset, charset) == 0))

1023

{

1024

const gchar *new_charset;

gchar *p;

gint i;

g_free (cache->charset);

1029

g_strfreev (cache->filename_charsets);

1030

cache->charset = g_strdup (charset);

1031

1032

p = getenv ("G_FILENAME_ENCODING");

1033

if (p != NULL && p[0] != '\0')

1034

{

1035

cache->filename_charsets = g_strsplit (p, ",", 0);

1036

cache->is_utf8 = (strcmp (cache->filename_charsets[0], "UTF-8") == 0);

1037

1038

for (i = 0; cache->filename_charsets[i]; i++)

1039

{

1040

if (strcmp ("@locale", cache->filename_charsets[i]) == 0)

1041

{

1042

g_get_charset (&new_charset);

1043

g_free (cache->filename_charsets[i]);

1044

cache->filename_charsets[i] = g_strdup (new_charset);

}

}

}

else if (getenv ("G_BROKEN_FILENAMES") != NULL)

1049

{

1050

cache->filename_charsets = g_new0 (gchar *, 2);

1051

cache->is_utf8 = g_get_charset (&new_charset);

1052

cache->filename_charsets[0] = g_strdup (new_charset);

}

else

{

cache->filename_charsets = g_new0 (gchar *, 3);

1057

cache->is_utf8 = TRUE;

1058

cache->filename_charsets[0] = g_strdup ("UTF-8");

1059

if (!g_get_charset (&new_charset))

1060

cache->filename_charsets[1] = g_strdup (new_charset);

}

}

if (filename_charsets)

1065

*filename_charsets = (const gchar **)cache->filename_charsets;

1066

1067

return cache->is_utf8;

1068

}

1069

1070

#else /* G_PLATFORM_WIN32 */

1071

1072

gboolean

1073

g_get_filename_charsets (const gchar ***filename_charsets)

1074

{

1075

static const gchar *charsets[] = {

"UTF-8",

NULL

};

#ifdef G_OS_WIN32

/* On Windows GLib pretends that the filename charset is UTF-8 */

1082

if (filename_charsets)

1083

*filename_charsets = charsets;

return TRUE;

#else

gboolean result;

/* Cygwin works like before */

1090

result = g_get_charset (&(charsets[0]));

1091

1092

if (filename_charsets)

1093

*filename_charsets = charsets;

return result;

#endif

}

#endif /* G_PLATFORM_WIN32 */

1100

1101

static gboolean

1102

get_filename_charset (const gchar **filename_charset)

1103

{

1104

const gchar **charsets;

1105

gboolean is_utf8;

1106

1107

is_utf8 = g_get_filename_charsets (&charsets);

1108

1109

if (filename_charset)

1110

*filename_charset = charsets[0];

return is_utf8;

}

/**

* g_filename_to_utf8:

1117

* @opsysstring: a string in the encoding for filenames

1118

* @len: the length of the string, or -1 if the string is

1119

* nul-terminated (Note that some encodings may allow nul

1120

* bytes to occur inside strings. In that case, using -1

1121

* for the @len parameter is unsafe)

1122

* @bytes_read: (out) (optional): location to store the number of bytes in the

1123

* input string that were successfully converted, or %NULL.

1124

* Even if the conversion was successful, this may be

1125

* less than @len if there were partial characters

1126

* at the end of the input. If the error

1127

* #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value

1128

* stored will be the byte offset after the last valid

1129

* input sequence.

1130

* @bytes_written: (out) (optional): the number of bytes stored in the output

1131

* buffer (not including the terminating nul).

1132

* @error: location to store the error occurring, or %NULL to ignore

1133

* errors. Any of the errors in #GConvertError may occur.

1134

*

1135

* Converts a string which is in the encoding used by GLib for

1136

* filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8

1137

* for filenames; on other platforms, this function indirectly depends on

1138

* the [current locale][setlocale].

1139

*

1140

* Returns: The converted string, or %NULL on an error.

1141

**/

1142

gchar*

1143

g_filename_to_utf8 (const gchar *opsysstring,

1144

gssize len,

1145

gsize *bytes_read,

1146

gsize *bytes_written,

1147

GError **error)

1148

{

1149

const gchar *charset;

1150

1151

g_return_val_if_fail (opsysstring != NULL, NULL);

1152

1153

if (get_filename_charset (&charset))

1154

return strdup_len (opsysstring, len, bytes_read, bytes_written, error);

1155

else

1156

return g_convert (opsysstring, len,

1157

"UTF-8", charset, bytes_read, bytes_written, error);

1158

}

1159

1160

#if defined (G_OS_WIN32) && !defined (_WIN64)

1161

1162

#undef g_filename_to_utf8

1163

1164

/* Binary compatibility version. Not for newly compiled code. Also not needed for

1165

* 64-bit versions as there should be no old deployed binaries that would use

1166

* the old versions.

*/

gchar*

g_filename_to_utf8 (const gchar *opsysstring,

1171

gssize len,

1172

gsize *bytes_read,

1173

gsize *bytes_written,

1174

GError **error)

1175

{

1176

const gchar *charset;

1177

1178

g_return_val_if_fail (opsysstring != NULL, NULL);

1179

1180

if (g_get_charset (&charset))

1181

return strdup_len (opsysstring, len, bytes_read, bytes_written, error);

1182

else

1183

return g_convert (opsysstring, len,

1184

"UTF-8", charset, bytes_read, bytes_written, error);

}

#endif

/**

* g_filename_from_utf8:

1191

* @utf8string: a UTF-8 encoded string.

1192

* @len: the length of the string, or -1 if the string is

1193

* nul-terminated.

1194

* @bytes_read: (out) (optional): location to store the number of bytes in

1195

* the input string that were successfully converted, or %NULL.

1196

* Even if the conversion was successful, this may be

1197

* less than @len if there were partial characters

1198

* at the end of the input. If the error

1199

* #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value

1200

* stored will be the byte offset after the last valid

1201

* input sequence.

1202

* @bytes_written: (out): the number of bytes stored in the output buffer (not

1203

* including the terminating nul).

1204

* @error: location to store the error occurring, or %NULL to ignore

1205

* errors. Any of the errors in #GConvertError may occur.

1206

*

1207

* Converts a string from UTF-8 to the encoding GLib uses for

1208

* filenames. Note that on Windows GLib uses UTF-8 for filenames;

1209

* on other platforms, this function indirectly depends on the

1210

* [current locale][setlocale].

1211

*

1212

* Returns: (array length=bytes_written) (element-type guint8) (transfer full):

1213

* The converted string, or %NULL on an error.

1214

**/

1215

gchar*

1216

g_filename_from_utf8 (const gchar *utf8string,

1217

gssize len,

1218

gsize *bytes_read,

1219

gsize *bytes_written,

1220

GError **error)

1221

{

1222

const gchar *charset;

1223

1224

if (get_filename_charset (&charset))

1225

return strdup_len (utf8string, len, bytes_read, bytes_written, error);

1226

else

1227

return g_convert (utf8string, len,

1228

charset, "UTF-8", bytes_read, bytes_written, error);

1229

}

1230

1231

#if defined (G_OS_WIN32) && !defined (_WIN64)

1232

1233

#undef g_filename_from_utf8

1234

1235

/* Binary compatibility version. Not for newly compiled code. */

1236

1237

gchar*

1238

g_filename_from_utf8 (const gchar *utf8string,

1239

gssize len,

1240

gsize *bytes_read,

1241

gsize *bytes_written,

1242

GError **error)

1243

{

1244

const gchar *charset;

1245

1246

if (g_get_charset (&charset))

1247

return strdup_len (utf8string, len, bytes_read, bytes_written, error);

1248

else

1249

return g_convert (utf8string, len,

1250

charset, "UTF-8", bytes_read, bytes_written, error);

}

#endif

/* Test of haystack has the needle prefix, comparing case

1256

* insensitive. haystack may be UTF-8, but needle must

1257

* contain only ascii. */

1258

static gboolean

1259

has_case_prefix (const gchar *haystack, const gchar *needle)

1260

{

1261

const gchar *h, *n;

1262

1263

/* Eat one character at a time. */

h = haystack;

n = needle;

while (*n && *h &&

g_ascii_tolower (*n) == g_ascii_tolower (*h))

{

n++;

h++;

}

return *n == '\0';

}

/* Escaping modes accepted by g_escape_uri_string(). Each value is a
 * single bit that the ACCEPTABLE() macro tests against the entries of
 * the acceptable[] table.
 */
typedef enum {
  UNSAFE_ALL        = 1 << 0,  /* Escape all unsafe characters */
  UNSAFE_ALLOW_PLUS = 1 << 1,  /* Allows '+' */
  UNSAFE_PATH       = 1 << 3,  /* Allows '/', '&', '=', ':', '@', '+', '$' and ',' */
  UNSAFE_HOST       = 1 << 4,  /* Allows '/' and ':' and '@' */
  UNSAFE_SLASHES    = 1 << 5   /* Allows all characters except for '/' and '%' */
} UnsafeCharacterSet;

1284

1285

static const guchar acceptable[96] = {

1286

/* A table of the ASCII chars from space (32) to DEL (127) */

1287

/* ! " # $ % & ' ( ) * + , - . / */

1288

0x00,0x3F,0x20,0x20,0x28,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x2A,0x28,0x3F,0x3F,0x1C,

1289

/* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */

1290

0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x20,

1291

/* @ A B C D E F G H I J K L M N O */

1292

0x38,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,

1293

/* P Q R S T U V W X Y Z [ \ ] ^ _ */

1294

0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F,

1295

/* ` a b c d e f g h i j k l m n o */

1296

0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,

1297

/* p q r s t u v w x y z { | } ~ DEL */

1298

0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20

1299

};

1300

1301

static const gchar hex[16] = "0123456789ABCDEF";

1302

1303

/* Note: This escape function works on file: URIs, but if you want to

1304

* escape something else, please read RFC-2396 */

1305

static gchar *

1306

g_escape_uri_string (const gchar *string,

1307

UnsafeCharacterSet mask)

1308

{

1309

#define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask))

const gchar *p;

gchar *q;

gchar *result;

int c;

gint unacceptable;

UnsafeCharacterSet use_mask;

1317

1318

g_return_val_if_fail (mask == UNSAFE_ALL

1319

|| mask == UNSAFE_ALLOW_PLUS

1320

|| mask == UNSAFE_PATH

1321

|| mask == UNSAFE_HOST

1322

|| mask == UNSAFE_SLASHES, NULL);

unacceptable = 0;

use_mask = mask;

for (p = string; *p != '\0'; p++)

1327

{

1328

c = (guchar) *p;

1329

if (!ACCEPTABLE (c))

unacceptable++;

}

result = g_malloc (p - string + unacceptable * 2 + 1);

1334

1335

use_mask = mask;

1336

for (q = result, p = string; *p != '\0'; p++)

{

c = (guchar) *p;

if (!ACCEPTABLE (c))

1341

{

1342

*q++ = '%'; /* means hex coming */

1343

*q++ = hex[c >> 4];

1344

*q++ = hex[c & 15];

}

else

*q++ = *p;

}

*q = '\0';

return result;

}

static gchar *

g_escape_file_uri (const gchar *hostname,

1358

const gchar *pathname)

1359

{

1360

char *escaped_hostname = NULL;

1361

char *escaped_path;

char *res;

#ifdef G_OS_WIN32

char *p, *backslash;

1366

1367

/* Turn backslashes into forward slashes. That's what Netscape

1368

* does, and they are actually more or less equivalent in Windows.

1369

*/

1370

1371

pathname = g_strdup (pathname);

1372

p = (char *) pathname;

1373

1374

while ((backslash = strchr (p, '\\')) != NULL)

{

*backslash = '/';

p = backslash + 1;

}

#endif

if (hostname && *hostname != '\0')

1382

{

1383

escaped_hostname = g_escape_uri_string (hostname, UNSAFE_HOST);

1384

}

1385

1386

escaped_path = g_escape_uri_string (pathname, UNSAFE_PATH);

1387

1388

res = g_strconcat ("file://",

1389

(escaped_hostname) ? escaped_hostname : "",

1390

(*escaped_path != '/') ? "/" : "",

escaped_path,

NULL);

#ifdef G_OS_WIN32

g_free ((char *) pathname);

1396

#endif

1397

1398

g_free (escaped_hostname);

1399

g_free (escaped_path);

return res;

}

/* Decodes the two hexadecimal digits at scanner[0] and scanner[1].
 * Returns the resulting byte value (0..255), or -1 if either character
 * is not a hex digit according to g_ascii_xdigit_value(). scanner[1]
 * is only examined when scanner[0] is a valid digit.
 */
static int
unescape_character (const char *scanner)
{
  int hi;
  int lo;

  hi = g_ascii_xdigit_value (scanner[0]);
  if (hi >= 0)
    {
      lo = g_ascii_xdigit_value (scanner[1]);
      if (lo >= 0)
        return (hi << 4) | lo;
    }

  return -1;
}

static gchar *

g_unescape_uri_string (const char *escaped,

1423

int len,

1424

const char *illegal_escaped_characters,

1425

gboolean ascii_must_not_be_escaped)

1426

{

1427

const gchar *in, *in_end;

1428

gchar *out, *result;

1429

int c;

1430

1431

if (escaped == NULL)

return NULL;

if (len < 0)

len = strlen (escaped);

1436

1437

result = g_malloc (len + 1);

1438

1439

out = result;

1440

for (in = escaped, in_end = escaped + len; in < in_end; in++)

{

c = *in;

if (c == '%')

{

/* catch partial escape sequences past the end of the substring */

1447

if (in + 3 > in_end)

1448

break;

1449

1450

c = unescape_character (in + 1);

1451

1452

/* catch bad escape sequences and NUL characters */

if (c <= 0)

break;

/* catch escaped ASCII */

1457

if (ascii_must_not_be_escaped && c <= 0x7F)

1458

break;

1459

1460

/* catch other illegal escaped characters */

1461

if (strchr (illegal_escaped_characters, c) != NULL)

break;

in += 2;

}

*out++ = c;

}

g_assert (out - result <= len);

*out = '\0';

if (in != in_end)

{

g_free (result);

return NULL;

}

return result;

}

/* TRUE when c is in the ASCII range and g_ascii_isalnum() accepts it. */
static gboolean
is_asciialphanum (gunichar c)
{
  if (c > 0x7F)
    return FALSE;

  return g_ascii_isalnum (c) ? TRUE : FALSE;
}

/* TRUE when c is in the ASCII range and g_ascii_isalpha() accepts it. */
static gboolean
is_asciialpha (gunichar c)
{
  if (c > 0x7F)
    return FALSE;

  return g_ascii_isalpha (c) ? TRUE : FALSE;
}

1493

1494

/* allows an empty string */

1495

static gboolean

1496

hostname_validate (const char *hostname)

1497

{

1498

const char *p;

1499

gunichar c, first_char, last_char;

p = hostname;

if (*p == '\0')

return TRUE;

do

{

/* read in a label */

1507

c = g_utf8_get_char (p);

1508

p = g_utf8_next_char (p);

1509

if (!is_asciialphanum (c))

return FALSE;

first_char = c;

do

{

last_char = c;

c = g_utf8_get_char (p);

1516

p = g_utf8_next_char (p);

1517

}

1518

while (is_asciialphanum (c) || c == '-');

1519

if (last_char == '-')

1520

return FALSE;

1521

1522

/* if that was the last label, check that it was a toplabel */

1523

if (c == '\0' || (c == '.' && *p == '\0'))

1524

return is_asciialpha (first_char);

}

while (c == '.');

return FALSE;

}

/**

* g_filename_from_uri:

1532

* @uri: a uri describing a filename (escaped, encoded in ASCII).

1533

* @hostname: (out) (optional) (nullable): Location to store hostname for the

1534

* URI.

1535

* If there is no hostname in the URI, %NULL will be

1536

* stored in this location.

1537

* @error: location to store the error occurring, or %NULL to ignore

1538

* errors. Any of the errors in #GConvertError may occur.

1539

*

1540

* Converts an escaped ASCII-encoded URI to a local filename in the

1541

* encoding used for filenames.

1542

*

1543

* Returns: (type filename): a newly-allocated string holding

1544

* the resulting filename, or %NULL on an error.

1545

**/

1546

gchar *

1547

g_filename_from_uri (const gchar *uri,

gchar **hostname,

GError **error)

{

const char *path_part;

1552

const char *host_part;

1553

char *unescaped_hostname;

char *result;

char *filename;

int offs;

#ifdef G_OS_WIN32

char *p, *slash;

#endif

if (hostname)

*hostname = NULL;

if (!has_case_prefix (uri, "file:/"))

1565

{

1566

g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,

1567

_("The URI '%s' is not an absolute URI using the \"file\" scheme"),

uri);

return NULL;

}

path_part = uri + strlen ("file:");

1573

1574

if (strchr (path_part, '#') != NULL)

1575

{

1576

g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,

1577

_("The local file URI '%s' may not include a '#'"),

uri);

return NULL;

}

if (has_case_prefix (path_part, "///"))

1583

path_part += 2;

1584

else if (has_case_prefix (path_part, "//"))

1585

{

1586

path_part += 2;

1587

host_part = path_part;

1588

1589

path_part = strchr (path_part, '/');

1590

1591

if (path_part == NULL)

1592

{

1593

g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,

1594

_("The URI '%s' is invalid"),

uri);

return NULL;

}

unescaped_hostname = g_unescape_uri_string (host_part, path_part - host_part, "", TRUE);

1600

1601

if (unescaped_hostname == NULL ||

1602

!hostname_validate (unescaped_hostname))

1603

{

1604

g_free (unescaped_hostname);

1605

g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,

1606

_("The hostname of the URI '%s' is invalid"),

uri);

return NULL;

}

if (hostname)

*hostname = unescaped_hostname;

1613

else

1614

g_free (unescaped_hostname);

1615

}

1616

1617

filename = g_unescape_uri_string (path_part, -1, "/", FALSE);

1618

1619

if (filename == NULL)

1620

{

1621

g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI,

1622

_("The URI '%s' contains invalidly escaped characters"),

uri);

return NULL;

}

offs = 0;

#ifdef G_OS_WIN32

/* Drop localhost */

1630

if (hostname && *hostname != NULL &&

1631

g_ascii_strcasecmp (*hostname, "localhost") == 0)

1632

{

1633

g_free (*hostname);

*hostname = NULL;

}

/* Turn slashes into backslashes, because that's the canonical spelling */

1638

p = filename;

1639

while ((slash = strchr (p, '/')) != NULL)

{

*slash = '\\';

p = slash + 1;

}

/* Windows URIs with a drive letter can be like "file://host/c:/foo"

1646

* or "file://host/c|/foo" (some Netscape versions). In those cases, start

1647

* the filename from the drive letter.

1648

*/

1649

if (g_ascii_isalpha (filename[1]))

1650

{

1651

if (filename[2] == ':')

1652

offs = 1;

1653

else if (filename[2] == '|')

{

filename[2] = ':';

offs = 1;

}

}

#endif

result = g_strdup (filename + offs);

g_free (filename);

return result;

}

#if defined (G_OS_WIN32) && !defined (_WIN64)

1668

1669

#undef g_filename_from_uri

1670

1671

gchar *

1672

g_filename_from_uri (const gchar *uri,

gchar **hostname,

GError **error)

{

gchar *utf8_filename;

1677

gchar *retval = NULL;

1678

1679

utf8_filename = g_filename_from_uri_utf8 (uri, hostname, error);

1680

if (utf8_filename)

1681

{

1682

retval = g_locale_from_utf8 (utf8_filename, -1, NULL, NULL, error);

1683

g_free (utf8_filename);

}

return retval;

}

#endif

/**

* g_filename_to_uri:

1692

* @filename: an absolute filename specified in the GLib file name encoding,

1693

* which is the on-disk file name bytes on Unix, and UTF-8 on

1694

* Windows

1695

* @hostname: (allow-none): A UTF-8 encoded hostname, or %NULL for none.

1696

* @error: location to store the error occurring, or %NULL to ignore

1697

* errors. Any of the errors in #GConvertError may occur.

1698

*

1699

* Converts an absolute filename to an escaped ASCII-encoded URI, with the path

1700

* component following Section 3.3. of RFC 2396.

1701

*

1702

* Returns: a newly-allocated string holding the resulting

1703

* URI, or %NULL on an error.

1704

**/

1705

gchar *

1706

g_filename_to_uri (const gchar *filename,

1707

const gchar *hostname,

GError **error)

{

char *escaped_uri;

g_return_val_if_fail (filename != NULL, NULL);

1713

1714

if (!g_path_is_absolute (filename))

1715

{

1716

g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH,

1717

_("The pathname '%s' is not an absolute path"),

filename);

return NULL;

}

if (hostname &&

!(g_utf8_validate (hostname, -1, NULL)

1724

&& hostname_validate (hostname)))

1725

{

1726

g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,

1727

_("Invalid hostname"));

return NULL;

}

#ifdef G_OS_WIN32

/* Don't use localhost unnecessarily */

1733

if (hostname && g_ascii_strcasecmp (hostname, "localhost") == 0)

hostname = NULL;

#endif

escaped_uri = g_escape_file_uri (hostname, filename);

1738

1739

return escaped_uri;

1740

}

1741

1742

#if defined (G_OS_WIN32) && !defined (_WIN64)

1743

1744

#undef g_filename_to_uri

1745

1746

gchar *

1747

g_filename_to_uri (const gchar *filename,

1748

const gchar *hostname,

1749

GError **error)

1750

{

1751

gchar *utf8_filename;

1752

gchar *retval = NULL;

1753

1754

utf8_filename = g_locale_to_utf8 (filename, -1, NULL, NULL, error);

if (utf8_filename)

{

retval = g_filename_to_uri_utf8 (utf8_filename, hostname, error);

1759

g_free (utf8_filename);

}

return retval;

}

#endif

/**

* g_uri_list_extract_uris:

1769

* @uri_list: a URI list

1770

*

1771

* Splits a URI list conforming to the text/uri-list

1772

* mime type defined in RFC 2483 into individual URIs,

1773

* discarding any comments. The URIs are not validated.

1774

*

1775

* Returns: (transfer full): a newly allocated %NULL-terminated list

1776

* of strings holding the individual URIs. The array should be freed

1777

* with g_strfreev().

*

* Since: 2.6

*/

gchar **

g_uri_list_extract_uris (const gchar *uri_list)

1783

{

1784

GSList *uris, *u;

1785

const gchar *p, *q;

gchar **result;

gint n_uris = 0;

uris = NULL;

p = uri_list;

/* We don't actually try to validate the URI according to RFC

1794

* 2396, or even check for allowed characters - we just ignore

1795

* comments and trim whitespace off the ends. We also

1796

* allow LF delimination as well as the specified CRLF.

1797

*

1798

* We do allow comments like specified in RFC 2483.

*/

while (p)

{

if (*p != '#')

{

while (g_ascii_isspace (*p))

p++;

q = p;

while (*q && (*q != '\n') && (*q != '\r'))

q++;

if (q > p)

{

q--;

while (q > p && g_ascii_isspace (*q))

q--;

if (q > p)

{

uris = g_slist_prepend (uris, g_strndup (p, q - p + 1));

n_uris++;

}

}

}

p = strchr (p, '\n');

if (p)

p++;

}

result = g_new (gchar *, n_uris + 1);

1830

1831

result[n_uris--] = NULL;

1832

for (u = uris; u; u = u->next)

1833

result[n_uris--] = u->data;

1834

1835

g_slist_free (uris);

return result;

}

/**

* g_filename_display_basename:

1842

* @filename: an absolute pathname in the GLib file name encoding

1843

*

1844

* Returns the display basename for the particular filename, guaranteed

1845

* to be valid UTF-8. The display name might not be identical to the filename,

1846

* for instance there might be problems converting it to UTF-8, and some files

1847

* can be translated in the display.

1848

*

1849

* If GLib cannot make sense of the encoding of @filename, as a last resort it

1850

* replaces unknown characters with U+FFFD, the Unicode replacement character.

1851

* You can search the result for the UTF-8 encoding of this character (which is

1852

* "\357\277\275" in octal notation) to find out if @filename was in an invalid

1853

* encoding.

1854

*

1855

* You must pass the whole absolute pathname to this function so that

1856

* translation of well known locations can be done.

1857

*

1858

* This function is preferred over g_filename_display_name() if you know the

1859

* whole path, as it allows translation.

1860

*

1861

* Returns: a newly allocated string containing

1862

* a rendition of the basename of the filename in valid UTF-8

*

* Since: 2.6

**/

gchar *

g_filename_display_basename (const gchar *filename)

1868

{

1869

char *basename;

1870

char *display_name;

1871

1872

g_return_val_if_fail (filename != NULL, NULL);

1873

1874

basename = g_path_get_basename (filename);

1875

display_name = g_filename_display_name (basename);

1876

g_free (basename);

1877

return display_name;

}

/**

* g_filename_display_name:

1882

* @filename: a pathname hopefully in the GLib file name encoding

1883

*

1884

* Converts a filename into a valid UTF-8 string. The conversion is

1885

* not necessarily reversible, so you should keep the original around

1886

* and use the return value of this function only for display purposes.

1887

* Unlike g_filename_to_utf8(), the result is guaranteed to be non-%NULL

1888

* even if the filename actually isn't in the GLib file name encoding.

1889

*

1890

* If GLib cannot make sense of the encoding of @filename, as a last resort it

1891

* replaces unknown characters with U+FFFD, the Unicode replacement character.

1892

* You can search the result for the UTF-8 encoding of this character (which is

1893

* "\357\277\275" in octal notation) to find out if @filename was in an invalid

1894

* encoding.

1895

*

1896

* If you know the whole pathname of the file you should use

1897

* g_filename_display_basename(), since that allows location-based

1898

* translation of filenames.

1899

*

1900

* Returns: a newly allocated string containing

1901

* a rendition of the filename in valid UTF-8

*

* Since: 2.6

**/

gchar *

g_filename_display_name (const gchar *filename)

1907

{

1908

gint i;

1909

const gchar **charsets;

1910

gchar *display_name = NULL;

1911

gboolean is_utf8;

1912

1913

is_utf8 = g_get_filename_charsets (&charsets);

if (is_utf8)

{

if (g_utf8_validate (filename, -1, NULL))

1918

display_name = g_strdup (filename);

}

if (!display_name)

{

/* Try to convert from the filename charsets to UTF-8.

1924

* Skip the first charset if it is UTF-8.

1925

*/

1926

for (i = is_utf8 ? 1 : 0; charsets[i]; i++)

1927

{

1928

display_name = g_convert (filename, -1, "UTF-8", charsets[i],

NULL, NULL, NULL);

if (display_name)

break;

}

}

/* if all conversions failed, we replace invalid UTF-8

1937

* by a question mark

1938

*/

1939

if (!display_name)

1940

display_name = _g_utf8_make_valid (filename);

1941

1942

return display_name;

1943

}

nexmon – Blame information for rev 1