nexmon – Blame information for rev 1
?pathlinks?
Rev | Author | Line No. | Line |
---|---|---|---|
1 | office | 1 | /* GLIB - Library of useful routines for C programming |
2 | * |
||
3 | * gconvert.c: Convert between character sets using iconv |
||
4 | * Copyright Red Hat Inc., 2000 |
||
5 | * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com> |
||
6 | * |
||
7 | * This library is free software; you can redistribute it and/or |
||
8 | * modify it under the terms of the GNU Lesser General Public |
||
9 | * License as published by the Free Software Foundation; either |
||
10 | * version 2 of the License, or (at your option) any later version. |
||
11 | * |
||
12 | * This library is distributed in the hope that it will be useful, |
||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
15 | * Lesser General Public License for more details. |
||
16 | * |
||
17 | * You should have received a copy of the GNU Lesser General Public |
||
18 | * License along with this library; if not, see <http://www.gnu.org/licenses/>. |
||
19 | */ |
||
20 | |||
21 | #include "config.h" |
||
22 | #include "glibconfig.h" |
||
23 | |||
24 | #ifndef G_OS_WIN32 |
||
25 | #include <iconv.h> |
||
26 | #endif |
||
27 | #include <errno.h> |
||
28 | #include <stdio.h> |
||
29 | #include <string.h> |
||
30 | #include <stdlib.h> |
||
31 | |||
32 | #ifdef G_OS_WIN32 |
||
33 | #include "win_iconv.c" |
||
34 | #endif |
||
35 | |||
36 | #ifdef G_PLATFORM_WIN32 |
||
37 | #define STRICT |
||
38 | #include <windows.h> |
||
39 | #undef STRICT |
||
40 | #endif |
||
41 | |||
42 | #include "gconvert.h" |
||
43 | |||
44 | #include "gcharsetprivate.h" |
||
45 | #include "gslist.h" |
||
46 | #include "gstrfuncs.h" |
||
47 | #include "gtestutils.h" |
||
48 | #include "gthread.h" |
||
49 | #include "gunicode.h" |
||
50 | #include "gfileutils.h" |
||
51 | |||
52 | #include "glibintl.h" |
||
53 | |||
54 | #if defined(USE_LIBICONV_GNU) && !defined (_LIBICONV_H) |
||
55 | #error GNU libiconv in use but included iconv.h not from libiconv |
||
56 | #endif |
||
57 | #if !defined(USE_LIBICONV_GNU) && defined (_LIBICONV_H) \ |
||
58 | && !defined (__APPLE_CC__) && !defined (__LP_64__) |
||
59 | #error GNU libiconv not in use but included iconv.h is from libiconv |
||
60 | #endif |
||
61 | |||
62 | |||
63 | /** |
||
64 | * SECTION:conversions |
||
65 | * @title: Character Set Conversion |
||
66 | * @short_description: convert strings between different character sets |
||
67 | * |
||
68 | * The g_convert() family of functions wraps the functionality of iconv(). |
||
69 | * In addition to pure character set conversions, GLib has functions to |
||
70 | * deal with the extra complications of encodings for file names. |
||
71 | * |
||
72 | * ## File Name Encodings |
||
73 | * |
||
74 | * Historically, UNIX has not had a defined encoding for file names: |
||
75 | * a file name is valid as long as it does not have path separators |
||
76 | * in it ("/"). However, displaying file names may require conversion: |
||
77 | * from the character set in which they were created, to the character |
||
78 | * set in which the application operates. Consider the Spanish file name |
||
79 | * "Presentación.sxi". If the application which created it uses |
||
80 | * ISO-8859-1 for its encoding, |
||
81 | * |[ |
||
82 | * Character: P r e s e n t a c i ó n . s x i |
||
83 | * Hex code: 50 72 65 73 65 6e 74 61 63 69 f3 6e 2e 73 78 69 |
||
84 | * ]| |
||
85 | * However, if the application uses UTF-8, the actual file name on |
||
86 | * disk would look like this: |
||
87 | * |[ |
||
88 | * Character: P r e s e n t a c i ó n . s x i |
||
89 | * Hex code: 50 72 65 73 65 6e 74 61 63 69 c3 b3 6e 2e 73 78 69 |
||
90 | * ]| |
||
91 | * Glib uses UTF-8 for its strings, and GUI toolkits like GTK+ that use |
||
92 | * Glib do the same thing. If you get a file name from the file system, |
||
93 | * for example, from readdir() or from g_dir_read_name(), and you wish |
||
94 | * to display the file name to the user, you will need to convert it |
||
95 | * into UTF-8. The opposite case is when the user types the name of a |
||
96 | * file he wishes to save: the toolkit will give you that string in |
||
97 | * UTF-8 encoding, and you will need to convert it to the character |
||
98 | * set used for file names before you can create the file with open() |
||
99 | * or fopen(). |
||
100 | * |
||
101 | * By default, Glib assumes that file names on disk are in UTF-8 |
||
102 | * encoding. This is a valid assumption for file systems which |
||
103 | * were created relatively recently: most applications use UTF-8 |
||
104 | * encoding for their strings, and that is also what they use for |
||
105 | * the file names they create. However, older file systems may |
||
106 | * still contain file names created in "older" encodings, such as |
||
107 | * ISO-8859-1. In this case, for compatibility reasons, you may want |
||
108 | * to instruct Glib to use that particular encoding for file names |
||
109 | * rather than UTF-8. You can do this by specifying the encoding for |
||
110 | * file names in the [`G_FILENAME_ENCODING`][G_FILENAME_ENCODING] |
||
111 | * environment variable. For example, if your installation uses |
||
112 | * ISO-8859-1 for file names, you can put this in your `~/.profile` |
||
113 | * |[ |
||
114 | * export G_FILENAME_ENCODING=ISO-8859-1 |
||
115 | * ]| |
||
116 | * Glib provides the functions g_filename_to_utf8() and |
||
117 | * g_filename_from_utf8() to perform the necessary conversions. |
||
118 | * These functions convert file names from the encoding specified |
||
119 | * in `G_FILENAME_ENCODING` to UTF-8 and vice-versa. This |
||
120 | * [diagram][file-name-encodings-diagram] illustrates how |
||
121 | * these functions are used to convert between UTF-8 and the |
||
122 | * encoding for file names in the file system. |
||
123 | * |
||
124 | * ## Conversion between file name encodings # {#file-name-encodings-diagram} |
||
125 | * |
||
126 | *  |
||
127 | * |
||
128 | * ## Checklist for Application Writers |
||
129 | * |
||
130 | * This section is a practical summary of the detailed |
||
131 | |||
132 | * things to do to make sure your applications process file |
||
133 | * name encodings correctly. |
||
134 | * |
||
135 | * 1. If you get a file name from the file system from a function |
||
136 | * such as readdir() or gtk_file_chooser_get_filename(), you do |
||
137 | * not need to do any conversion to pass that file name to |
||
138 | * functions like open(), rename(), or fopen() -- those are "raw" |
||
139 | * file names which the file system understands. |
||
140 | * |
||
141 | * 2. If you need to display a file name, convert it to UTF-8 first |
||
142 | * by using g_filename_to_utf8(). If conversion fails, display a |
||
143 | * string like "Unknown file name". Do not convert this string back |
||
144 | * into the encoding used for file names if you wish to pass it to |
||
145 | * the file system; use the original file name instead. |
||
146 | * |
||
147 | * For example, the document window of a word processor could display |
||
148 | * "Unknown file name" in its title bar but still let the user save |
||
149 | * the file, as it would keep the raw file name internally. This |
||
150 | * can happen if the user has not set the `G_FILENAME_ENCODING` |
||
151 | * environment variable even though he has files whose names are |
||
152 | * not encoded in UTF-8. |
||
153 | * |
||
154 | * 3. If your user interface lets the user type a file name for saving |
||
155 | * or renaming, convert it to the encoding used for file names in |
||
156 | * the file system by using g_filename_from_utf8(). Pass the converted |
||
157 | * file name to functions like fopen(). If conversion fails, ask the |
||
158 | * user to enter a different file name. This can happen if the user |
||
159 | * types Japanese characters when `G_FILENAME_ENCODING` is set to |
||
160 | * `ISO-8859-1`, for example. |
||
161 | */ |
||
162 | |||
163 | /* We try to terminate strings in unknown charsets with this many zero bytes |
||
164 | * to ensure that multibyte strings really are nul-terminated when we return |
||
165 | * them from g_convert() and friends. |
||
166 | */ |
||
167 | #define NUL_TERMINATOR_LENGTH 4 |
||
168 | |||
169 | G_DEFINE_QUARK (g_convert_error, g_convert_error) |
||
170 | |||
171 | static gboolean |
||
172 | try_conversion (const char *to_codeset, |
||
173 | const char *from_codeset, |
||
174 | iconv_t *cd) |
||
175 | { |
||
176 | *cd = iconv_open (to_codeset, from_codeset); |
||
177 | |||
178 | if (*cd == (iconv_t)-1 && errno == EINVAL) |
||
179 | return FALSE; |
||
180 | else |
||
181 | return TRUE; |
||
182 | } |
||
183 | |||
184 | static gboolean |
||
185 | try_to_aliases (const char **to_aliases, |
||
186 | const char *from_codeset, |
||
187 | iconv_t *cd) |
||
188 | { |
||
189 | if (to_aliases) |
||
190 | { |
||
191 | const char **p = to_aliases; |
||
192 | while (*p) |
||
193 | { |
||
194 | if (try_conversion (*p, from_codeset, cd)) |
||
195 | return TRUE; |
||
196 | |||
197 | p++; |
||
198 | } |
||
199 | } |
||
200 | |||
201 | return FALSE; |
||
202 | } |
||
203 | |||
204 | /** |
||
205 | * g_iconv_open: |
||
206 | * @to_codeset: destination codeset |
||
207 | * @from_codeset: source codeset |
||
208 | * |
||
209 | * Same as the standard UNIX routine iconv_open(), but |
||
210 | * may be implemented via libiconv on UNIX flavors that lack |
||
211 | * a native implementation. |
||
212 | * |
||
213 | * GLib provides g_convert() and g_locale_to_utf8() which are likely |
||
214 | * more convenient than the raw iconv wrappers. |
||
215 | * |
||
216 | * Returns: a "conversion descriptor", or (GIConv)-1 if |
||
217 | * opening the converter failed. |
||
218 | **/ |
||
219 | GIConv |
||
220 | g_iconv_open (const gchar *to_codeset, |
||
221 | const gchar *from_codeset) |
||
222 | { |
||
223 | iconv_t cd; |
||
224 | |||
225 | if (!try_conversion (to_codeset, from_codeset, &cd)) |
||
226 | { |
||
227 | const char **to_aliases = _g_charset_get_aliases (to_codeset); |
||
228 | const char **from_aliases = _g_charset_get_aliases (from_codeset); |
||
229 | |||
230 | if (from_aliases) |
||
231 | { |
||
232 | const char **p = from_aliases; |
||
233 | while (*p) |
||
234 | { |
||
235 | if (try_conversion (to_codeset, *p, &cd)) |
||
236 | goto out; |
||
237 | |||
238 | if (try_to_aliases (to_aliases, *p, &cd)) |
||
239 | goto out; |
||
240 | |||
241 | p++; |
||
242 | } |
||
243 | } |
||
244 | |||
245 | if (try_to_aliases (to_aliases, from_codeset, &cd)) |
||
246 | goto out; |
||
247 | } |
||
248 | |||
249 | out: |
||
250 | return (cd == (iconv_t)-1) ? (GIConv)-1 : (GIConv)cd; |
||
251 | } |
||
252 | |||
253 | /** |
||
254 | * g_iconv: |
||
255 | * @converter: conversion descriptor from g_iconv_open() |
||
256 | * @inbuf: bytes to convert |
||
257 | * @inbytes_left: inout parameter, bytes remaining to convert in @inbuf |
||
258 | * @outbuf: converted output bytes |
||
259 | * @outbytes_left: inout parameter, bytes available to fill in @outbuf |
||
260 | * |
||
261 | * Same as the standard UNIX routine iconv(), but |
||
262 | * may be implemented via libiconv on UNIX flavors that lack |
||
263 | * a native implementation. |
||
264 | * |
||
265 | * GLib provides g_convert() and g_locale_to_utf8() which are likely |
||
266 | * more convenient than the raw iconv wrappers. |
||
267 | * |
||
268 | * Returns: count of non-reversible conversions, or -1 on error |
||
269 | **/ |
||
270 | gsize |
||
271 | g_iconv (GIConv converter, |
||
272 | gchar **inbuf, |
||
273 | gsize *inbytes_left, |
||
274 | gchar **outbuf, |
||
275 | gsize *outbytes_left) |
||
276 | { |
||
277 | iconv_t cd = (iconv_t)converter; |
||
278 | |||
279 | return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left); |
||
280 | } |
||
281 | |||
282 | /** |
||
283 | * g_iconv_close: |
||
284 | * @converter: a conversion descriptor from g_iconv_open() |
||
285 | * |
||
286 | * Same as the standard UNIX routine iconv_close(), but |
||
287 | * may be implemented via libiconv on UNIX flavors that lack |
||
288 | * a native implementation. Should be called to clean up |
||
289 | * the conversion descriptor from g_iconv_open() when |
||
290 | * you are done converting things. |
||
291 | * |
||
292 | * GLib provides g_convert() and g_locale_to_utf8() which are likely |
||
293 | * more convenient than the raw iconv wrappers. |
||
294 | * |
||
295 | * Returns: -1 on error, 0 on success |
||
296 | **/ |
||
297 | gint |
||
298 | g_iconv_close (GIConv converter) |
||
299 | { |
||
300 | iconv_t cd = (iconv_t)converter; |
||
301 | |||
302 | return iconv_close (cd); |
||
303 | } |
||
304 | |||
305 | static GIConv |
||
306 | open_converter (const gchar *to_codeset, |
||
307 | const gchar *from_codeset, |
||
308 | GError **error) |
||
309 | { |
||
310 | GIConv cd; |
||
311 | |||
312 | cd = g_iconv_open (to_codeset, from_codeset); |
||
313 | |||
314 | if (cd == (GIConv) -1) |
||
315 | { |
||
316 | /* Something went wrong. */ |
||
317 | if (error) |
||
318 | { |
||
319 | if (errno == EINVAL) |
||
320 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION, |
||
321 | _("Conversion from character set '%s' to '%s' is not supported"), |
||
322 | from_codeset, to_codeset); |
||
323 | else |
||
324 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, |
||
325 | _("Could not open converter from '%s' to '%s'"), |
||
326 | from_codeset, to_codeset); |
||
327 | } |
||
328 | } |
||
329 | |||
330 | return cd; |
||
331 | } |
||
332 | |||
333 | static int |
||
334 | close_converter (GIConv cd) |
||
335 | { |
||
336 | if (cd == (GIConv) -1) |
||
337 | return 0; |
||
338 | |||
339 | return g_iconv_close (cd); |
||
340 | } |
||
341 | |||
342 | /** |
||
343 | * g_convert_with_iconv: |
||
344 | * @str: the string to convert |
||
345 | * @len: the length of the string in bytes, or -1 if the string is |
||
346 | * nul-terminated (Note that some encodings may allow nul |
||
347 | * bytes to occur inside strings. In that case, using -1 |
||
348 | * for the @len parameter is unsafe) |
||
349 | * @converter: conversion descriptor from g_iconv_open() |
||
350 | * @bytes_read: location to store the number of bytes in the |
||
351 | * input string that were successfully converted, or %NULL. |
||
352 | * Even if the conversion was successful, this may be |
||
353 | * less than @len if there were partial characters |
||
354 | * at the end of the input. If the error |
||
355 | * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
||
356 | * stored will be the byte offset after the last valid |
||
357 | * input sequence. |
||
358 | * @bytes_written: the number of bytes stored in the output buffer (not |
||
359 | * including the terminating nul). |
||
360 | * @error: location to store the error occurring, or %NULL to ignore |
||
361 | * errors. Any of the errors in #GConvertError may occur. |
||
362 | * |
||
363 | * Converts a string from one character set to another. |
||
364 | * |
||
365 | * Note that you should use g_iconv() for streaming conversions. |
||
366 | * Despite the fact that @bytes_read can return information about partial |
||
367 | * characters, the g_convert_... functions are not generally suitable |
||
368 | * for streaming. If the underlying converter maintains internal state, |
||
369 | * then this won't be preserved across successive calls to g_convert(), |
||
370 | * g_convert_with_iconv() or g_convert_with_fallback(). (An example of |
||
371 | * this is the GNU C converter for CP1255 which does not emit a base |
||
372 | * character until it knows that the next character is not a mark that |
||
373 | * could combine with the base character.) |
||
374 | * |
||
375 | * Returns: If the conversion was successful, a newly allocated |
||
376 | * nul-terminated string, which must be freed with |
||
377 | * g_free(). Otherwise %NULL and @error will be set. |
||
378 | **/ |
||
379 | gchar* |
||
380 | g_convert_with_iconv (const gchar *str, |
||
381 | gssize len, |
||
382 | GIConv converter, |
||
383 | gsize *bytes_read, |
||
384 | gsize *bytes_written, |
||
385 | GError **error) |
||
386 | { |
||
387 | gchar *dest; |
||
388 | gchar *outp; |
||
389 | const gchar *p; |
||
390 | gsize inbytes_remaining; |
||
391 | gsize outbytes_remaining; |
||
392 | gsize err; |
||
393 | gsize outbuf_size; |
||
394 | gboolean have_error = FALSE; |
||
395 | gboolean done = FALSE; |
||
396 | gboolean reset = FALSE; |
||
397 | |||
398 | g_return_val_if_fail (converter != (GIConv) -1, NULL); |
||
399 | |||
400 | if (len < 0) |
||
401 | len = strlen (str); |
||
402 | |||
403 | p = str; |
||
404 | inbytes_remaining = len; |
||
405 | outbuf_size = len + NUL_TERMINATOR_LENGTH; |
||
406 | |||
407 | outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH; |
||
408 | outp = dest = g_malloc (outbuf_size); |
||
409 | |||
410 | while (!done && !have_error) |
||
411 | { |
||
412 | if (reset) |
||
413 | err = g_iconv (converter, NULL, &inbytes_remaining, &outp, &outbytes_remaining); |
||
414 | else |
||
415 | err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining); |
||
416 | |||
417 | if (err == (gsize) -1) |
||
418 | { |
||
419 | switch (errno) |
||
420 | { |
||
421 | case EINVAL: |
||
422 | /* Incomplete text, do not report an error */ |
||
423 | done = TRUE; |
||
424 | break; |
||
425 | case E2BIG: |
||
426 | { |
||
427 | gsize used = outp - dest; |
||
428 | |||
429 | outbuf_size *= 2; |
||
430 | dest = g_realloc (dest, outbuf_size); |
||
431 | |||
432 | outp = dest + used; |
||
433 | outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH; |
||
434 | } |
||
435 | break; |
||
436 | case EILSEQ: |
||
437 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
||
438 | _("Invalid byte sequence in conversion input")); |
||
439 | have_error = TRUE; |
||
440 | break; |
||
441 | default: |
||
442 | { |
||
443 | int errsv = errno; |
||
444 | |||
445 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, |
||
446 | _("Error during conversion: %s"), |
||
447 | g_strerror (errsv)); |
||
448 | } |
||
449 | have_error = TRUE; |
||
450 | break; |
||
451 | } |
||
452 | } |
||
453 | else |
||
454 | { |
||
455 | if (!reset) |
||
456 | { |
||
457 | /* call g_iconv with NULL inbuf to cleanup shift state */ |
||
458 | reset = TRUE; |
||
459 | inbytes_remaining = 0; |
||
460 | } |
||
461 | else |
||
462 | done = TRUE; |
||
463 | } |
||
464 | } |
||
465 | |||
466 | memset (outp, 0, NUL_TERMINATOR_LENGTH); |
||
467 | |||
468 | if (bytes_read) |
||
469 | *bytes_read = p - str; |
||
470 | else |
||
471 | { |
||
472 | if ((p - str) != len) |
||
473 | { |
||
474 | if (!have_error) |
||
475 | { |
||
476 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, |
||
477 | _("Partial character sequence at end of input")); |
||
478 | have_error = TRUE; |
||
479 | } |
||
480 | } |
||
481 | } |
||
482 | |||
483 | if (bytes_written) |
||
484 | *bytes_written = outp - dest; /* Doesn't include '\0' */ |
||
485 | |||
486 | if (have_error) |
||
487 | { |
||
488 | g_free (dest); |
||
489 | return NULL; |
||
490 | } |
||
491 | else |
||
492 | return dest; |
||
493 | } |
||
494 | |||
495 | /** |
||
496 | * g_convert: |
||
497 | * @str: the string to convert |
||
498 | * @len: the length of the string in bytes, or -1 if the string is |
||
499 | * nul-terminated (Note that some encodings may allow nul |
||
500 | * bytes to occur inside strings. In that case, using -1 |
||
501 | * for the @len parameter is unsafe) |
||
502 | * @to_codeset: name of character set into which to convert @str |
||
503 | * @from_codeset: character set of @str. |
||
504 | * @bytes_read: (out): location to store the number of bytes in the |
||
505 | * input string that were successfully converted, or %NULL. |
||
506 | * Even if the conversion was successful, this may be |
||
507 | * less than @len if there were partial characters |
||
508 | * at the end of the input. If the error |
||
509 | * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
||
510 | * stored will be the byte offset after the last valid |
||
511 | * input sequence. |
||
512 | * @bytes_written: (out): the number of bytes stored in the output buffer (not |
||
513 | * including the terminating nul). |
||
514 | * @error: location to store the error occurring, or %NULL to ignore |
||
515 | * errors. Any of the errors in #GConvertError may occur. |
||
516 | * |
||
517 | * Converts a string from one character set to another. |
||
518 | * |
||
519 | * Note that you should use g_iconv() for streaming conversions. |
||
520 | * Despite the fact that @bytes_read can return information about partial |
||
521 | * characters, the g_convert_... functions are not generally suitable |
||
522 | * for streaming. If the underlying converter maintains internal state, |
||
523 | * then this won't be preserved across successive calls to g_convert(), |
||
524 | * g_convert_with_iconv() or g_convert_with_fallback(). (An example of |
||
525 | * this is the GNU C converter for CP1255 which does not emit a base |
||
526 | * character until it knows that the next character is not a mark that |
||
527 | * could combine with the base character.) |
||
528 | * |
||
529 | * Using extensions such as "//TRANSLIT" may not work (or may not work |
||
530 | * well) on many platforms. Consider using g_str_to_ascii() instead. |
||
531 | * |
||
532 | * Returns: If the conversion was successful, a newly allocated |
||
533 | * nul-terminated string, which must be freed with |
||
534 | * g_free(). Otherwise %NULL and @error will be set. |
||
535 | **/ |
||
536 | gchar* |
||
537 | g_convert (const gchar *str, |
||
538 | gssize len, |
||
539 | const gchar *to_codeset, |
||
540 | const gchar *from_codeset, |
||
541 | gsize *bytes_read, |
||
542 | gsize *bytes_written, |
||
543 | GError **error) |
||
544 | { |
||
545 | gchar *res; |
||
546 | GIConv cd; |
||
547 | |||
548 | g_return_val_if_fail (str != NULL, NULL); |
||
549 | g_return_val_if_fail (to_codeset != NULL, NULL); |
||
550 | g_return_val_if_fail (from_codeset != NULL, NULL); |
||
551 | |||
552 | cd = open_converter (to_codeset, from_codeset, error); |
||
553 | |||
554 | if (cd == (GIConv) -1) |
||
555 | { |
||
556 | if (bytes_read) |
||
557 | *bytes_read = 0; |
||
558 | |||
559 | if (bytes_written) |
||
560 | *bytes_written = 0; |
||
561 | |||
562 | return NULL; |
||
563 | } |
||
564 | |||
565 | res = g_convert_with_iconv (str, len, cd, |
||
566 | bytes_read, bytes_written, |
||
567 | error); |
||
568 | |||
569 | close_converter (cd); |
||
570 | |||
571 | return res; |
||
572 | } |
||
573 | |||
574 | /** |
||
575 | * g_convert_with_fallback: |
||
576 | * @str: the string to convert |
||
577 | * @len: the length of the string in bytes, or -1 if the string is |
||
578 | * nul-terminated (Note that some encodings may allow nul |
||
579 | * bytes to occur inside strings. In that case, using -1 |
||
580 | * for the @len parameter is unsafe) |
||
581 | * @to_codeset: name of character set into which to convert @str |
||
582 | * @from_codeset: character set of @str. |
||
583 | * @fallback: UTF-8 string to use in place of character not |
||
584 | * present in the target encoding. (The string must be |
||
585 | * representable in the target encoding). |
||
586 | If %NULL, characters not in the target encoding will |
||
587 | be represented as Unicode escapes \uxxxx or \Uxxxxyyyy. |
||
588 | * @bytes_read: location to store the number of bytes in the |
||
589 | * input string that were successfully converted, or %NULL. |
||
590 | * Even if the conversion was successful, this may be |
||
591 | * less than @len if there were partial characters |
||
592 | * at the end of the input. |
||
593 | * @bytes_written: the number of bytes stored in the output buffer (not |
||
594 | * including the terminating nul). |
||
595 | * @error: location to store the error occurring, or %NULL to ignore |
||
596 | * errors. Any of the errors in #GConvertError may occur. |
||
597 | * |
||
598 | * Converts a string from one character set to another, possibly |
||
599 | * including fallback sequences for characters not representable |
||
600 | * in the output. Note that it is not guaranteed that the specification |
||
601 | * for the fallback sequences in @fallback will be honored. Some |
||
602 | * systems may do an approximate conversion from @from_codeset |
||
603 | * to @to_codeset in their iconv() functions, |
||
604 | * in which case GLib will simply return that approximate conversion. |
||
605 | * |
||
606 | * Note that you should use g_iconv() for streaming conversions. |
||
607 | * Despite the fact that @bytes_read can return information about partial |
||
608 | * characters, the g_convert_... functions are not generally suitable |
||
609 | * for streaming. If the underlying converter maintains internal state, |
||
610 | * then this won't be preserved across successive calls to g_convert(), |
||
611 | * g_convert_with_iconv() or g_convert_with_fallback(). (An example of |
||
612 | * this is the GNU C converter for CP1255 which does not emit a base |
||
613 | * character until it knows that the next character is not a mark that |
||
614 | * could combine with the base character.) |
||
615 | * |
||
616 | * Returns: If the conversion was successful, a newly allocated |
||
617 | * nul-terminated string, which must be freed with |
||
618 | * g_free(). Otherwise %NULL and @error will be set. |
||
619 | **/ |
||
620 | gchar* |
||
621 | g_convert_with_fallback (const gchar *str, |
||
622 | gssize len, |
||
623 | const gchar *to_codeset, |
||
624 | const gchar *from_codeset, |
||
625 | const gchar *fallback, |
||
626 | gsize *bytes_read, |
||
627 | gsize *bytes_written, |
||
628 | GError **error) |
||
629 | { |
||
630 | gchar *utf8; |
||
631 | gchar *dest; |
||
632 | gchar *outp; |
||
633 | const gchar *insert_str = NULL; |
||
634 | const gchar *p; |
||
635 | gsize inbytes_remaining; |
||
636 | const gchar *save_p = NULL; |
||
637 | gsize save_inbytes = 0; |
||
638 | gsize outbytes_remaining; |
||
639 | gsize err; |
||
640 | GIConv cd; |
||
641 | gsize outbuf_size; |
||
642 | gboolean have_error = FALSE; |
||
643 | gboolean done = FALSE; |
||
644 | |||
645 | GError *local_error = NULL; |
||
646 | |||
647 | g_return_val_if_fail (str != NULL, NULL); |
||
648 | g_return_val_if_fail (to_codeset != NULL, NULL); |
||
649 | g_return_val_if_fail (from_codeset != NULL, NULL); |
||
650 | |||
651 | if (len < 0) |
||
652 | len = strlen (str); |
||
653 | |||
654 | /* Try an exact conversion; we only proceed if this fails |
||
655 | * due to an illegal sequence in the input string. |
||
656 | */ |
||
657 | dest = g_convert (str, len, to_codeset, from_codeset, |
||
658 | bytes_read, bytes_written, &local_error); |
||
659 | if (!local_error) |
||
660 | return dest; |
||
661 | |||
662 | if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE)) |
||
663 | { |
||
664 | g_propagate_error (error, local_error); |
||
665 | return NULL; |
||
666 | } |
||
667 | else |
||
668 | g_error_free (local_error); |
||
669 | |||
670 | local_error = NULL; |
||
671 | |||
672 | /* No go; to proceed, we need a converter from "UTF-8" to |
||
673 | * to_codeset, and the string as UTF-8. |
||
674 | */ |
||
675 | cd = open_converter (to_codeset, "UTF-8", error); |
||
676 | if (cd == (GIConv) -1) |
||
677 | { |
||
678 | if (bytes_read) |
||
679 | *bytes_read = 0; |
||
680 | |||
681 | if (bytes_written) |
||
682 | *bytes_written = 0; |
||
683 | |||
684 | return NULL; |
||
685 | } |
||
686 | |||
687 | utf8 = g_convert (str, len, "UTF-8", from_codeset, |
||
688 | bytes_read, &inbytes_remaining, error); |
||
689 | if (!utf8) |
||
690 | { |
||
691 | close_converter (cd); |
||
692 | if (bytes_written) |
||
693 | *bytes_written = 0; |
||
694 | return NULL; |
||
695 | } |
||
696 | |||
697 | /* Now the heart of the code. We loop through the UTF-8 string, and |
||
698 | * whenever we hit an offending character, we form fallback, convert |
||
699 | * the fallback to the target codeset, and then go back to |
||
700 | * converting the original string after finishing with the fallback. |
||
701 | * |
||
702 | * The variables save_p and save_inbytes store the input state |
||
703 | * for the original string while we are converting the fallback |
||
704 | */ |
||
705 | p = utf8; |
||
706 | |||
707 | outbuf_size = len + NUL_TERMINATOR_LENGTH; |
||
708 | outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH; |
||
709 | outp = dest = g_malloc (outbuf_size); |
||
710 | |||
711 | while (!done && !have_error) |
||
712 | { |
||
713 | gsize inbytes_tmp = inbytes_remaining; |
||
714 | err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining); |
||
715 | inbytes_remaining = inbytes_tmp; |
||
716 | |||
717 | if (err == (gsize) -1) |
||
718 | { |
||
719 | switch (errno) |
||
720 | { |
||
721 | case EINVAL: |
||
722 | g_assert_not_reached(); |
||
723 | break; |
||
724 | case E2BIG: |
||
725 | { |
||
726 | gsize used = outp - dest; |
||
727 | |||
728 | outbuf_size *= 2; |
||
729 | dest = g_realloc (dest, outbuf_size); |
||
730 | |||
731 | outp = dest + used; |
||
732 | outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH; |
||
733 | |||
734 | break; |
||
735 | } |
||
736 | case EILSEQ: |
||
737 | if (save_p) |
||
738 | { |
||
739 | /* Error converting fallback string - fatal |
||
740 | */ |
||
741 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
||
742 | _("Cannot convert fallback '%s' to codeset '%s'"), |
||
743 | insert_str, to_codeset); |
||
744 | have_error = TRUE; |
||
745 | break; |
||
746 | } |
||
747 | else if (p) |
||
748 | { |
||
749 | if (!fallback) |
||
750 | { |
||
751 | gunichar ch = g_utf8_get_char (p); |
||
752 | insert_str = g_strdup_printf (ch < 0x10000 ? "\\u%04x" : "\\U%08x", |
||
753 | ch); |
||
754 | } |
||
755 | else |
||
756 | insert_str = fallback; |
||
757 | |||
758 | save_p = g_utf8_next_char (p); |
||
759 | save_inbytes = inbytes_remaining - (save_p - p); |
||
760 | p = insert_str; |
||
761 | inbytes_remaining = strlen (p); |
||
762 | break; |
||
763 | } |
||
764 | /* fall thru if p is NULL */ |
||
765 | default: |
||
766 | { |
||
767 | int errsv = errno; |
||
768 | |||
769 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, |
||
770 | _("Error during conversion: %s"), |
||
771 | g_strerror (errsv)); |
||
772 | } |
||
773 | |||
774 | have_error = TRUE; |
||
775 | break; |
||
776 | } |
||
777 | } |
||
778 | else |
||
779 | { |
||
780 | if (save_p) |
||
781 | { |
||
782 | if (!fallback) |
||
783 | g_free ((gchar *)insert_str); |
||
784 | p = save_p; |
||
785 | inbytes_remaining = save_inbytes; |
||
786 | save_p = NULL; |
||
787 | } |
||
788 | else if (p) |
||
789 | { |
||
790 | /* call g_iconv with NULL inbuf to cleanup shift state */ |
||
791 | p = NULL; |
||
792 | inbytes_remaining = 0; |
||
793 | } |
||
794 | else |
||
795 | done = TRUE; |
||
796 | } |
||
797 | } |
||
798 | |||
799 | /* Cleanup |
||
800 | */ |
||
801 | memset (outp, 0, NUL_TERMINATOR_LENGTH); |
||
802 | |||
803 | close_converter (cd); |
||
804 | |||
805 | if (bytes_written) |
||
806 | *bytes_written = outp - dest; /* Doesn't include '\0' */ |
||
807 | |||
808 | g_free (utf8); |
||
809 | |||
810 | if (have_error) |
||
811 | { |
||
812 | if (save_p && !fallback) |
||
813 | g_free ((gchar *)insert_str); |
||
814 | g_free (dest); |
||
815 | return NULL; |
||
816 | } |
||
817 | else |
||
818 | return dest; |
||
819 | } |
||
820 | |||
821 | /* |
||
822 | * g_locale_to_utf8 |
||
823 | * |
||
824 | * |
||
825 | */ |
||
826 | |||
827 | static gchar * |
||
828 | strdup_len (const gchar *string, |
||
829 | gssize len, |
||
830 | gsize *bytes_written, |
||
831 | gsize *bytes_read, |
||
832 | GError **error) |
||
833 | |||
834 | { |
||
835 | gsize real_len; |
||
836 | |||
837 | if (!g_utf8_validate (string, len, NULL)) |
||
838 | { |
||
839 | if (bytes_read) |
||
840 | *bytes_read = 0; |
||
841 | if (bytes_written) |
||
842 | *bytes_written = 0; |
||
843 | |||
844 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
||
845 | _("Invalid byte sequence in conversion input")); |
||
846 | return NULL; |
||
847 | } |
||
848 | |||
849 | if (len < 0) |
||
850 | real_len = strlen (string); |
||
851 | else |
||
852 | { |
||
853 | real_len = 0; |
||
854 | |||
855 | while (real_len < len && string[real_len]) |
||
856 | real_len++; |
||
857 | } |
||
858 | |||
859 | if (bytes_read) |
||
860 | *bytes_read = real_len; |
||
861 | if (bytes_written) |
||
862 | *bytes_written = real_len; |
||
863 | |||
864 | return g_strndup (string, real_len); |
||
865 | } |
||
866 | |||
867 | /** |
||
868 | * g_locale_to_utf8: |
||
869 | * @opsysstring: a string in the encoding of the current locale. On Windows |
||
870 | * this means the system codepage. |
||
871 | * @len: the length of the string, or -1 if the string is |
||
872 | * nul-terminated (Note that some encodings may allow nul |
||
873 | * bytes to occur inside strings. In that case, using -1 |
||
874 | * for the @len parameter is unsafe) |
||
875 | * @bytes_read: (out) (optional): location to store the number of bytes in the |
||
876 | * input string that were successfully converted, or %NULL. |
||
877 | * Even if the conversion was successful, this may be |
||
878 | * less than @len if there were partial characters |
||
879 | * at the end of the input. If the error |
||
880 | * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
||
881 | * stored will the byte offset after the last valid |
||
882 | * input sequence. |
||
883 | * @bytes_written: (out) (optional): the number of bytes stored in the output |
||
884 | * buffer (not including the terminating nul). |
||
885 | * @error: location to store the error occurring, or %NULL to ignore |
||
886 | * errors. Any of the errors in #GConvertError may occur. |
||
887 | * |
||
888 | * Converts a string which is in the encoding used for strings by |
||
889 | * the C runtime (usually the same as that used by the operating |
||
890 | * system) in the [current locale][setlocale] into a UTF-8 string. |
||
891 | * |
||
892 | * Returns: A newly-allocated buffer containing the converted string, |
||
893 | * or %NULL on an error, and error will be set. |
||
894 | **/ |
||
895 | gchar * |
||
896 | g_locale_to_utf8 (const gchar *opsysstring, |
||
897 | gssize len, |
||
898 | gsize *bytes_read, |
||
899 | gsize *bytes_written, |
||
900 | GError **error) |
||
901 | { |
||
902 | const char *charset; |
||
903 | |||
904 | if (g_get_charset (&charset)) |
||
905 | return strdup_len (opsysstring, len, bytes_read, bytes_written, error); |
||
906 | else |
||
907 | return g_convert (opsysstring, len, |
||
908 | "UTF-8", charset, bytes_read, bytes_written, error); |
||
909 | } |
||
910 | |||
911 | /** |
||
912 | * g_locale_from_utf8: |
||
913 | * @utf8string: a UTF-8 encoded string |
||
914 | * @len: the length of the string, or -1 if the string is |
||
915 | * nul-terminated (Note that some encodings may allow nul |
||
916 | * bytes to occur inside strings. In that case, using -1 |
||
917 | * for the @len parameter is unsafe) |
||
918 | * @bytes_read: (out) (optional): location to store the number of bytes in the |
||
919 | * input string that were successfully converted, or %NULL. |
||
920 | * Even if the conversion was successful, this may be |
||
921 | * less than @len if there were partial characters |
||
922 | * at the end of the input. If the error |
||
923 | * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
||
924 | * stored will the byte offset after the last valid |
||
925 | * input sequence. |
||
926 | * @bytes_written: (out) (optional): the number of bytes stored in the output |
||
927 | * buffer (not including the terminating nul). |
||
928 | * @error: location to store the error occurring, or %NULL to ignore |
||
929 | * errors. Any of the errors in #GConvertError may occur. |
||
930 | * |
||
931 | * Converts a string from UTF-8 to the encoding used for strings by |
||
932 | * the C runtime (usually the same as that used by the operating |
||
933 | * system) in the [current locale][setlocale]. On Windows this means |
||
934 | * the system codepage. |
||
935 | * |
||
936 | * Returns: A newly-allocated buffer containing the converted string, |
||
937 | * or %NULL on an error, and error will be set. |
||
938 | **/ |
||
939 | gchar * |
||
940 | g_locale_from_utf8 (const gchar *utf8string, |
||
941 | gssize len, |
||
942 | gsize *bytes_read, |
||
943 | gsize *bytes_written, |
||
944 | GError **error) |
||
945 | { |
||
946 | const gchar *charset; |
||
947 | |||
948 | if (g_get_charset (&charset)) |
||
949 | return strdup_len (utf8string, len, bytes_read, bytes_written, error); |
||
950 | else |
||
951 | return g_convert (utf8string, len, |
||
952 | charset, "UTF-8", bytes_read, bytes_written, error); |
||
953 | } |
||
954 | |||
955 | #ifndef G_PLATFORM_WIN32 |
||
956 | |||
957 | typedef struct _GFilenameCharsetCache GFilenameCharsetCache; |
||
958 | |||
959 | struct _GFilenameCharsetCache { |
||
960 | gboolean is_utf8; |
||
961 | gchar *charset; |
||
962 | gchar **filename_charsets; |
||
963 | }; |
||
964 | |||
965 | static void |
||
966 | filename_charset_cache_free (gpointer data) |
||
967 | { |
||
968 | GFilenameCharsetCache *cache = data; |
||
969 | g_free (cache->charset); |
||
970 | g_strfreev (cache->filename_charsets); |
||
971 | g_free (cache); |
||
972 | } |
||
973 | |||
974 | /** |
||
975 | * g_get_filename_charsets: |
||
976 | * @charsets: return location for the %NULL-terminated list of encoding names |
||
977 | * |
||
978 | * Determines the preferred character sets used for filenames. |
||
979 | * The first character set from the @charsets is the filename encoding, the |
||
980 | * subsequent character sets are used when trying to generate a displayable |
||
981 | * representation of a filename, see g_filename_display_name(). |
||
982 | * |
||
983 | * On Unix, the character sets are determined by consulting the |
||
984 | * environment variables `G_FILENAME_ENCODING` and `G_BROKEN_FILENAMES`. |
||
985 | * On Windows, the character set used in the GLib API is always UTF-8 |
||
986 | * and said environment variables have no effect. |
||
987 | * |
||
988 | * `G_FILENAME_ENCODING` may be set to a comma-separated list of |
||
989 | * character set names. The special token "\@locale" is taken |
||
990 | * to mean the character set for the [current locale][setlocale]. |
||
991 | * If `G_FILENAME_ENCODING` is not set, but `G_BROKEN_FILENAMES` is, |
||
992 | * the character set of the current locale is taken as the filename |
||
993 | * encoding. If neither environment variable is set, UTF-8 is taken |
||
994 | * as the filename encoding, but the character set of the current locale |
||
995 | * is also put in the list of encodings. |
||
996 | * |
||
997 | * The returned @charsets belong to GLib and must not be freed. |
||
998 | * |
||
999 | * Note that on Unix, regardless of the locale character set or |
||
1000 | * `G_FILENAME_ENCODING` value, the actual file names present |
||
1001 | * on a system might be in any random encoding or just gibberish. |
||
1002 | * |
||
1003 | * Returns: %TRUE if the filename encoding is UTF-8. |
||
1004 | * |
||
1005 | * Since: 2.6 |
||
1006 | */ |
||
1007 | gboolean |
||
1008 | g_get_filename_charsets (const gchar ***filename_charsets) |
||
1009 | { |
||
1010 | static GPrivate cache_private = G_PRIVATE_INIT (filename_charset_cache_free); |
||
1011 | GFilenameCharsetCache *cache = g_private_get (&cache_private); |
||
1012 | const gchar *charset; |
||
1013 | |||
1014 | if (!cache) |
||
1015 | { |
||
1016 | cache = g_new0 (GFilenameCharsetCache, 1); |
||
1017 | g_private_set (&cache_private, cache); |
||
1018 | } |
||
1019 | |||
1020 | g_get_charset (&charset); |
||
1021 | |||
1022 | if (!(cache->charset && strcmp (cache->charset, charset) == 0)) |
||
1023 | { |
||
1024 | const gchar *new_charset; |
||
1025 | gchar *p; |
||
1026 | gint i; |
||
1027 | |||
1028 | g_free (cache->charset); |
||
1029 | g_strfreev (cache->filename_charsets); |
||
1030 | cache->charset = g_strdup (charset); |
||
1031 | |||
1032 | p = getenv ("G_FILENAME_ENCODING"); |
||
1033 | if (p != NULL && p[0] != '\0') |
||
1034 | { |
||
1035 | cache->filename_charsets = g_strsplit (p, ",", 0); |
||
1036 | cache->is_utf8 = (strcmp (cache->filename_charsets[0], "UTF-8") == 0); |
||
1037 | |||
1038 | for (i = 0; cache->filename_charsets[i]; i++) |
||
1039 | { |
||
1040 | if (strcmp ("@locale", cache->filename_charsets[i]) == 0) |
||
1041 | { |
||
1042 | g_get_charset (&new_charset); |
||
1043 | g_free (cache->filename_charsets[i]); |
||
1044 | cache->filename_charsets[i] = g_strdup (new_charset); |
||
1045 | } |
||
1046 | } |
||
1047 | } |
||
1048 | else if (getenv ("G_BROKEN_FILENAMES") != NULL) |
||
1049 | { |
||
1050 | cache->filename_charsets = g_new0 (gchar *, 2); |
||
1051 | cache->is_utf8 = g_get_charset (&new_charset); |
||
1052 | cache->filename_charsets[0] = g_strdup (new_charset); |
||
1053 | } |
||
1054 | else |
||
1055 | { |
||
1056 | cache->filename_charsets = g_new0 (gchar *, 3); |
||
1057 | cache->is_utf8 = TRUE; |
||
1058 | cache->filename_charsets[0] = g_strdup ("UTF-8"); |
||
1059 | if (!g_get_charset (&new_charset)) |
||
1060 | cache->filename_charsets[1] = g_strdup (new_charset); |
||
1061 | } |
||
1062 | } |
||
1063 | |||
1064 | if (filename_charsets) |
||
1065 | *filename_charsets = (const gchar **)cache->filename_charsets; |
||
1066 | |||
1067 | return cache->is_utf8; |
||
1068 | } |
||
1069 | |||
1070 | #else /* G_PLATFORM_WIN32 */ |
||
1071 | |||
1072 | gboolean |
||
1073 | g_get_filename_charsets (const gchar ***filename_charsets) |
||
1074 | { |
||
1075 | static const gchar *charsets[] = { |
||
1076 | "UTF-8", |
||
1077 | NULL |
||
1078 | }; |
||
1079 | |||
1080 | #ifdef G_OS_WIN32 |
||
1081 | /* On Windows GLib pretends that the filename charset is UTF-8 */ |
||
1082 | if (filename_charsets) |
||
1083 | *filename_charsets = charsets; |
||
1084 | |||
1085 | return TRUE; |
||
1086 | #else |
||
1087 | gboolean result; |
||
1088 | |||
1089 | /* Cygwin works like before */ |
||
1090 | result = g_get_charset (&(charsets[0])); |
||
1091 | |||
1092 | if (filename_charsets) |
||
1093 | *filename_charsets = charsets; |
||
1094 | |||
1095 | return result; |
||
1096 | #endif |
||
1097 | } |
||
1098 | |||
1099 | #endif /* G_PLATFORM_WIN32 */ |
||
1100 | |||
1101 | static gboolean |
||
1102 | get_filename_charset (const gchar **filename_charset) |
||
1103 | { |
||
1104 | const gchar **charsets; |
||
1105 | gboolean is_utf8; |
||
1106 | |||
1107 | is_utf8 = g_get_filename_charsets (&charsets); |
||
1108 | |||
1109 | if (filename_charset) |
||
1110 | *filename_charset = charsets[0]; |
||
1111 | |||
1112 | return is_utf8; |
||
1113 | } |
||
1114 | |||
1115 | /** |
||
1116 | * g_filename_to_utf8: |
||
1117 | * @opsysstring: a string in the encoding for filenames |
||
1118 | * @len: the length of the string, or -1 if the string is |
||
1119 | * nul-terminated (Note that some encodings may allow nul |
||
1120 | * bytes to occur inside strings. In that case, using -1 |
||
1121 | * for the @len parameter is unsafe) |
||
1122 | * @bytes_read: (out) (optional): location to store the number of bytes in the |
||
1123 | * input string that were successfully converted, or %NULL. |
||
1124 | * Even if the conversion was successful, this may be |
||
1125 | * less than @len if there were partial characters |
||
1126 | * at the end of the input. If the error |
||
1127 | * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
||
1128 | * stored will the byte offset after the last valid |
||
1129 | * input sequence. |
||
1130 | * @bytes_written: (out) (optional): the number of bytes stored in the output |
||
1131 | * buffer (not including the terminating nul). |
||
1132 | * @error: location to store the error occurring, or %NULL to ignore |
||
1133 | * errors. Any of the errors in #GConvertError may occur. |
||
1134 | * |
||
1135 | * Converts a string which is in the encoding used by GLib for |
||
1136 | * filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8 |
||
1137 | * for filenames; on other platforms, this function indirectly depends on |
||
1138 | * the [current locale][setlocale]. |
||
1139 | * |
||
1140 | * Returns: The converted string, or %NULL on an error. |
||
1141 | **/ |
||
1142 | gchar* |
||
1143 | g_filename_to_utf8 (const gchar *opsysstring, |
||
1144 | gssize len, |
||
1145 | gsize *bytes_read, |
||
1146 | gsize *bytes_written, |
||
1147 | GError **error) |
||
1148 | { |
||
1149 | const gchar *charset; |
||
1150 | |||
1151 | g_return_val_if_fail (opsysstring != NULL, NULL); |
||
1152 | |||
1153 | if (get_filename_charset (&charset)) |
||
1154 | return strdup_len (opsysstring, len, bytes_read, bytes_written, error); |
||
1155 | else |
||
1156 | return g_convert (opsysstring, len, |
||
1157 | "UTF-8", charset, bytes_read, bytes_written, error); |
||
1158 | } |
||
1159 | |||
1160 | #if defined (G_OS_WIN32) && !defined (_WIN64) |
||
1161 | |||
1162 | #undef g_filename_to_utf8 |
||
1163 | |||
1164 | /* Binary compatibility version. Not for newly compiled code. Also not needed for |
||
1165 | * 64-bit versions as there should be no old deployed binaries that would use |
||
1166 | * the old versions. |
||
1167 | */ |
||
1168 | |||
1169 | gchar* |
||
1170 | g_filename_to_utf8 (const gchar *opsysstring, |
||
1171 | gssize len, |
||
1172 | gsize *bytes_read, |
||
1173 | gsize *bytes_written, |
||
1174 | GError **error) |
||
1175 | { |
||
1176 | const gchar *charset; |
||
1177 | |||
1178 | g_return_val_if_fail (opsysstring != NULL, NULL); |
||
1179 | |||
1180 | if (g_get_charset (&charset)) |
||
1181 | return strdup_len (opsysstring, len, bytes_read, bytes_written, error); |
||
1182 | else |
||
1183 | return g_convert (opsysstring, len, |
||
1184 | "UTF-8", charset, bytes_read, bytes_written, error); |
||
1185 | } |
||
1186 | |||
1187 | #endif |
||
1188 | |||
1189 | /** |
||
1190 | * g_filename_from_utf8: |
||
1191 | * @utf8string: a UTF-8 encoded string. |
||
1192 | * @len: the length of the string, or -1 if the string is |
||
1193 | * nul-terminated. |
||
1194 | * @bytes_read: (out) (optional): location to store the number of bytes in |
||
1195 | * the input string that were successfully converted, or %NULL. |
||
1196 | * Even if the conversion was successful, this may be |
||
1197 | * less than @len if there were partial characters |
||
1198 | * at the end of the input. If the error |
||
1199 | * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
||
1200 | * stored will the byte offset after the last valid |
||
1201 | * input sequence. |
||
1202 | * @bytes_written: (out): the number of bytes stored in the output buffer (not |
||
1203 | * including the terminating nul). |
||
1204 | * @error: location to store the error occurring, or %NULL to ignore |
||
1205 | * errors. Any of the errors in #GConvertError may occur. |
||
1206 | * |
||
1207 | * Converts a string from UTF-8 to the encoding GLib uses for |
||
1208 | * filenames. Note that on Windows GLib uses UTF-8 for filenames; |
||
1209 | * on other platforms, this function indirectly depends on the |
||
1210 | * [current locale][setlocale]. |
||
1211 | * |
||
1212 | * Returns: (array length=bytes_written) (element-type guint8) (transfer full): |
||
1213 | * The converted string, or %NULL on an error. |
||
1214 | **/ |
||
1215 | gchar* |
||
1216 | g_filename_from_utf8 (const gchar *utf8string, |
||
1217 | gssize len, |
||
1218 | gsize *bytes_read, |
||
1219 | gsize *bytes_written, |
||
1220 | GError **error) |
||
1221 | { |
||
1222 | const gchar *charset; |
||
1223 | |||
1224 | if (get_filename_charset (&charset)) |
||
1225 | return strdup_len (utf8string, len, bytes_read, bytes_written, error); |
||
1226 | else |
||
1227 | return g_convert (utf8string, len, |
||
1228 | charset, "UTF-8", bytes_read, bytes_written, error); |
||
1229 | } |
||
1230 | |||
1231 | #if defined (G_OS_WIN32) && !defined (_WIN64) |
||
1232 | |||
1233 | #undef g_filename_from_utf8 |
||
1234 | |||
1235 | /* Binary compatibility version. Not for newly compiled code. */ |
||
1236 | |||
1237 | gchar* |
||
1238 | g_filename_from_utf8 (const gchar *utf8string, |
||
1239 | gssize len, |
||
1240 | gsize *bytes_read, |
||
1241 | gsize *bytes_written, |
||
1242 | GError **error) |
||
1243 | { |
||
1244 | const gchar *charset; |
||
1245 | |||
1246 | if (g_get_charset (&charset)) |
||
1247 | return strdup_len (utf8string, len, bytes_read, bytes_written, error); |
||
1248 | else |
||
1249 | return g_convert (utf8string, len, |
||
1250 | charset, "UTF-8", bytes_read, bytes_written, error); |
||
1251 | } |
||
1252 | |||
1253 | #endif |
||
1254 | |||
1255 | /* Test of haystack has the needle prefix, comparing case |
||
1256 | * insensitive. haystack may be UTF-8, but needle must |
||
1257 | * contain only ascii. */ |
||
1258 | static gboolean |
||
1259 | has_case_prefix (const gchar *haystack, const gchar *needle) |
||
1260 | { |
||
1261 | const gchar *h, *n; |
||
1262 | |||
1263 | /* Eat one character at a time. */ |
||
1264 | h = haystack; |
||
1265 | n = needle; |
||
1266 | |||
1267 | while (*n && *h && |
||
1268 | g_ascii_tolower (*n) == g_ascii_tolower (*h)) |
||
1269 | { |
||
1270 | n++; |
||
1271 | h++; |
||
1272 | } |
||
1273 | |||
1274 | return *n == '\0'; |
||
1275 | } |
||
1276 | |||
/* Escaping policies for g_escape_uri_string(). Each value is a single
 * bit that is tested against the per-character entries of acceptable[]. */
typedef enum {
  UNSAFE_ALL        = 1 << 0,  /* Escape all unsafe characters   */
  UNSAFE_ALLOW_PLUS = 1 << 1,  /* Allows '+'  */
  UNSAFE_PATH       = 1 << 3,  /* Allows '/', '&', '=', ':', '@', '+', '$' and ',' */
  UNSAFE_HOST       = 1 << 4,  /* Allows '/' and ':' and '@' */
  UNSAFE_SLASHES    = 1 << 5   /* Allows all characters except for '/' and '%' */
} UnsafeCharacterSet;
||
1284 | |||
1285 | static const guchar acceptable[96] = { |
||
1286 | /* A table of the ASCII chars from space (32) to DEL (127) */ |
||
1287 | /* ! " # $ % & ' ( ) * + , - . / */ |
||
1288 | 0x00,0x3F,0x20,0x20,0x28,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x2A,0x28,0x3F,0x3F,0x1C, |
||
1289 | /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */ |
||
1290 | 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x20, |
||
1291 | /* @ A B C D E F G H I J K L M N O */ |
||
1292 | 0x38,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F, |
||
1293 | /* P Q R S T U V W X Y Z [ \ ] ^ _ */ |
||
1294 | 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F, |
||
1295 | /* ` a b c d e f g h i j k l m n o */ |
||
1296 | 0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F, |
||
1297 | /* p q r s t u v w x y z { | } ~ DEL */ |
||
1298 | 0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20 |
||
1299 | }; |
||
1300 | |||
1301 | static const gchar hex[16] = "0123456789ABCDEF"; |
||
1302 | |||
1303 | /* Note: This escape function works on file: URIs, but if you want to |
||
1304 | * escape something else, please read RFC-2396 */ |
||
1305 | static gchar * |
||
1306 | g_escape_uri_string (const gchar *string, |
||
1307 | UnsafeCharacterSet mask) |
||
1308 | { |
||
1309 | #define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask)) |
||
1310 | |||
1311 | const gchar *p; |
||
1312 | gchar *q; |
||
1313 | gchar *result; |
||
1314 | int c; |
||
1315 | gint unacceptable; |
||
1316 | UnsafeCharacterSet use_mask; |
||
1317 | |||
1318 | g_return_val_if_fail (mask == UNSAFE_ALL |
||
1319 | || mask == UNSAFE_ALLOW_PLUS |
||
1320 | || mask == UNSAFE_PATH |
||
1321 | || mask == UNSAFE_HOST |
||
1322 | || mask == UNSAFE_SLASHES, NULL); |
||
1323 | |||
1324 | unacceptable = 0; |
||
1325 | use_mask = mask; |
||
1326 | for (p = string; *p != '\0'; p++) |
||
1327 | { |
||
1328 | c = (guchar) *p; |
||
1329 | if (!ACCEPTABLE (c)) |
||
1330 | unacceptable++; |
||
1331 | } |
||
1332 | |||
1333 | result = g_malloc (p - string + unacceptable * 2 + 1); |
||
1334 | |||
1335 | use_mask = mask; |
||
1336 | for (q = result, p = string; *p != '\0'; p++) |
||
1337 | { |
||
1338 | c = (guchar) *p; |
||
1339 | |||
1340 | if (!ACCEPTABLE (c)) |
||
1341 | { |
||
1342 | *q++ = '%'; /* means hex coming */ |
||
1343 | *q++ = hex[c >> 4]; |
||
1344 | *q++ = hex[c & 15]; |
||
1345 | } |
||
1346 | else |
||
1347 | *q++ = *p; |
||
1348 | } |
||
1349 | |||
1350 | *q = '\0'; |
||
1351 | |||
1352 | return result; |
||
1353 | } |
||
1354 | |||
1355 | |||
1356 | static gchar * |
||
1357 | g_escape_file_uri (const gchar *hostname, |
||
1358 | const gchar *pathname) |
||
1359 | { |
||
1360 | char *escaped_hostname = NULL; |
||
1361 | char *escaped_path; |
||
1362 | char *res; |
||
1363 | |||
1364 | #ifdef G_OS_WIN32 |
||
1365 | char *p, *backslash; |
||
1366 | |||
1367 | /* Turn backslashes into forward slashes. That's what Netscape |
||
1368 | * does, and they are actually more or less equivalent in Windows. |
||
1369 | */ |
||
1370 | |||
1371 | pathname = g_strdup (pathname); |
||
1372 | p = (char *) pathname; |
||
1373 | |||
1374 | while ((backslash = strchr (p, '\\')) != NULL) |
||
1375 | { |
||
1376 | *backslash = '/'; |
||
1377 | p = backslash + 1; |
||
1378 | } |
||
1379 | #endif |
||
1380 | |||
1381 | if (hostname && *hostname != '\0') |
||
1382 | { |
||
1383 | escaped_hostname = g_escape_uri_string (hostname, UNSAFE_HOST); |
||
1384 | } |
||
1385 | |||
1386 | escaped_path = g_escape_uri_string (pathname, UNSAFE_PATH); |
||
1387 | |||
1388 | res = g_strconcat ("file://", |
||
1389 | (escaped_hostname) ? escaped_hostname : "", |
||
1390 | (*escaped_path != '/') ? "/" : "", |
||
1391 | escaped_path, |
||
1392 | NULL); |
||
1393 | |||
1394 | #ifdef G_OS_WIN32 |
||
1395 | g_free ((char *) pathname); |
||
1396 | #endif |
||
1397 | |||
1398 | g_free (escaped_hostname); |
||
1399 | g_free (escaped_path); |
||
1400 | |||
1401 | return res; |
||
1402 | } |
||
1403 | |||
/* Decodes the two hexadecimal digits at @scanner into a byte value.
 * Returns -1 if either character is not a hex digit; the second
 * character is only inspected when the first one is valid. */
static int
unescape_character (const char *scanner)
{
  int high_nibble;
  int low_nibble;

  high_nibble = g_ascii_xdigit_value (scanner[0]);
  if (high_nibble < 0)
    return -1;

  low_nibble = g_ascii_xdigit_value (scanner[1]);
  if (low_nibble < 0)
    return -1;

  return (high_nibble << 4) | low_nibble;
}
||
1420 | |||
1421 | static gchar * |
||
1422 | g_unescape_uri_string (const char *escaped, |
||
1423 | int len, |
||
1424 | const char *illegal_escaped_characters, |
||
1425 | gboolean ascii_must_not_be_escaped) |
||
1426 | { |
||
1427 | const gchar *in, *in_end; |
||
1428 | gchar *out, *result; |
||
1429 | int c; |
||
1430 | |||
1431 | if (escaped == NULL) |
||
1432 | return NULL; |
||
1433 | |||
1434 | if (len < 0) |
||
1435 | len = strlen (escaped); |
||
1436 | |||
1437 | result = g_malloc (len + 1); |
||
1438 | |||
1439 | out = result; |
||
1440 | for (in = escaped, in_end = escaped + len; in < in_end; in++) |
||
1441 | { |
||
1442 | c = *in; |
||
1443 | |||
1444 | if (c == '%') |
||
1445 | { |
||
1446 | /* catch partial escape sequences past the end of the substring */ |
||
1447 | if (in + 3 > in_end) |
||
1448 | break; |
||
1449 | |||
1450 | c = unescape_character (in + 1); |
||
1451 | |||
1452 | /* catch bad escape sequences and NUL characters */ |
||
1453 | if (c <= 0) |
||
1454 | break; |
||
1455 | |||
1456 | /* catch escaped ASCII */ |
||
1457 | if (ascii_must_not_be_escaped && c <= 0x7F) |
||
1458 | break; |
||
1459 | |||
1460 | /* catch other illegal escaped characters */ |
||
1461 | if (strchr (illegal_escaped_characters, c) != NULL) |
||
1462 | break; |
||
1463 | |||
1464 | in += 2; |
||
1465 | } |
||
1466 | |||
1467 | *out++ = c; |
||
1468 | } |
||
1469 | |||
1470 | g_assert (out - result <= len); |
||
1471 | *out = '\0'; |
||
1472 | |||
1473 | if (in != in_end) |
||
1474 | { |
||
1475 | g_free (result); |
||
1476 | return NULL; |
||
1477 | } |
||
1478 | |||
1479 | return result; |
||
1480 | } |
||
1481 | |||
1482 | static gboolean |
||
1483 | is_asciialphanum (gunichar c) |
||
1484 | { |
||
1485 | return c <= 0x7F && g_ascii_isalnum (c); |
||
1486 | } |
||
1487 | |||
1488 | static gboolean |
||
1489 | is_asciialpha (gunichar c) |
||
1490 | { |
||
1491 | return c <= 0x7F && g_ascii_isalpha (c); |
||
1492 | } |
||
1493 | |||
1494 | /* allows an empty string */ |
||
1495 | static gboolean |
||
1496 | hostname_validate (const char *hostname) |
||
1497 | { |
||
1498 | const char *p; |
||
1499 | gunichar c, first_char, last_char; |
||
1500 | |||
1501 | p = hostname; |
||
1502 | if (*p == '\0') |
||
1503 | return TRUE; |
||
1504 | do |
||
1505 | { |
||
1506 | /* read in a label */ |
||
1507 | c = g_utf8_get_char (p); |
||
1508 | p = g_utf8_next_char (p); |
||
1509 | if (!is_asciialphanum (c)) |
||
1510 | return FALSE; |
||
1511 | first_char = c; |
||
1512 | do |
||
1513 | { |
||
1514 | last_char = c; |
||
1515 | c = g_utf8_get_char (p); |
||
1516 | p = g_utf8_next_char (p); |
||
1517 | } |
||
1518 | while (is_asciialphanum (c) || c == '-'); |
||
1519 | if (last_char == '-') |
||
1520 | return FALSE; |
||
1521 | |||
1522 | /* if that was the last label, check that it was a toplabel */ |
||
1523 | if (c == '\0' || (c == '.' && *p == '\0')) |
||
1524 | return is_asciialpha (first_char); |
||
1525 | } |
||
1526 | while (c == '.'); |
||
1527 | return FALSE; |
||
1528 | } |
||
1529 | |||
1530 | /** |
||
1531 | * g_filename_from_uri: |
||
1532 | * @uri: a uri describing a filename (escaped, encoded in ASCII). |
||
1533 | * @hostname: (out) (optional) (nullable): Location to store hostname for the |
||
1534 | * URI. |
||
1535 | * If there is no hostname in the URI, %NULL will be |
||
1536 | * stored in this location. |
||
1537 | * @error: location to store the error occurring, or %NULL to ignore |
||
1538 | * errors. Any of the errors in #GConvertError may occur. |
||
1539 | * |
||
1540 | * Converts an escaped ASCII-encoded URI to a local filename in the |
||
1541 | * encoding used for filenames. |
||
1542 | * |
||
1543 | * Returns: (type filename): a newly-allocated string holding |
||
1544 | * the resulting filename, or %NULL on an error. |
||
1545 | **/ |
||
1546 | gchar * |
||
1547 | g_filename_from_uri (const gchar *uri, |
||
1548 | gchar **hostname, |
||
1549 | GError **error) |
||
1550 | { |
||
1551 | const char *path_part; |
||
1552 | const char *host_part; |
||
1553 | char *unescaped_hostname; |
||
1554 | char *result; |
||
1555 | char *filename; |
||
1556 | int offs; |
||
1557 | #ifdef G_OS_WIN32 |
||
1558 | char *p, *slash; |
||
1559 | #endif |
||
1560 | |||
1561 | if (hostname) |
||
1562 | *hostname = NULL; |
||
1563 | |||
1564 | if (!has_case_prefix (uri, "file:/")) |
||
1565 | { |
||
1566 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
||
1567 | _("The URI '%s' is not an absolute URI using the \"file\" scheme"), |
||
1568 | uri); |
||
1569 | return NULL; |
||
1570 | } |
||
1571 | |||
1572 | path_part = uri + strlen ("file:"); |
||
1573 | |||
1574 | if (strchr (path_part, '#') != NULL) |
||
1575 | { |
||
1576 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
||
1577 | _("The local file URI '%s' may not include a '#'"), |
||
1578 | uri); |
||
1579 | return NULL; |
||
1580 | } |
||
1581 | |||
1582 | if (has_case_prefix (path_part, "///")) |
||
1583 | path_part += 2; |
||
1584 | else if (has_case_prefix (path_part, "//")) |
||
1585 | { |
||
1586 | path_part += 2; |
||
1587 | host_part = path_part; |
||
1588 | |||
1589 | path_part = strchr (path_part, '/'); |
||
1590 | |||
1591 | if (path_part == NULL) |
||
1592 | { |
||
1593 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
||
1594 | _("The URI '%s' is invalid"), |
||
1595 | uri); |
||
1596 | return NULL; |
||
1597 | } |
||
1598 | |||
1599 | unescaped_hostname = g_unescape_uri_string (host_part, path_part - host_part, "", TRUE); |
||
1600 | |||
1601 | if (unescaped_hostname == NULL || |
||
1602 | !hostname_validate (unescaped_hostname)) |
||
1603 | { |
||
1604 | g_free (unescaped_hostname); |
||
1605 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
||
1606 | _("The hostname of the URI '%s' is invalid"), |
||
1607 | uri); |
||
1608 | return NULL; |
||
1609 | } |
||
1610 | |||
1611 | if (hostname) |
||
1612 | *hostname = unescaped_hostname; |
||
1613 | else |
||
1614 | g_free (unescaped_hostname); |
||
1615 | } |
||
1616 | |||
1617 | filename = g_unescape_uri_string (path_part, -1, "/", FALSE); |
||
1618 | |||
1619 | if (filename == NULL) |
||
1620 | { |
||
1621 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
||
1622 | _("The URI '%s' contains invalidly escaped characters"), |
||
1623 | uri); |
||
1624 | return NULL; |
||
1625 | } |
||
1626 | |||
1627 | offs = 0; |
||
1628 | #ifdef G_OS_WIN32 |
||
1629 | /* Drop localhost */ |
||
1630 | if (hostname && *hostname != NULL && |
||
1631 | g_ascii_strcasecmp (*hostname, "localhost") == 0) |
||
1632 | { |
||
1633 | g_free (*hostname); |
||
1634 | *hostname = NULL; |
||
1635 | } |
||
1636 | |||
1637 | /* Turn slashes into backslashes, because that's the canonical spelling */ |
||
1638 | p = filename; |
||
1639 | while ((slash = strchr (p, '/')) != NULL) |
||
1640 | { |
||
1641 | *slash = '\\'; |
||
1642 | p = slash + 1; |
||
1643 | } |
||
1644 | |||
1645 | /* Windows URIs with a drive letter can be like "file://host/c:/foo" |
||
1646 | * or "file://host/c|/foo" (some Netscape versions). In those cases, start |
||
1647 | * the filename from the drive letter. |
||
1648 | */ |
||
1649 | if (g_ascii_isalpha (filename[1])) |
||
1650 | { |
||
1651 | if (filename[2] == ':') |
||
1652 | offs = 1; |
||
1653 | else if (filename[2] == '|') |
||
1654 | { |
||
1655 | filename[2] = ':'; |
||
1656 | offs = 1; |
||
1657 | } |
||
1658 | } |
||
1659 | #endif |
||
1660 | |||
1661 | result = g_strdup (filename + offs); |
||
1662 | g_free (filename); |
||
1663 | |||
1664 | return result; |
||
1665 | } |
||
1666 | |||
1667 | #if defined (G_OS_WIN32) && !defined (_WIN64) |
||
1668 | |||
1669 | #undef g_filename_from_uri |
||
1670 | |||
1671 | gchar * |
||
1672 | g_filename_from_uri (const gchar *uri, |
||
1673 | gchar **hostname, |
||
1674 | GError **error) |
||
1675 | { |
||
1676 | gchar *utf8_filename; |
||
1677 | gchar *retval = NULL; |
||
1678 | |||
1679 | utf8_filename = g_filename_from_uri_utf8 (uri, hostname, error); |
||
1680 | if (utf8_filename) |
||
1681 | { |
||
1682 | retval = g_locale_from_utf8 (utf8_filename, -1, NULL, NULL, error); |
||
1683 | g_free (utf8_filename); |
||
1684 | } |
||
1685 | return retval; |
||
1686 | } |
||
1687 | |||
1688 | #endif |
||
1689 | |||
1690 | /** |
||
1691 | * g_filename_to_uri: |
||
1692 | * @filename: an absolute filename specified in the GLib file name encoding, |
||
1693 | * which is the on-disk file name bytes on Unix, and UTF-8 on |
||
1694 | * Windows |
||
1695 | * @hostname: (allow-none): A UTF-8 encoded hostname, or %NULL for none. |
||
1696 | * @error: location to store the error occurring, or %NULL to ignore |
||
1697 | * errors. Any of the errors in #GConvertError may occur. |
||
1698 | * |
||
1699 | * Converts an absolute filename to an escaped ASCII-encoded URI, with the path |
||
1700 | * component following Section 3.3. of RFC 2396. |
||
1701 | * |
||
1702 | * Returns: a newly-allocated string holding the resulting |
||
1703 | * URI, or %NULL on an error. |
||
1704 | **/ |
||
1705 | gchar * |
||
1706 | g_filename_to_uri (const gchar *filename, |
||
1707 | const gchar *hostname, |
||
1708 | GError **error) |
||
1709 | { |
||
1710 | char *escaped_uri; |
||
1711 | |||
1712 | g_return_val_if_fail (filename != NULL, NULL); |
||
1713 | |||
1714 | if (!g_path_is_absolute (filename)) |
||
1715 | { |
||
1716 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH, |
||
1717 | _("The pathname '%s' is not an absolute path"), |
||
1718 | filename); |
||
1719 | return NULL; |
||
1720 | } |
||
1721 | |||
1722 | if (hostname && |
||
1723 | !(g_utf8_validate (hostname, -1, NULL) |
||
1724 | && hostname_validate (hostname))) |
||
1725 | { |
||
1726 | g_set_error_literal (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
||
1727 | _("Invalid hostname")); |
||
1728 | return NULL; |
||
1729 | } |
||
1730 | |||
1731 | #ifdef G_OS_WIN32 |
||
1732 | /* Don't use localhost unnecessarily */ |
||
1733 | if (hostname && g_ascii_strcasecmp (hostname, "localhost") == 0) |
||
1734 | hostname = NULL; |
||
1735 | #endif |
||
1736 | |||
1737 | escaped_uri = g_escape_file_uri (hostname, filename); |
||
1738 | |||
1739 | return escaped_uri; |
||
1740 | } |
||
1741 | |||
1742 | #if defined (G_OS_WIN32) && !defined (_WIN64) |
||
1743 | |||
1744 | #undef g_filename_to_uri |
||
1745 | |||
1746 | gchar * |
||
1747 | g_filename_to_uri (const gchar *filename, |
||
1748 | const gchar *hostname, |
||
1749 | GError **error) |
||
1750 | { |
||
1751 | gchar *utf8_filename; |
||
1752 | gchar *retval = NULL; |
||
1753 | |||
1754 | utf8_filename = g_locale_to_utf8 (filename, -1, NULL, NULL, error); |
||
1755 | |||
1756 | if (utf8_filename) |
||
1757 | { |
||
1758 | retval = g_filename_to_uri_utf8 (utf8_filename, hostname, error); |
||
1759 | g_free (utf8_filename); |
||
1760 | } |
||
1761 | |||
1762 | return retval; |
||
1763 | } |
||
1764 | |||
1765 | #endif |
||
1766 | |||
1767 | /** |
||
1768 | * g_uri_list_extract_uris: |
||
1769 | * @uri_list: an URI list |
||
1770 | * |
||
1771 | * Splits an URI list conforming to the text/uri-list |
||
1772 | * mime type defined in RFC 2483 into individual URIs, |
||
1773 | * discarding any comments. The URIs are not validated. |
||
1774 | * |
||
1775 | * Returns: (transfer full): a newly allocated %NULL-terminated list |
||
1776 | * of strings holding the individual URIs. The array should be freed |
||
1777 | * with g_strfreev(). |
||
1778 | * |
||
1779 | * Since: 2.6 |
||
1780 | */ |
||
1781 | gchar ** |
||
1782 | g_uri_list_extract_uris (const gchar *uri_list) |
||
1783 | { |
||
1784 | GSList *uris, *u; |
||
1785 | const gchar *p, *q; |
||
1786 | gchar **result; |
||
1787 | gint n_uris = 0; |
||
1788 | |||
1789 | uris = NULL; |
||
1790 | |||
1791 | p = uri_list; |
||
1792 | |||
1793 | /* We don't actually try to validate the URI according to RFC |
||
1794 | * 2396, or even check for allowed characters - we just ignore |
||
1795 | * comments and trim whitespace off the ends. We also |
||
1796 | * allow LF delimination as well as the specified CRLF. |
||
1797 | * |
||
1798 | * We do allow comments like specified in RFC 2483. |
||
1799 | */ |
||
1800 | while (p) |
||
1801 | { |
||
1802 | if (*p != '#') |
||
1803 | { |
||
1804 | while (g_ascii_isspace (*p)) |
||
1805 | p++; |
||
1806 | |||
1807 | q = p; |
||
1808 | while (*q && (*q != '\n') && (*q != '\r')) |
||
1809 | q++; |
||
1810 | |||
1811 | if (q > p) |
||
1812 | { |
||
1813 | q--; |
||
1814 | while (q > p && g_ascii_isspace (*q)) |
||
1815 | q--; |
||
1816 | |||
1817 | if (q > p) |
||
1818 | { |
||
1819 | uris = g_slist_prepend (uris, g_strndup (p, q - p + 1)); |
||
1820 | n_uris++; |
||
1821 | } |
||
1822 | } |
||
1823 | } |
||
1824 | p = strchr (p, '\n'); |
||
1825 | if (p) |
||
1826 | p++; |
||
1827 | } |
||
1828 | |||
1829 | result = g_new (gchar *, n_uris + 1); |
||
1830 | |||
1831 | result[n_uris--] = NULL; |
||
1832 | for (u = uris; u; u = u->next) |
||
1833 | result[n_uris--] = u->data; |
||
1834 | |||
1835 | g_slist_free (uris); |
||
1836 | |||
1837 | return result; |
||
1838 | } |
||
1839 | |||
1840 | /** |
||
1841 | * g_filename_display_basename: |
||
1842 | * @filename: an absolute pathname in the GLib file name encoding |
||
1843 | * |
||
1844 | * Returns the display basename for the particular filename, guaranteed |
||
1845 | * to be valid UTF-8. The display name might not be identical to the filename, |
||
1846 | * for instance there might be problems converting it to UTF-8, and some files |
||
1847 | * can be translated in the display. |
||
1848 | * |
||
1849 | * If GLib cannot make sense of the encoding of @filename, as a last resort it |
||
1850 | * replaces unknown characters with U+FFFD, the Unicode replacement character. |
||
1851 | * You can search the result for the UTF-8 encoding of this character (which is |
||
1852 | * "\357\277\275" in octal notation) to find out if @filename was in an invalid |
||
1853 | * encoding. |
||
1854 | * |
||
1855 | * You must pass the whole absolute pathname to this functions so that |
||
1856 | * translation of well known locations can be done. |
||
1857 | * |
||
1858 | * This function is preferred over g_filename_display_name() if you know the |
||
1859 | * whole path, as it allows translation. |
||
1860 | * |
||
1861 | * Returns: a newly allocated string containing |
||
1862 | * a rendition of the basename of the filename in valid UTF-8 |
||
1863 | * |
||
1864 | * Since: 2.6 |
||
1865 | **/ |
||
1866 | gchar * |
||
1867 | g_filename_display_basename (const gchar *filename) |
||
1868 | { |
||
1869 | char *basename; |
||
1870 | char *display_name; |
||
1871 | |||
1872 | g_return_val_if_fail (filename != NULL, NULL); |
||
1873 | |||
1874 | basename = g_path_get_basename (filename); |
||
1875 | display_name = g_filename_display_name (basename); |
||
1876 | g_free (basename); |
||
1877 | return display_name; |
||
1878 | } |
||
1879 | |||
1880 | /** |
||
1881 | * g_filename_display_name: |
||
1882 | * @filename: a pathname hopefully in the GLib file name encoding |
||
1883 | * |
||
1884 | * Converts a filename into a valid UTF-8 string. The conversion is |
||
1885 | * not necessarily reversible, so you should keep the original around |
||
1886 | * and use the return value of this function only for display purposes. |
||
1887 | * Unlike g_filename_to_utf8(), the result is guaranteed to be non-%NULL |
||
1888 | * even if the filename actually isn't in the GLib file name encoding. |
||
1889 | * |
||
1890 | * If GLib cannot make sense of the encoding of @filename, as a last resort it |
||
1891 | * replaces unknown characters with U+FFFD, the Unicode replacement character. |
||
1892 | * You can search the result for the UTF-8 encoding of this character (which is |
||
1893 | * "\357\277\275" in octal notation) to find out if @filename was in an invalid |
||
1894 | * encoding. |
||
1895 | * |
||
1896 | * If you know the whole pathname of the file you should use |
||
1897 | * g_filename_display_basename(), since that allows location-based |
||
1898 | * translation of filenames. |
||
1899 | * |
||
1900 | * Returns: a newly allocated string containing |
||
1901 | * a rendition of the filename in valid UTF-8 |
||
1902 | * |
||
1903 | * Since: 2.6 |
||
1904 | **/ |
||
1905 | gchar * |
||
1906 | g_filename_display_name (const gchar *filename) |
||
1907 | { |
||
1908 | gint i; |
||
1909 | const gchar **charsets; |
||
1910 | gchar *display_name = NULL; |
||
1911 | gboolean is_utf8; |
||
1912 | |||
1913 | is_utf8 = g_get_filename_charsets (&charsets); |
||
1914 | |||
1915 | if (is_utf8) |
||
1916 | { |
||
1917 | if (g_utf8_validate (filename, -1, NULL)) |
||
1918 | display_name = g_strdup (filename); |
||
1919 | } |
||
1920 | |||
1921 | if (!display_name) |
||
1922 | { |
||
1923 | /* Try to convert from the filename charsets to UTF-8. |
||
1924 | * Skip the first charset if it is UTF-8. |
||
1925 | */ |
||
1926 | for (i = is_utf8 ? 1 : 0; charsets[i]; i++) |
||
1927 | { |
||
1928 | display_name = g_convert (filename, -1, "UTF-8", charsets[i], |
||
1929 | NULL, NULL, NULL); |
||
1930 | |||
1931 | if (display_name) |
||
1932 | break; |
||
1933 | } |
||
1934 | } |
||
1935 | |||
1936 | /* if all conversions failed, we replace invalid UTF-8 |
||
1937 | * by a question mark |
||
1938 | */ |
||
1939 | if (!display_name) |
||
1940 | display_name = _g_utf8_make_valid (filename); |
||
1941 | |||
1942 | return display_name; |
||
1943 | } |