WebSVN – nexmon – Blame – Rev 1 – /utilities/glib/glib/ghostutils.c

/* Evaluates to non-zero when code point @ch is one of the characters
 * that remove_junk() strips from a hostname. Note that @ch is expanded
 * several times, so it must not have side effects.
 */
#define idna_is_junk(ch) (                        \
  (ch) == 0x00AD ||                               \
  (ch) == 0x1806 ||                               \
  (ch) == 0x200B ||                               \
  (ch) == 0x2060 ||                               \
  (ch) == 0xFEFF ||                               \
  (ch) == 0x034F ||                               \
  ((ch) >= 0x180B && (ch) <= 0x180D) ||           \
  (ch) == 0x200C ||                               \
  (ch) == 0x200D ||                               \
  ((ch) >= 0xFE00 && (ch) <= 0xFE0F))

208

209

/* Scan @str for "junk" and return a cleaned-up string if any junk

210

* is found. Else return %NULL.

211

*/

212

static gchar *

213

remove_junk (const gchar *str,

214

gint len)

215

{

216

GString *cleaned = NULL;

const gchar *p;

gunichar ch;

for (p = str; len == -1 ? *p : p < str + len; p = g_utf8_next_char (p))

221

{

222

ch = g_utf8_get_char (p);

223

if (idna_is_junk (ch))

{

if (!cleaned)

{

cleaned = g_string_new (NULL);

228

g_string_append_len (cleaned, str, p - str);

}

}

else if (cleaned)

g_string_append_unichar (cleaned, ch);

}

if (cleaned)

return g_string_free (cleaned, FALSE);

else

return NULL;

}

static inline gboolean

242

contains_uppercase_letters (const gchar *str,

gint len)

{

const gchar *p;

for (p = str; len == -1 ? *p : p < str + len; p = g_utf8_next_char (p))

248

{

249

if (g_unichar_isupper (g_utf8_get_char (p)))

return TRUE;

}

return FALSE;

}

static inline gboolean

256

contains_non_ascii (const gchar *str,

gint len)

{

const gchar *p;

for (p = str; len == -1 ? *p : p < str + len; p++)

262

{

263

if ((guchar)*p > 0x80)

return TRUE;

}

return FALSE;

}

/* RFC 3454, Appendix C. ish. */

270

static inline gboolean

271

idna_is_prohibited (gunichar ch)

272

{

273

switch (g_unichar_type (ch))

274

{

275

case G_UNICODE_CONTROL:

276

case G_UNICODE_FORMAT:

277

case G_UNICODE_UNASSIGNED:

278

case G_UNICODE_PRIVATE_USE:

279

case G_UNICODE_SURROGATE:

280

case G_UNICODE_LINE_SEPARATOR:

281

case G_UNICODE_PARAGRAPH_SEPARATOR:

282

case G_UNICODE_SPACE_SEPARATOR:

283

return TRUE;

284

285

case G_UNICODE_OTHER_SYMBOL:

286

if (ch == 0xFFFC || ch == 0xFFFD ||

287

(ch >= 0x2FF0 && ch <= 0x2FFB))

return TRUE;

return FALSE;

case G_UNICODE_NON_SPACING_MARK:

292

if (ch == 0x0340 || ch == 0x0341)

return TRUE;

return FALSE;

default:

return FALSE;

}

}

/* RFC 3491 IDN cleanup algorithm. */

302

static gchar *

303

nameprep (const gchar *hostname,

304

gint len,

305

gboolean *is_unicode)

306

{

307

gchar *name, *tmp = NULL, *p;

308

309

/* It would be nice if we could do this without repeatedly

310

* allocating strings and converting back and forth between

311

* gunichars and UTF-8... The code does at least avoid doing most of

312

* the sub-operations when they would just be equivalent to a

* g_strdup().

*/

/* Remove presentation-only characters */

317

name = remove_junk (hostname, len);

if (name)

{

tmp = name;

len = -1;

}

else

name = (gchar *)hostname;

325

326

/* Convert to lowercase */

327

if (contains_uppercase_letters (name, len))

328

{

329

name = g_utf8_strdown (name, len);

g_free (tmp);

tmp = name;

len = -1;

}

/* If there are no UTF8 characters, we're done. */

336

if (!contains_non_ascii (name, len))

337

{

338

*is_unicode = FALSE;

339

if (name == (gchar *)hostname)

340

return len == -1 ? g_strdup (hostname) : g_strndup (hostname, len);

else

return name;

}

*is_unicode = TRUE;

346

347

/* Normalize */

348

name = g_utf8_normalize (name, len, G_NORMALIZE_NFKC);

g_free (tmp);

tmp = name;

if (!name)

return NULL;

/* KC normalization may have created more capital letters (eg,

356

* angstrom -> capital A with ring). So we have to lowercasify a

357

* second time. (This is more-or-less how the nameprep algorithm

358

* does it. If tolower(nfkc(tolower(X))) is guaranteed to be the

359

* same as tolower(nfkc(X)), then we could skip the first tolower,

360

* but I'm not sure it is.)

361

*/

362

if (contains_uppercase_letters (name, -1))

363

{

364

name = g_utf8_strdown (name, -1);

g_free (tmp);

tmp = name;

}

/* Check for prohibited characters */

370

for (p = name; *p; p = g_utf8_next_char (p))

371

{

372

if (idna_is_prohibited (g_utf8_get_char (p)))

{

name = NULL;

g_free (tmp);

goto done;

}

}

/* FIXME: We're supposed to verify certain constraints on bidi

381

* characters, but glib does not appear to have that information.

*/

done:

return name;

}

/* Helper for idna_is_dot(): true when the three bytes starting at
 * @str are exactly @b0, @b1, @b2.
 */
#define idna_utf8_seq3(str, b0, b1, b2) ( \
  (guchar)(str)[0] == (b0) &&             \
  (guchar)(str)[1] == (b1) &&             \
  (guchar)(str)[2] == (b2))

/* RFC 3490, section 3.1 says '.', 0x3002, 0xFF0E, and 0xFF61 count as
 * label-separating dots; the three non-ASCII ones are recognised here
 * by their UTF-8 byte sequences.
 *
 * @str must be '\0'-terminated: the byte tests short-circuit, so the
 * second and third bytes are only inspected after the preceding byte
 * has matched a non-zero value.
 */
#define idna_is_dot(str) (                          \
  ((guchar)(str)[0] == '.') ||                      \
  idna_utf8_seq3 ((str), 0xE3, 0x80, 0x82) ||       \
  idna_utf8_seq3 ((str), 0xEF, 0xBC, 0x8E) ||       \
  idna_utf8_seq3 ((str), 0xEF, 0xBD, 0xA1) )

396

397

/* Returns a pointer to the first label-separating dot (as defined by
 * idna_is_dot()) in the '\0'-terminated string @str, or to the
 * terminating '\0' if there is no dot.
 */
static const gchar *
idna_end_of_label (const gchar *str)
{
  while (*str != '\0' && !idna_is_dot (str))
    str = g_utf8_next_char (str);

  return str;
}

/**

* g_hostname_to_ascii:

410

* @hostname: a valid UTF-8 or ASCII hostname

411

*

412

* Converts @hostname to its canonical ASCII form; an ASCII-only

413

* string containing no uppercase letters and not ending with a

414

* trailing dot.

415

*

416

* Returns: an ASCII hostname, which must be freed, or %NULL if

417

* @hostname is in some way invalid.

*

* Since: 2.22

**/

gchar *

g_hostname_to_ascii (const gchar *hostname)

423

{

424

gchar *name, *label, *p;

425

GString *out;

426

gssize llen, oldlen;

427

gboolean unicode;

428

429

label = name = nameprep (hostname, -1, &unicode);

430

if (!name || !unicode)

431

return name;

432

433

out = g_string_new (NULL);

do

{

unicode = FALSE;

for (p = label; *p && !idna_is_dot (p); p++)

439

{

440

if ((guchar)*p > 0x80)

unicode = TRUE;

}

oldlen = out->len;

llen = p - label;

if (unicode)

{

if (!strncmp (label, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN))

449

goto fail;

450

451

g_string_append (out, IDNA_ACE_PREFIX);

452

if (!punycode_encode (label, llen, out))

goto fail;

}

else

g_string_append_len (out, label, llen);

457

458

if (out->len - oldlen > 63)

goto fail;

label += llen;

if (*label)

label = g_utf8_next_char (label);

464

if (*label)

465

g_string_append_c (out, '.');

}

while (*label);

g_free (name);

return g_string_free (out, FALSE);

fail:

g_free (name);

g_string_free (out, TRUE);

return NULL;

}

/**
 * g_hostname_is_non_ascii:
 * @hostname: a hostname
 *
 * Tests if @hostname contains Unicode characters. If this returns
 * %TRUE, you need to encode the hostname with g_hostname_to_ascii()
 * before using it in non-IDN-aware contexts.
 *
 * Note that a hostname might contain a mix of encoded and unencoded
 * segments, and so it is possible for g_hostname_is_non_ascii() and
 * g_hostname_is_ascii_encoded() to both return %TRUE for a name.
 *
 * Returns: %TRUE if @hostname contains any non-ASCII characters
 *
 * Since: 2.22
 **/
gboolean
g_hostname_is_non_ascii (const gchar *hostname)
{
  /* -1: scan the whole '\0'-terminated string. */
  const gboolean has_non_ascii = contains_non_ascii (hostname, -1);

  return has_non_ascii;
}

499

500

/* Punycode decoder, RFC 3492 section 6.2. As with punycode_encode(),

501

* read the RFC if you want to understand what this is actually doing.

502

*/

503

static gboolean

504

punycode_decode (const gchar *input,

505

gsize input_length,

506

GString *output)

507

{

508

GArray *output_chars;

509

gunichar n;

510

guint i, bias;

511

guint oldi, w, k, digit, t;

512

const gchar *split;

513

514

n = PUNYCODE_INITIAL_N;

515

i = 0;

516

bias = PUNYCODE_INITIAL_BIAS;

517

518

split = input + input_length - 1;

519

while (split > input && *split != '-')

split--;

if (split > input)

{

output_chars = g_array_sized_new (FALSE, FALSE, sizeof (gunichar),

524

split - input);

525

input_length -= (split - input) + 1;

526

while (input < split)

527

{

528

gunichar ch = (gunichar)*input++;

529

if (!PUNYCODE_IS_BASIC (ch))

530

goto fail;

531

g_array_append_val (output_chars, ch);

}

input++;

}

else

output_chars = g_array_new (FALSE, FALSE, sizeof (gunichar));

537

538

while (input_length)

{

oldi = i;

w = 1;

for (k = PUNYCODE_BASE; ; k += PUNYCODE_BASE)

543

{

544

if (!input_length--)

545

goto fail;

546

digit = decode_digit (*input++);

547

if (digit >= PUNYCODE_BASE)

548

goto fail;

549

if (digit > (G_MAXUINT - i) / w)

goto fail;

i += digit * w;

if (k <= bias)

t = PUNYCODE_TMIN;

else if (k >= bias + PUNYCODE_TMAX)

t = PUNYCODE_TMAX;

else

t = k - bias;

if (digit < t)

break;

if (w > G_MAXUINT / (PUNYCODE_BASE - t))

561

goto fail;

562

w *= (PUNYCODE_BASE - t);

563

}

564

565

bias = adapt (i - oldi, output_chars->len + 1, oldi == 0);

566

567

if (i / (output_chars->len + 1) > G_MAXUINT - n)

568

goto fail;

569

n += i / (output_chars->len + 1);

570

i %= (output_chars->len + 1);

571

572

g_array_insert_val (output_chars, i++, n);

573

}

574

575

for (i = 0; i < output_chars->len; i++)

576

g_string_append_unichar (output, g_array_index (output_chars, gunichar, i));

577

g_array_free (output_chars, TRUE);

return TRUE;

fail:

g_array_free (output_chars, TRUE);

return FALSE;

}

/**

* g_hostname_to_unicode:

587

* @hostname: a valid UTF-8 or ASCII hostname

588

*

589

* Converts @hostname to its canonical presentation form; a UTF-8

590

* string in Unicode normalization form C, containing no uppercase

591

* letters, no forbidden characters, and no ASCII-encoded segments,

592

* and not ending with a trailing dot.

593

*

594

* Of course if @hostname is not an internationalized hostname, then

595

* the canonical presentation form will be entirely ASCII.

596

*

597

* Returns: a UTF-8 hostname, which must be freed, or %NULL if

598

* @hostname is in some way invalid.

*

* Since: 2.22

**/

gchar *

g_hostname_to_unicode (const gchar *hostname)

{

GString *out;

gssize llen;

out = g_string_new (NULL);

do

{

llen = idna_end_of_label (hostname) - hostname;

613

if (!g_ascii_strncasecmp (hostname, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN))

614

{

615

hostname += IDNA_ACE_PREFIX_LEN;

616

llen -= IDNA_ACE_PREFIX_LEN;

617

if (!punycode_decode (hostname, llen, out))

618

{

619

g_string_free (out, TRUE);

return NULL;

}

}

else

{

gboolean unicode;

gchar *canonicalized = nameprep (hostname, llen, &unicode);

627

628

if (!canonicalized)

629

{

630

g_string_free (out, TRUE);

631

return NULL;

632

}

633

g_string_append (out, canonicalized);

634

g_free (canonicalized);

}

hostname += llen;

if (*hostname)

hostname = g_utf8_next_char (hostname);

640

if (*hostname)

641

g_string_append_c (out, '.');

}

while (*hostname);

return g_string_free (out, FALSE);

}

/**

* g_hostname_is_ascii_encoded:

650

* @hostname: a hostname

651

*

652

* Tests if @hostname contains segments with an ASCII-compatible

653

* encoding of an Internationalized Domain Name. If this returns

654

* %TRUE, you should decode the hostname with g_hostname_to_unicode()

655

* before displaying it to the user.

656

*

657

* Note that a hostname might contain a mix of encoded and unencoded

658

* segments, and so it is possible for g_hostname_is_non_ascii() and

659

* g_hostname_is_ascii_encoded() to both return %TRUE for a name.

660

*

661

* Returns: %TRUE if @hostname contains any ASCII-encoded

* segments.

*

* Since: 2.22

**/

gboolean

g_hostname_is_ascii_encoded (const gchar *hostname)

{

while (1)

{

if (!g_ascii_strncasecmp (hostname, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN))

672

return TRUE;

673

hostname = idna_end_of_label (hostname);

674

if (*hostname)

675

hostname = g_utf8_next_char (hostname);

if (!*hostname)

return FALSE;

}

}

/**

* g_hostname_is_ip_address:

683

* @hostname: a hostname (or IP address in string form)

684

*

685

* Tests if @hostname is the string form of an IPv4 or IPv6 address.

686

* (Eg, "192.168.0.1".)

687

*

688

* Returns: %TRUE if @hostname is an IP address

*

* Since: 2.22

**/

gboolean

g_hostname_is_ip_address (const gchar *hostname)

694

{

695

gchar *p, *end;

696

gint nsegments, octet;

697

698

/* On Linux we could implement this using inet_pton, but the Windows

699

* equivalent of that requires linking against winsock, so we just

700

* figure this out ourselves. Tested by tests/hostutils.c.

701

*/

702

703

p = (char *)hostname;

704

705

if (strchr (p, ':'))

{

gboolean skipped;

/* If it contains a ':', it's an IPv6 address (assuming it's an

710

* IP address at all). This consists of eight ':'-separated

711

* segments, each containing a 1-4 digit hex number, except that

712

* optionally: (a) the last two segments can be replaced by an

713

* IPv4 address, and (b) a single span of 1 to 8 "0000" segments

714

* can be replaced with just "::".

*/

nsegments = 0;

skipped = FALSE;

while (*p && nsegments < 8)

720

{

721

/* Each segment after the first must be preceded by a ':'.

722

* (We also handle half of the "string starts with ::" case

723

* here.)

724

*/

725

if (p != (char *)hostname || (p[0] == ':' && p[1] == ':'))

{

if (*p != ':')

return FALSE;

p++;

}

/* If there's another ':', it means we're skipping some segments */

733

if (*p == ':' && !skipped)

{

skipped = TRUE;

nsegments++;

/* Handle the "string ends with ::" case */

if (!p[1])

p++;

continue;

}

/* Read the segment, make sure it's valid. */

746

for (end = p; g_ascii_isxdigit (*end); end++)

747

;

748

if (end == p || end > p + 4)

return FALSE;

if (*end == '.')

{

if ((nsegments == 6 && !skipped) || (nsegments <= 6 && skipped))

goto parse_ipv4;

else

return FALSE;

}

nsegments++;

p = end;

}

return !*p && (nsegments == 8 || skipped);

}

parse_ipv4:

/* Parse IPv4: N.N.N.N, where each N <= 255 and doesn't have leading 0s. */

769

for (nsegments = 0; nsegments < 4; nsegments++)

770

{

771

if (nsegments != 0)

{

if (*p != '.')

return FALSE;

p++;

}

/* Check the segment; a little tricker than the IPv6 case since

779

* we can't allow extra leading 0s, and we can't assume that all

780

* strings of valid length are within range.

*/

octet = 0;

if (*p == '0')

end = p + 1;

else

{

for (end = p; g_ascii_isdigit (*end); end++)

788

octet = 10 * octet + (*end - '0');

789

}

790

if (end == p || end > p + 3 || octet > 255)

return FALSE;

p = end;

}

/* If there's nothing left to parse, then it's ok. */

797

return !*p;

798

}

nexmon – Blame information for rev 1