WebSVN – nexmon – Blame – Rev 1 – /utilities/glib/glib/gregex.c

1

office

1

/* GRegex -- regular expression API wrapper around PCRE.

*

*

* This library is free software; you can redistribute it and/or

8

* modify it under the terms of the GNU Lesser General Public

9

* License as published by the Free Software Foundation; either

10

* version 2.1 of the License, or (at your option) any later version.

11

*

12

* This library is distributed in the hope that it will be useful,

13

* but WITHOUT ANY WARRANTY; without even the implied warranty of

14

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

15

* Lesser General Public License for more details.

16

*

17

* You should have received a copy of the GNU Lesser General Public

18

* License along with this library; if not, write to the Free Software

19

* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

20

*/

21

22

#include "config.h"

23

24

#include <string.h>

25

26

#ifdef USE_SYSTEM_PCRE

27

#include <pcre.h>

28

#else

29

#include "pcre/pcre.h"

30

#endif

31

32

#include "gtypes.h"

33

#include "gregex.h"

34

#include "glibintl.h"

35

#include "glist.h"

36

#include "gmessages.h"

37

#include "gstrfuncs.h"

38

#include "gatomic.h"

39

#include "gthread.h"

/**

* SECTION:gregex

* @title: Perl-compatible regular expressions

44

* @short_description: matches strings against regular expressions

45

* @see_also: [Regular expression syntax][glib-regex-syntax]

46

*

47

* The g_regex_*() functions implement regular

48

* expression pattern matching using syntax and semantics similar to

49

* Perl regular expression.

50

*

51

* Some functions accept a @start_position argument, setting it differs

52

* from just passing over a shortened string and setting #G_REGEX_MATCH_NOTBOL

53

* in the case of a pattern that begins with any kind of lookbehind assertion.

54

* For example, consider the pattern "\Biss\B" which finds occurrences of "iss"

55

* in the middle of words. ("\B" matches only if the current position in the

56

* subject is not a word boundary.) When applied to the string "Mississipi"

57

* from the fourth byte, namely "issipi", it does not match, because "\B" is

58

* always false at the start of the subject, which is deemed to be a word

59

* boundary. However, if the entire string is passed, but with

60

* @start_position set to 4, it finds the second occurrence of "iss" because

61

* it is able to look behind the starting point to discover that it is

62

* preceded by a letter.

63

*

64

* Note that, unless you set the #G_REGEX_RAW flag, all the strings passed

65

* to these functions must be encoded in UTF-8. The lengths and the positions

66

* inside the strings are in bytes and not in characters, so, for instance,

67

* "\xc3\xa0" (i.e. "à") is two bytes long but it is treated as a

68

* single character. If you set #G_REGEX_RAW the strings can be non-valid

69

* UTF-8 strings and a byte is treated as a character, so "\xc3\xa0" is two

70

* bytes and two characters long.

71

*

72

* When matching a pattern, "\n" matches only against a "\n" character in

73

* the string, and "\r" matches only a "\r" character. To match any newline

74

* sequence use "\R". This particular group matches either the two-character

75

* sequence CR + LF ("\r\n"), or one of the single characters LF (linefeed,

76

* U+000A, "\n"), VT (vertical tab, U+000B, "\v"), FF (formfeed, U+000C, "\f"),

77

* CR (carriage return, U+000D, "\r"), NEL (next line, U+0085), LS (line

78

* separator, U+2028), or PS (paragraph separator, U+2029).

79

*

80

* The behaviour of the dot, circumflex, and dollar metacharacters are

81

* affected by newline characters, the default is to recognize any newline

82

* character (the same characters recognized by "\R"). This can be changed

83

* with #G_REGEX_NEWLINE_CR, #G_REGEX_NEWLINE_LF and #G_REGEX_NEWLINE_CRLF

84

* compile options, and with #G_REGEX_MATCH_NEWLINE_ANY,

85

* #G_REGEX_MATCH_NEWLINE_CR, #G_REGEX_MATCH_NEWLINE_LF and

86

* #G_REGEX_MATCH_NEWLINE_CRLF match options. These settings are also

87

* relevant when compiling a pattern if #G_REGEX_EXTENDED is set, and an

88

* unescaped "#" outside a character class is encountered. This indicates

89

* a comment that lasts until after the next newline.

90

*

91

* When setting the %G_REGEX_JAVASCRIPT_COMPAT flag, pattern syntax and pattern

92

* matching is changed to be compatible with the way that regular expressions

93

* work in JavaScript. More precisely, a lonely ']' character in the pattern

94

* is a syntax error; the '\x' escape only allows 0 to 2 hexadecimal digits, and

95

* you must use the '\u' escape sequence with 4 hex digits to specify a unicode

96

* codepoint instead of '\x' or 'x{....}'. If '\x' or '\u' are not followed by

97

* the specified number of hex digits, they match 'x' and 'u' literally; also

98

* '\U' always matches 'U' instead of being an error in the pattern. Finally,

99

* pattern matching is modified so that back references to an unset subpattern

100

* group produces a match with the empty string instead of an error. See

101

* pcreapi(3) for more information.

102

*

103

* Creating and manipulating the same #GRegex structure from different

104

* threads is not a problem as #GRegex does not modify its internal

105

* state between creation and destruction, on the other hand #GMatchInfo

106

* is not threadsafe.

107

*

108

* The regular expressions low-level functionalities are obtained through

109

* the excellent

110

* [PCRE](http://www.pcre.org/)

111

* library written by Philip Hazel.

112

*/

113

114

/* Union of every bit that is a valid GRegexCompileFlags value. */
#define G_REGEX_COMPILE_MASK \
  (G_REGEX_CASELESS          | \
   G_REGEX_MULTILINE         | \
   G_REGEX_DOTALL            | \
   G_REGEX_EXTENDED          | \
   G_REGEX_ANCHORED          | \
   G_REGEX_DOLLAR_ENDONLY    | \
   G_REGEX_UNGREEDY          | \
   G_REGEX_RAW               | \
   G_REGEX_NO_AUTO_CAPTURE   | \
   G_REGEX_OPTIMIZE          | \
   G_REGEX_FIRSTLINE         | \
   G_REGEX_DUPNAMES          | \
   G_REGEX_NEWLINE_CR        | \
   G_REGEX_NEWLINE_LF        | \
   G_REGEX_NEWLINE_CRLF      | \
   G_REGEX_NEWLINE_ANYCRLF   | \
   G_REGEX_BSR_ANYCRLF       | \
   G_REGEX_JAVASCRIPT_COMPAT)

133

134

/* GRegexCompileFlags bits that are NOT passed through to PCRE. */
#define G_REGEX_COMPILE_NONPCRE_MASK \
  (G_REGEX_RAW | G_REGEX_OPTIMIZE)

/* GRegexCompileFlags bits that ARE passed through to PCRE: every valid
 * compile flag except the ones in G_REGEX_COMPILE_NONPCRE_MASK. */
#define G_REGEX_COMPILE_PCRE_MASK \
  (G_REGEX_COMPILE_MASK & ~G_REGEX_COMPILE_NONPCRE_MASK)

138

139

/* Union of every bit that is a valid GRegexMatchFlags value. */
#define G_REGEX_MATCH_MASK \
  (G_REGEX_MATCH_ANCHORED         | \
   G_REGEX_MATCH_NOTBOL           | \
   G_REGEX_MATCH_NOTEOL           | \
   G_REGEX_MATCH_NOTEMPTY         | \
   G_REGEX_MATCH_PARTIAL          | \
   G_REGEX_MATCH_NEWLINE_CR       | \
   G_REGEX_MATCH_NEWLINE_LF       | \
   G_REGEX_MATCH_NEWLINE_CRLF     | \
   G_REGEX_MATCH_NEWLINE_ANY      | \
   G_REGEX_MATCH_NEWLINE_ANYCRLF  | \
   G_REGEX_MATCH_BSR_ANYCRLF      | \
   G_REGEX_MATCH_BSR_ANY          | \
   G_REGEX_MATCH_PARTIAL_SOFT     | \
   G_REGEX_MATCH_PARTIAL_HARD     | \
   G_REGEX_MATCH_NOTEMPTY_ATSTART)

155

156

/* we rely on these flags having the same values */

157

G_STATIC_ASSERT (G_REGEX_CASELESS == PCRE_CASELESS);

158

G_STATIC_ASSERT (G_REGEX_MULTILINE == PCRE_MULTILINE);

159

G_STATIC_ASSERT (G_REGEX_DOTALL == PCRE_DOTALL);

160

G_STATIC_ASSERT (G_REGEX_EXTENDED == PCRE_EXTENDED);

161

G_STATIC_ASSERT (G_REGEX_ANCHORED == PCRE_ANCHORED);

162

G_STATIC_ASSERT (G_REGEX_DOLLAR_ENDONLY == PCRE_DOLLAR_ENDONLY);

163

G_STATIC_ASSERT (G_REGEX_UNGREEDY == PCRE_UNGREEDY);

164

G_STATIC_ASSERT (G_REGEX_NO_AUTO_CAPTURE == PCRE_NO_AUTO_CAPTURE);

165

G_STATIC_ASSERT (G_REGEX_FIRSTLINE == PCRE_FIRSTLINE);

166

G_STATIC_ASSERT (G_REGEX_DUPNAMES == PCRE_DUPNAMES);

167

G_STATIC_ASSERT (G_REGEX_NEWLINE_CR == PCRE_NEWLINE_CR);

168

G_STATIC_ASSERT (G_REGEX_NEWLINE_LF == PCRE_NEWLINE_LF);

169

G_STATIC_ASSERT (G_REGEX_NEWLINE_CRLF == PCRE_NEWLINE_CRLF);

170

G_STATIC_ASSERT (G_REGEX_NEWLINE_ANYCRLF == PCRE_NEWLINE_ANYCRLF);

171

G_STATIC_ASSERT (G_REGEX_BSR_ANYCRLF == PCRE_BSR_ANYCRLF);

172

G_STATIC_ASSERT (G_REGEX_JAVASCRIPT_COMPAT == PCRE_JAVASCRIPT_COMPAT);

173

174

G_STATIC_ASSERT (G_REGEX_MATCH_ANCHORED == PCRE_ANCHORED);

175

G_STATIC_ASSERT (G_REGEX_MATCH_NOTBOL == PCRE_NOTBOL);

176

G_STATIC_ASSERT (G_REGEX_MATCH_NOTEOL == PCRE_NOTEOL);

177

G_STATIC_ASSERT (G_REGEX_MATCH_NOTEMPTY == PCRE_NOTEMPTY);

178

G_STATIC_ASSERT (G_REGEX_MATCH_PARTIAL == PCRE_PARTIAL);

179

G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_CR == PCRE_NEWLINE_CR);

180

G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_LF == PCRE_NEWLINE_LF);

181

G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_CRLF == PCRE_NEWLINE_CRLF);

182

G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_ANY == PCRE_NEWLINE_ANY);

183

G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_ANYCRLF == PCRE_NEWLINE_ANYCRLF);

184

G_STATIC_ASSERT (G_REGEX_MATCH_BSR_ANYCRLF == PCRE_BSR_ANYCRLF);

185

G_STATIC_ASSERT (G_REGEX_MATCH_BSR_ANY == PCRE_BSR_UNICODE);

186

G_STATIC_ASSERT (G_REGEX_MATCH_PARTIAL_SOFT == PCRE_PARTIAL_SOFT);

187

G_STATIC_ASSERT (G_REGEX_MATCH_PARTIAL_HARD == PCRE_PARTIAL_HARD);

188

G_STATIC_ASSERT (G_REGEX_MATCH_NOTEMPTY_ATSTART == PCRE_NOTEMPTY_ATSTART);

189

190

/* These PCRE flags are unused or not exposed publically in GRegexFlags, so

191

* it should be ok to reuse them for different things.

192

*/

193

G_STATIC_ASSERT (G_REGEX_OPTIMIZE == PCRE_NO_UTF8_CHECK);

194

G_STATIC_ASSERT (G_REGEX_RAW == PCRE_UTF8);

195

196

/* Step one character forwards/backwards from s. When the regex was not
 * compiled with G_REGEX_RAW the g_utf8_ helpers are used; otherwise the
 * pointer is simply moved by one byte. */
#define NEXT_CHAR(re, s) \
  (!((re)->compile_opts & G_REGEX_RAW) ? g_utf8_next_char (s) : ((s) + 1))
#define PREV_CHAR(re, s) \
  (!((re)->compile_opts & G_REGEX_RAW) ? g_utf8_prev_char (s) : ((s) - 1))

struct _GMatchInfo

{

volatile gint ref_count; /* the ref count */

208

GRegex *regex; /* the regex */

209

GRegexMatchFlags match_opts; /* options used at match time on the regex */

210

gint matches; /* number of matching sub patterns */

211

gint pos; /* position in the string where last match left off */

212

gint n_offsets; /* number of offsets */

213

gint *offsets; /* array of offsets paired 0,1 ; 2,3 ; 3,4 etc */

214

gint *workspace; /* workspace for pcre_dfa_exec() */

215

gint n_workspace; /* number of workspace elements */

216

const gchar *string; /* string passed to the match function */

217

gssize string_len; /* length of string */

};

struct _GRegex

{

volatile gint ref_count; /* the ref count for the immutable part */

223

gchar *pattern; /* the pattern */

224

pcre *pcre_re; /* compiled form of the pattern */

225

GRegexCompileFlags compile_opts; /* options used at compile time on the pattern */

226

GRegexMatchFlags match_opts; /* options used at match time on the regex */

227

pcre_extra *extra; /* data stored when G_REGEX_OPTIMIZE is used */

228

};

229

230

/* TRUE if ret is an error code, FALSE otherwise. */

231

#define IS_PCRE_ERROR(ret) ((ret) != PCRE_ERROR_PARTIAL && (ret) < PCRE_ERROR_NOMATCH)

232

233

typedef struct _InterpolationData InterpolationData;

234

static gboolean interpolation_list_needs_match (GList *list);

235

static gboolean interpolate_replacement (const GMatchInfo *match_info,

236

GString *result,

237

gpointer data);

238

static GList *split_replacement (const gchar *replacement,

239

GError **error);

240

static void free_interpolation_data (InterpolationData *data);

241

242

243

static const gchar *

244

match_error (gint errcode)

{

switch (errcode)

{

case PCRE_ERROR_NOMATCH:

249

/* not an error */

250

break;

251

case PCRE_ERROR_NULL:

252

/* NULL argument, this should not happen in GRegex */

253

g_warning ("A NULL argument was passed to PCRE");

254

break;

255

case PCRE_ERROR_BADOPTION:

256

return "bad options";

257

case PCRE_ERROR_BADMAGIC:

258

return _("corrupted object");

259

case PCRE_ERROR_UNKNOWN_OPCODE:

260

return N_("internal error or corrupted object");

261

case PCRE_ERROR_NOMEMORY:

262

return _("out of memory");

263

case PCRE_ERROR_NOSUBSTRING:

264

/* not used by pcre_exec() */

265

break;

266

case PCRE_ERROR_MATCHLIMIT:

267

return _("backtracking limit reached");

268

case PCRE_ERROR_CALLOUT:

269

/* callouts are not implemented */

270

break;

271

case PCRE_ERROR_BADUTF8:

272

case PCRE_ERROR_BADUTF8_OFFSET:

273

/* we do not check if strings are valid */

274

break;

275

case PCRE_ERROR_PARTIAL:

276

/* not an error */

277

break;

278

case PCRE_ERROR_BADPARTIAL:

279

return _("the pattern contains items not supported for partial matching");

280

case PCRE_ERROR_INTERNAL:

281

return _("internal error");

282

case PCRE_ERROR_BADCOUNT:

283

/* negative ovecsize, this should not happen in GRegex */

284

g_warning ("A negative ovecsize was passed to PCRE");

285

break;

286

case PCRE_ERROR_DFA_UITEM:

287

return _("the pattern contains items not supported for partial matching");

288

case PCRE_ERROR_DFA_UCOND:

289

return _("back references as conditions are not supported for partial matching");

290

case PCRE_ERROR_DFA_UMLIMIT:

291

/* the match_field field is not used in GRegex */

292

break;

293

case PCRE_ERROR_DFA_WSSIZE:

294

/* handled expanding the workspace */

295

break;

296

case PCRE_ERROR_DFA_RECURSE:

297

case PCRE_ERROR_RECURSIONLIMIT:

298

return _("recursion limit reached");

299

case PCRE_ERROR_BADNEWLINE:

300

return _("invalid combination of newline flags");

301

case PCRE_ERROR_BADOFFSET:

302

return _("bad offset");

303

case PCRE_ERROR_SHORTUTF8:

304

return _("short utf8");

305

case PCRE_ERROR_RECURSELOOP:

306

return _("recursion loop");

default:

break;

}

return _("unknown error");

}

/*
 * translate_compile_error:
 * @errcode: (inout): on entry the error code returned by PCRE; on exit
 *   the corresponding GRegexError value
 * @errmsg: (inout): on entry the untranslated message returned by PCRE;
 *   replaced with a translatable message when the error is known
 *
 * Compile errors are created adding 100 to the error code returned by
 * PCRE. If the resulting code is known, a translatable message is stored
 * in @errmsg. If it is unknown, @errcode becomes the generic
 * G_REGEX_ERROR_COMPILE and the message returned by PCRE is kept.
 * Several PCRE errors can map to the same GRegexError, and some PCRE
 * errors are useless for us.
 */
static void
translate_compile_error (gint *errcode, const gchar **errmsg)
{
  /* Work on a local copy and publish the final code once at the end. */
  gint code = *errcode + 100;

  switch (code)
    {
    case G_REGEX_ERROR_STRAY_BACKSLASH:
      *errmsg = _("\\ at end of pattern");
      break;

    case G_REGEX_ERROR_MISSING_CONTROL_CHAR:
      *errmsg = _("\\c at end of pattern");
      break;

    case G_REGEX_ERROR_UNRECOGNIZED_ESCAPE:
      *errmsg = _("unrecognized character following \\");
      break;

    case G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER:
      *errmsg = _("numbers out of order in {} quantifier");
      break;

    case G_REGEX_ERROR_QUANTIFIER_TOO_BIG:
      *errmsg = _("number too big in {} quantifier");
      break;

    case G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS:
      *errmsg = _("missing terminating ] for character class");
      break;

    case G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS:
      *errmsg = _("invalid escape sequence in character class");
      break;

    case G_REGEX_ERROR_RANGE_OUT_OF_ORDER:
      *errmsg = _("range out of order in character class");
      break;

    case G_REGEX_ERROR_NOTHING_TO_REPEAT:
      *errmsg = _("nothing to repeat");
      break;

    case 111: /* internal error: unexpected repeat */
      code = G_REGEX_ERROR_INTERNAL;
      *errmsg = _("unexpected repeat");
      break;

    case G_REGEX_ERROR_UNRECOGNIZED_CHARACTER:
      *errmsg = _("unrecognized character after (? or (?-");
      break;

    case G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS:
      *errmsg = _("POSIX named classes are supported only within a class");
      break;

    case G_REGEX_ERROR_UNMATCHED_PARENTHESIS:
      *errmsg = _("missing terminating )");
      break;

    case G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE:
      *errmsg = _("reference to non-existent subpattern");
      break;

    case G_REGEX_ERROR_UNTERMINATED_COMMENT:
      *errmsg = _("missing ) after comment");
      break;

    case G_REGEX_ERROR_EXPRESSION_TOO_LARGE:
      *errmsg = _("regular expression is too large");
      break;

    case G_REGEX_ERROR_MEMORY_ERROR:
      *errmsg = _("failed to get memory");
      break;

    case 122: /* unmatched parentheses */
      code = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
      *errmsg = _(") without opening (");
      break;

    case 123: /* internal error: code overflow */
      code = G_REGEX_ERROR_INTERNAL;
      *errmsg = _("code overflow");
      break;

    case 124: /* "unrecognized character after (?<\0 */
      code = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
      *errmsg = _("unrecognized character after (?<");
      break;

    case G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND:
      *errmsg = _("lookbehind assertion is not fixed length");
      break;

    case G_REGEX_ERROR_MALFORMED_CONDITION:
      *errmsg = _("malformed number or name after (?(");
      break;

    case G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES:
      *errmsg = _("conditional group contains more than two branches");
      break;

    case G_REGEX_ERROR_ASSERTION_EXPECTED:
      *errmsg = _("assertion expected after (?(");
      break;

    case 129:
      code = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
      /* translators: '(?R' and '(?[+-]digits' are both meant as (groups of)
       * sequences here, '(?-54' would be an example for the second group.
       */
      *errmsg = _("(?R or (?[+-]digits must be followed by )");
      break;

    case G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME:
      *errmsg = _("unknown POSIX class name");
      break;

    case G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED:
      *errmsg = _("POSIX collating elements are not supported");
      break;

    case G_REGEX_ERROR_HEX_CODE_TOO_LARGE:
      *errmsg = _("character value in \\x{...} sequence is too large");
      break;

    case G_REGEX_ERROR_INVALID_CONDITION:
      *errmsg = _("invalid condition (?(0)");
      break;

    case G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND:
      *errmsg = _("\\C not allowed in lookbehind assertion");
      break;

    case 137: /* PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0 */
      /* A number of Perl escapes are not handled by PCRE.
       * Therefore it explicitly raises ERR37.
       */
      code = G_REGEX_ERROR_UNRECOGNIZED_ESCAPE;
      *errmsg = _("escapes \\L, \\l, \\N{name}, \\U, and \\u are not supported");
      break;

    case G_REGEX_ERROR_INFINITE_LOOP:
      *errmsg = _("recursive call could loop indefinitely");
      break;

    case 141: /* unrecognized character after (?P\0 */
      code = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
      *errmsg = _("unrecognized character after (?P");
      break;

    case G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR:
      *errmsg = _("missing terminator in subpattern name");
      break;

    case G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME:
      *errmsg = _("two named subpatterns have the same name");
      break;

    case G_REGEX_ERROR_MALFORMED_PROPERTY:
      *errmsg = _("malformed \\P or \\p sequence");
      break;

    case G_REGEX_ERROR_UNKNOWN_PROPERTY:
      *errmsg = _("unknown property name after \\P or \\p");
      break;

    case G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG:
      *errmsg = _("subpattern name is too long (maximum 32 characters)");
      break;

    case G_REGEX_ERROR_TOO_MANY_SUBPATTERNS:
      *errmsg = _("too many named subpatterns (maximum 10,000)");
      break;

    case G_REGEX_ERROR_INVALID_OCTAL_VALUE:
      *errmsg = _("octal value is greater than \\377");
      break;

    case 152: /* internal error: overran compiling workspace */
      code = G_REGEX_ERROR_INTERNAL;
      *errmsg = _("overran compiling workspace");
      break;

    case 153: /* internal error: previously-checked referenced subpattern not found */
      code = G_REGEX_ERROR_INTERNAL;
      *errmsg = _("previously-checked referenced subpattern not found");
      break;

    case G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE:
      *errmsg = _("DEFINE group contains more than one branch");
      break;

    case G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS:
      *errmsg = _("inconsistent NEWLINE options");
      break;

    case G_REGEX_ERROR_MISSING_BACK_REFERENCE:
      *errmsg = _("\\g is not followed by a braced, angle-bracketed, or quoted name or "
                  "number, or by a plain number");
      break;

    case G_REGEX_ERROR_INVALID_RELATIVE_REFERENCE:
      *errmsg = _("a numbered reference must not be zero");
      break;

    case G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_FORBIDDEN:
      *errmsg = _("an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)");
      break;

    case G_REGEX_ERROR_UNKNOWN_BACKTRACKING_CONTROL_VERB:
      *errmsg = _("(*VERB) not recognized");
      break;

    case G_REGEX_ERROR_NUMBER_TOO_BIG:
      *errmsg = _("number is too big");
      break;

    case G_REGEX_ERROR_MISSING_SUBPATTERN_NAME:
      *errmsg = _("missing subpattern name after (?&");
      break;

    case G_REGEX_ERROR_MISSING_DIGIT:
      *errmsg = _("digit expected after (?+");
      break;

    case G_REGEX_ERROR_INVALID_DATA_CHARACTER:
      *errmsg = _("] is an invalid data character in JavaScript compatibility mode");
      break;

    case G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME:
      *errmsg = _("different names for subpatterns of the same number are not allowed");
      break;

    case G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED:
      *errmsg = _("(*MARK) must have an argument");
      break;

    case G_REGEX_ERROR_INVALID_CONTROL_CHAR:
      *errmsg = _( "\\c must be followed by an ASCII character");
      break;

    case G_REGEX_ERROR_MISSING_NAME:
      *errmsg = _("\\k is not followed by a braced, angle-bracketed, or quoted name");
      break;

    case G_REGEX_ERROR_NOT_SUPPORTED_IN_CLASS:
      *errmsg = _("\\N is not supported in a class");
      break;

    case G_REGEX_ERROR_TOO_MANY_FORWARD_REFERENCES:
      *errmsg = _("too many forward references");
      break;

    case G_REGEX_ERROR_NAME_TOO_LONG:
      *errmsg = _("name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)");
      break;

    case G_REGEX_ERROR_CHARACTER_VALUE_TOO_LARGE:
      *errmsg = _("character value in \\u.... sequence is too large");
      break;

    case 116: /* erroffset passed as NULL */
      /* This should not happen as we never pass a NULL erroffset */
      g_warning ("erroffset passed as NULL");
      code = G_REGEX_ERROR_COMPILE;
      break;

    case 117: /* unknown option bit(s) set */
      /* This should not happen as we check options before passing them
       * to pcre_compile2() */
      g_warning ("unknown option bit(s) set");
      code = G_REGEX_ERROR_COMPILE;
      break;

    case 132: /* this version of PCRE is compiled without UTF support */
    case 144: /* invalid UTF-8 string */
    case 145: /* support for \\P, \\p, and \\X has not been compiled */
    case 167: /* this version of PCRE is not compiled with Unicode property support */
    case 173: /* disallowed Unicode code point (>= 0xd800 && <= 0xdfff) */
    case 174: /* invalid UTF-16 string */
      /* These errors should not happen as we are using an UTF-8 and UCP-enabled PCRE
       * and we do not check if strings are valid */
      /* fall through: reported as an internal error, PCRE message kept */
    case 170: /* internal error: unknown opcode in find_fixedlength() */
      code = G_REGEX_ERROR_INTERNAL;
      break;

    default:
      /* Unknown code: generic compile error, PCRE message kept. */
      code = G_REGEX_ERROR_COMPILE;
      break;
    }

  *errcode = code;
}

/* GMatchInfo */

static GMatchInfo *

557

match_info_new (const GRegex *regex,

558

const gchar *string,

559

gint string_len,

560

gint start_position,

561

gint match_options,

562

gboolean is_dfa)

563

{

564

GMatchInfo *match_info;

565

566

if (string_len < 0)

567

string_len = strlen (string);

568

569

match_info = g_new0 (GMatchInfo, 1);

570

match_info->ref_count = 1;

571

match_info->regex = g_regex_ref ((GRegex *)regex);

572

match_info->string = string;

573

match_info->string_len = string_len;

574

match_info->matches = PCRE_ERROR_NOMATCH;

575

match_info->pos = start_position;

576

match_info->match_opts = match_options;

if (is_dfa)

{

/* These values should be enough for most cases, if they are not

581

* enough g_regex_match_all_full() will expand them. */

582

match_info->n_offsets = 24;

583

match_info->n_workspace = 100;

584

match_info->workspace = g_new (gint, match_info->n_workspace);

}

else

{

gint capture_count;

589

pcre_fullinfo (regex->pcre_re, regex->extra,

590

PCRE_INFO_CAPTURECOUNT, &capture_count);

591

match_info->n_offsets = (capture_count + 1) * 3;

592

}

593

594

match_info->offsets = g_new0 (gint, match_info->n_offsets);

595

/* Set an invalid position for the previous match. */

596

match_info->offsets[0] = -1;

597

match_info->offsets[1] = -1;

return match_info;

}

/**

* g_match_info_get_regex:

604

* @match_info: a #GMatchInfo

605

*

606

* Returns #GRegex object used in @match_info. It belongs to Glib

607

* and must not be freed. Use g_regex_ref() if you need to keep it

608

* after you free @match_info object.

609

*

610

* Returns: #GRegex object used in @match_info

*

* Since: 2.14

*/

GRegex *

g_match_info_get_regex (const GMatchInfo *match_info)

616

{

617

g_return_val_if_fail (match_info != NULL, NULL);

618

return match_info->regex;

}

/**

* g_match_info_get_string:

623

* @match_info: a #GMatchInfo

624

*

625

* Returns the string searched with @match_info. This is the

626

* string passed to g_regex_match() or g_regex_replace() so

627

* you may not free it before calling this function.

628

*

629

* Returns: the string searched with @match_info

*

* Since: 2.14

*/

const gchar *

g_match_info_get_string (const GMatchInfo *match_info)

635

{

636

g_return_val_if_fail (match_info != NULL, NULL);

637

return match_info->string;

}

/**

* g_match_info_ref:

642

* @match_info: a #GMatchInfo

643

*

644

* Increases reference count of @match_info by 1.

645

*

646

* Returns: @match_info

*

* Since: 2.30

*/

GMatchInfo *
g_match_info_ref (GMatchInfo *match_info)
{
  g_return_val_if_fail (match_info != NULL, NULL);

  /* Atomically take one more reference, then return the same object. */
  g_atomic_int_inc (&match_info->ref_count);

  return match_info;
}

/**

* g_match_info_unref:

660

* @match_info: a #GMatchInfo

661

*

662

* Decreases reference count of @match_info by 1. When reference count drops

663

* to zero, it frees all the memory associated with the match_info structure.

*

* Since: 2.30

*/

void
g_match_info_unref (GMatchInfo *match_info)
{
  /* Guard against NULL like g_match_info_ref() does, instead of
   * dereferencing it below. */
  g_return_if_fail (match_info != NULL);

  if (g_atomic_int_dec_and_test (&match_info->ref_count))
    {
      /* Last reference dropped: release the regex reference taken when
       * the match info was created, then the owned buffers. */
      g_regex_unref (match_info->regex);
      g_free (match_info->offsets);
      g_free (match_info->workspace);
      g_free (match_info);
    }
}

/**

* g_match_info_free:

681

* @match_info: (allow-none): a #GMatchInfo, or %NULL

682

*

683

* If @match_info is not %NULL, calls g_match_info_unref(); otherwise does

* nothing.

*

* Since: 2.14

*/

void
g_match_info_free (GMatchInfo *match_info)
{
  /* NULL is accepted and silently ignored. */
  if (match_info != NULL)
    g_match_info_unref (match_info);
}

/**

* g_match_info_next:

699

* @match_info: a #GMatchInfo structure

700

* @error: location to store the error occurring, or %NULL to ignore errors

701

*

702

* Scans for the next match using the same parameters of the previous

703

* call to g_regex_match_full() or g_regex_match() that returned

704

* @match_info.

705

*

706

* The match is done on the string passed to the match function, so you

707

* cannot free it before calling this function.

708

*

709

* Returns: %TRUE if the string matched, %FALSE otherwise

*

* Since: 2.14

*/

gboolean

g_match_info_next (GMatchInfo *match_info,

715

GError **error)

716

{

717

gint prev_match_start;

718

gint prev_match_end;

719

720

g_return_val_if_fail (match_info != NULL, FALSE);

721

g_return_val_if_fail (error == NULL || *error == NULL, FALSE);

722

g_return_val_if_fail (match_info->pos >= 0, FALSE);

723

724

prev_match_start = match_info->offsets[0];

725

prev_match_end = match_info->offsets[1];

726

727

if (match_info->pos > match_info->string_len)

728

{

729

/* we have reached the end of the string */

730

match_info->pos = -1;

731

match_info->matches = PCRE_ERROR_NOMATCH;

return FALSE;

}

match_info->matches = pcre_exec (match_info->regex->pcre_re,

736

match_info->regex->extra,

737

match_info->string,

738

match_info->string_len,

739

match_info->pos,

740

match_info->regex->match_opts | match_info->match_opts,

741

match_info->offsets,

742

match_info->n_offsets);

743

if (IS_PCRE_ERROR (match_info->matches))

744

{

745

g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,

746

_("Error while matching regular expression %s: %s"),

747

match_info->regex->pattern, match_error (match_info->matches));

return FALSE;

}

/* avoid infinite loops if the pattern is an empty string or something

752

* equivalent */

753

if (match_info->pos == match_info->offsets[1])

754

{

755

if (match_info->pos > match_info->string_len)

756

{

757

/* we have reached the end of the string */

758

match_info->pos = -1;

759

match_info->matches = PCRE_ERROR_NOMATCH;

return FALSE;

}

match_info->pos = NEXT_CHAR (match_info->regex,

764

&match_info->string[match_info->pos]) -

765

match_info->string;

}

else

{

match_info->pos = match_info->offsets[1];

770

}

771

772

/* it's possible to get two identical matches when we are matching

773

* empty strings, for instance if the pattern is "(?=[A-Z0-9])" and

774

* the string is "RegExTest" we have:

775

* - search at position 0: match from 0 to 0

776

* - search at position 1: match from 3 to 3

777

* - search at position 3: match from 3 to 3 (duplicate)

778

* - search at position 4: match from 5 to 5

779

* - search at position 5: match from 5 to 5 (duplicate)

780

* - search at position 6: no match -> stop

781

* so we have to ignore the duplicates.

782

* see bug #515944: http://bugzilla.gnome.org/show_bug.cgi?id=515944 */

783

if (match_info->matches >= 0 &&

784

prev_match_start == match_info->offsets[0] &&

785

prev_match_end == match_info->offsets[1])

786

{

787

/* ignore this match and search the next one */

788

return g_match_info_next (match_info, error);

789

}

790

791

return match_info->matches >= 0;

}

/**

* g_match_info_matches:

796

* @match_info: a #GMatchInfo structure

797

*

798

* Returns whether the previous match operation succeeded.

799

*

800

* Returns: %TRUE if the previous match operation succeeded,

* %FALSE otherwise

*

* Since: 2.14

*/

gboolean

g_match_info_matches (const GMatchInfo *match_info)

807

{

808

g_return_val_if_fail (match_info != NULL, FALSE);

809

810

return match_info->matches >= 0;

}

/**

* g_match_info_get_match_count:

815

* @match_info: a #GMatchInfo structure

816

*

817

* Retrieves the number of matched substrings (including substring 0,

818

* that is the whole matched text), so 1 is returned if the pattern

819

* has no substrings in it and 0 is returned if the match failed.

820

*

821

* If the last match was obtained using the DFA algorithm, that is

822

* using g_regex_match_all() or g_regex_match_all_full(), the retrieved

823

* count is not that of the number of capturing parentheses but that of

824

* the number of matched substrings.

825

*

826

* Returns: Number of matched substrings, or -1 if an error occurred

*

* Since: 2.14

*/

gint

g_match_info_get_match_count (const GMatchInfo *match_info)

832

{

833

g_return_val_if_fail (match_info, -1);

834

835

if (match_info->matches == PCRE_ERROR_NOMATCH)

836

/* no match */

837

return 0;

838

else if (match_info->matches < PCRE_ERROR_NOMATCH)

/* error */

return -1;

else

/* match */

return match_info->matches;

}

/**

* g_match_info_is_partial_match:

848

* @match_info: a #GMatchInfo structure

849

*

850

* Usually if the string passed to g_regex_match*() matches as far as

851

* it goes, but is too short to match the entire pattern, %FALSE is

852

* returned. There are circumstances where it might be helpful to

853

* distinguish this case from other cases in which there is no match.

854

*

855

* Consider, for example, an application where a human is required to

856

* type in data for a field with specific formatting requirements. An

857

* example might be a date in the form ddmmmyy, defined by the pattern

858

* "^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$".

859

* If the application sees the user’s keystrokes one by one, and can

860

* check that what has been typed so far is potentially valid, it is

861

* able to raise an error as soon as a mistake is made.

862

*

863

* GRegex supports the concept of partial matching by means of the

864

* #G_REGEX_MATCH_PARTIAL_SOFT and #G_REGEX_MATCH_PARTIAL_HARD flags.

865

* When they are used, the return code for

866

* g_regex_match() or g_regex_match_full() is, as usual, %TRUE

867

* for a complete match, %FALSE otherwise. But, when these functions

868

* return %FALSE, you can check if the match was partial calling

869

* g_match_info_is_partial_match().

870

*

871

* The difference between #G_REGEX_MATCH_PARTIAL_SOFT and

872

* #G_REGEX_MATCH_PARTIAL_HARD is that when a partial match is encountered

873

* with #G_REGEX_MATCH_PARTIAL_SOFT, matching continues to search for a

874

* possible complete match, while with #G_REGEX_MATCH_PARTIAL_HARD matching

875

* stops at the partial match.

876

* When both #G_REGEX_MATCH_PARTIAL_SOFT and #G_REGEX_MATCH_PARTIAL_HARD

877

* are set, the latter takes precedence.

878

*

879

* There were formerly some restrictions on the pattern for partial matching.

880

* The restrictions no longer apply.

881

*

882

* See pcrepartial(3) for more information on partial matching.

883

*

884

* Returns: %TRUE if the match was partial, %FALSE otherwise

*

* Since: 2.14

*/

gboolean

g_match_info_is_partial_match (const GMatchInfo *match_info)

890

{

891

g_return_val_if_fail (match_info != NULL, FALSE);

892

893

return match_info->matches == PCRE_ERROR_PARTIAL;

}

/**

* g_match_info_expand_references:

898

* @match_info: (allow-none): a #GMatchInfo or %NULL

899

* @string_to_expand: the string to expand

900

* @error: location to store the error occurring, or %NULL to ignore errors

901

*

902

* Returns a new string containing the text in @string_to_expand with

903

* references and escape sequences expanded. References refer to the last

904

* match done with @string against @regex and have the same syntax used by

905

* g_regex_replace().

906

*

907

* The @string_to_expand must be UTF-8 encoded even if #G_REGEX_RAW was

908

* passed to g_regex_new().

909

*

910

* The backreferences are extracted from the string passed to the match

911

* function, so you cannot call this function after freeing the string.

912

*

913

* @match_info may be %NULL in which case @string_to_expand must not

914

* contain references. For instance "foo\n" does not refer to an actual

915

* pattern and '\n' merely will be replaced with \n character,

916

* while to expand "\0" (whole match) one needs the result of a match.

917

* Use g_regex_check_replacement() to find out whether @string_to_expand

918

* contains references.

919

*

920

* Returns: (allow-none): the expanded string, or %NULL if an error occurred

*

* Since: 2.14

*/

gchar *

g_match_info_expand_references (const GMatchInfo *match_info,

926

const gchar *string_to_expand,

GError **error)

{

GString *result;

GList *list;

GError *tmp_error = NULL;

932

933

g_return_val_if_fail (string_to_expand != NULL, NULL);

934

g_return_val_if_fail (error == NULL || *error == NULL, NULL);

935

936

list = split_replacement (string_to_expand, &tmp_error);

937

if (tmp_error != NULL)

938

{

939

g_propagate_error (error, tmp_error);

return NULL;

}

if (!match_info && interpolation_list_needs_match (list))

944

{

945

g_critical ("String '%s' contains references to the match, can't "

946

"expand references without GMatchInfo object",

string_to_expand);

return NULL;

}

result = g_string_sized_new (strlen (string_to_expand));

952

interpolate_replacement (match_info, result, list);

953

954

g_list_free_full (list, (GDestroyNotify) free_interpolation_data);

955

956

return g_string_free (result, FALSE);

}

/**

* g_match_info_fetch:

961

* @match_info: #GMatchInfo structure

962

* @match_num: number of the sub expression

963

*

964

* Retrieves the text matching the @match_num'th capturing

965

* parentheses. 0 is the full text of the match, 1 is the first paren

966

* set, 2 the second, and so on.

967

*

968

* If @match_num is a valid sub pattern but it didn't match anything

969

* (e.g. sub pattern 1, matching "b" against "(a)?b") then an empty

970

* string is returned.

971

*

972

* If the match was obtained using the DFA algorithm, that is using

973

* g_regex_match_all() or g_regex_match_all_full(), the retrieved

974

* string is not that of a set of parentheses but that of a matched

975

* substring. Substrings are matched in reverse order of length, so

976

* 0 is the longest match.

977

*

978

* The string is fetched from the string passed to the match function,

979

* so you cannot call this function after freeing the string.

980

*

981

* Returns: (allow-none): The matched substring, or %NULL if an error

982

* occurred. You have to free the string yourself

*

* Since: 2.14

*/

gchar *

g_match_info_fetch (const GMatchInfo *match_info,

988

gint match_num)

989

{

990

/* we cannot use pcre_get_substring() because it allocates the

991

* string using pcre_malloc(). */

992

gchar *match = NULL;

993

gint start, end;

994

995

g_return_val_if_fail (match_info != NULL, NULL);

996

g_return_val_if_fail (match_num >= 0, NULL);

997

998

/* match_num does not exist or it didn't matched, i.e. matching "b"

999

* against "(a)?b" then group 0 is empty. */

1000

if (!g_match_info_fetch_pos (match_info, match_num, &start, &end))

1001

match = NULL;

1002

else if (start == -1)

1003

match = g_strdup ("");

1004

else

1005

match = g_strndup (&match_info->string[start], end - start);

return match;

}

/**

* g_match_info_fetch_pos:

1012

* @match_info: #GMatchInfo structure

1013

* @match_num: number of the sub expression

1014

* @start_pos: (out) (allow-none): pointer to location where to store

1015

* the start position, or %NULL

1016

* @end_pos: (out) (allow-none): pointer to location where to store

1017

* the end position, or %NULL

1018

*

1019

* Retrieves the position in bytes of the @match_num'th capturing

1020

* parentheses. 0 is the full text of the match, 1 is the first

1021

* paren set, 2 the second, and so on.

1022

*

1023

* If @match_num is a valid sub pattern but it didn't match anything

1024

* (e.g. sub pattern 1, matching "b" against "(a)?b") then @start_pos

1025

* and @end_pos are set to -1 and %TRUE is returned.

1026

*

1027

* If the match was obtained using the DFA algorithm, that is using

1028

* g_regex_match_all() or g_regex_match_all_full(), the retrieved

1029

* position is not that of a set of parentheses but that of a matched

1030

* substring. Substrings are matched in reverse order of length, so

1031

* 0 is the longest match.

1032

*

1033

* Returns: %TRUE if the position was fetched, %FALSE otherwise. If

1034

* the position cannot be fetched, @start_pos and @end_pos are left

* unchanged

*

* Since: 2.14

*/

gboolean
g_match_info_fetch_pos (const GMatchInfo *match_info,
                        gint              match_num,
                        gint             *start_pos,
                        gint             *end_pos)
{
  gint slot;

  g_return_val_if_fail (match_info != NULL, FALSE);
  g_return_val_if_fail (match_num >= 0, FALSE);

  /* Only sub expressions below the number that were matched can be
   * queried. */
  if (match_info->matches <= match_num)
    return FALSE;

  /* Offsets are stored as consecutive (start, end) pairs. */
  slot = 2 * match_num;

  if (start_pos != NULL)
    *start_pos = match_info->offsets[slot];

  if (end_pos != NULL)
    *end_pos = match_info->offsets[slot + 1];

  return TRUE;
}

/*

* Returns number of first matched subpattern with name @name.

1064

* There may be more than one in case when DUPNAMES is used,

1065

* and not all subpatterns with that name match;

1066

* pcre_get_stringnumber() does not work in that case.

1067

*/

1068

static gint

1069

get_matched_substring_number (const GMatchInfo *match_info,

const gchar *name)

{

gint entrysize;

gchar *first, *last;

1074

guchar *entry;

1075

1076

if (!(match_info->regex->compile_opts & G_REGEX_DUPNAMES))

1077

return pcre_get_stringnumber (match_info->regex->pcre_re, name);

1078

1079

/* This code is copied from pcre_get.c: get_first_set() */

1080

entrysize = pcre_get_stringtable_entries (match_info->regex->pcre_re,

name,

&first,

&last);

if (entrysize <= 0)

1086

return entrysize;

1087

1088

for (entry = (guchar*) first; entry <= (guchar*) last; entry += entrysize)

1089

{

1090

gint n = (entry[0] << 8) + entry[1];

1091

if (match_info->offsets[n*2] >= 0)

return n;

}

return (first[0] << 8) + first[1];

}

/**

* g_match_info_fetch_named:

1100

* @match_info: #GMatchInfo structure

1101

* @name: name of the subexpression

1102

*

1103

* Retrieves the text matching the capturing parentheses named @name.

1104

*

1105

* If @name is a valid sub pattern name but it didn't match anything

1106

* (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b")

1107

* then an empty string is returned.

1108

*

1109

* The string is fetched from the string passed to the match function,

1110

* so you cannot call this function after freeing the string.

1111

*

1112

* Returns: (allow-none): The matched substring, or %NULL if an error

1113

* occurred. You have to free the string yourself

*

* Since: 2.14

*/

gchar *

g_match_info_fetch_named (const GMatchInfo *match_info,

1119

const gchar *name)

1120

{

1121

/* we cannot use pcre_get_named_substring() because it allocates the

1122

* string using pcre_malloc(). */

1123

gint num;

1124

1125

g_return_val_if_fail (match_info != NULL, NULL);

1126

g_return_val_if_fail (name != NULL, NULL);

1127

1128

num = get_matched_substring_number (match_info, name);

if (num < 0)

return NULL;

else

return g_match_info_fetch (match_info, num);

}

/**

* g_match_info_fetch_named_pos:

1137

* @match_info: #GMatchInfo structure

1138

* @name: name of the subexpression

1139

* @start_pos: (out) (allow-none): pointer to location where to store

1140

* the start position, or %NULL

1141

* @end_pos: (out) (allow-none): pointer to location where to store

1142

* the end position, or %NULL

1143

*

1144

* Retrieves the position in bytes of the capturing parentheses named @name.

1145

*

1146

* If @name is a valid sub pattern name but it didn't match anything

1147

* (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b")

1148

* then @start_pos and @end_pos are set to -1 and %TRUE is returned.

1149

*

1150

* Returns: %TRUE if the position was fetched, %FALSE otherwise.

1151

* If the position cannot be fetched, @start_pos and @end_pos

1152

* are left unchanged.

*

* Since: 2.14

*/

gboolean
g_match_info_fetch_named_pos (const GMatchInfo *match_info,
                              const gchar      *name,
                              gint             *start_pos,
                              gint             *end_pos)
{
  gint group;

  g_return_val_if_fail (match_info != NULL, FALSE);
  g_return_val_if_fail (name != NULL, FALSE);

  /* Resolve the name to a group number; a negative value means the
   * lookup failed. */
  group = get_matched_substring_number (match_info, name);

  return (group >= 0)
    ? g_match_info_fetch_pos (match_info, group, start_pos, end_pos)
    : FALSE;
}

/**

* g_match_info_fetch_all:

1176

* @match_info: a #GMatchInfo structure

1177

*

1178

* Bundles up pointers to each of the matching substrings from a match

1179

* and stores them in an array of gchar pointers. The first element in

1180

* the returned array is the match number 0, i.e. the entire matched

1181

* text.

1182

*

1183

* If a sub pattern didn't match anything (e.g. sub pattern 1, matching

1184

* "b" against "(a)?b") then an empty string is inserted.

1185

*

1186

* If the last match was obtained using the DFA algorithm, that is using

1187

* g_regex_match_all() or g_regex_match_all_full(), the retrieved

1188

* strings are not that matched by sets of parentheses but that of the

1189

* matched substring. Substrings are matched in reverse order of length,

1190

* so the first one is the longest match.

1191

*

1192

* The strings are fetched from the string passed to the match function,

1193

* so you cannot call this function after freeing the string.

1194

*

1195

* Returns: (transfer full): a %NULL-terminated array of gchar *

1196

* pointers. It must be freed using g_strfreev(). If the previous

1197

* match failed %NULL is returned

*

* Since: 2.14

*/

gchar **

g_match_info_fetch_all (const GMatchInfo *match_info)

1203

{

1204

/* we cannot use pcre_get_substring_list() because the returned value

1205

* isn't suitable for g_strfreev(). */

gchar **result;

gint i;

g_return_val_if_fail (match_info != NULL, NULL);

1210

1211

if (match_info->matches < 0)

1212

return NULL;

1213

1214

result = g_new (gchar *, match_info->matches + 1);

1215

for (i = 0; i < match_info->matches; i++)

1216

result[i] = g_match_info_fetch (match_info, i);

result[i] = NULL;

return result;

}

/* GRegex */

/* Quark definition for the regex error domain; presumably this expands to
 * g_regex_error_quark(), the function behind G_REGEX_ERROR (confirm against
 * the G_DEFINE_QUARK macro definition). */
G_DEFINE_QUARK (g-regex-error-quark, g_regex_error)

/**

* g_regex_ref:

* @regex: a #GRegex

1230

*

1231

* Increases reference count of @regex by 1.

*

* Returns: @regex

*

* Since: 2.14

*/

GRegex *
g_regex_ref (GRegex *regex)
{
  g_return_val_if_fail (regex != NULL, NULL);

  /* Bump the reference counter atomically. */
  g_atomic_int_inc (&regex->ref_count);

  /* Hand back the same pointer so calls can be chained. */
  return regex;
}

/**

* g_regex_unref:

* @regex: a #GRegex

1248

*

1249

* Decreases reference count of @regex by 1. When reference count drops

1250

* to zero, it frees all the memory associated with the regex structure.

*

* Since: 2.14

*/

void
g_regex_unref (GRegex *regex)
{
  g_return_if_fail (regex != NULL);

  /* Nothing is released while other references remain. */
  if (!g_atomic_int_dec_and_test (&regex->ref_count))
    return;

  /* Last reference dropped: free the pattern copy, the compiled
   * expression, the study data and finally the structure itself. */
  g_free (regex->pattern);

  if (regex->pcre_re != NULL)
    pcre_free (regex->pcre_re);

  if (regex->extra != NULL)
    pcre_free (regex->extra);

  g_free (regex);
}

/*

* @match_options: (inout) (optional):

1272

*/

1273

static pcre *regex_compile (const gchar *pattern,

1274

GRegexCompileFlags compile_options,

1275

GRegexCompileFlags *compile_options_out,

1276

GRegexMatchFlags *match_options,

GError **error);

/**

* g_regex_new:

* @pattern: the regular expression

1282

* @compile_options: compile options for the regular expression, or 0

1283

* @match_options: match options for the regular expression, or 0

1284

* @error: return location for a #GError

1285

*

1286

* Compiles the regular expression to an internal form, and does

1287

* the initial setup of the #GRegex structure.

1288

*

1289

* Returns: (nullable): a #GRegex structure or %NULL if an error occurred. Call

1290

* g_regex_unref() when you are done with it

*

* Since: 2.14

*/

GRegex *

g_regex_new (const gchar *pattern,

1296

GRegexCompileFlags compile_options,

1297

GRegexMatchFlags match_options,

GError **error)

{

GRegex *regex;

pcre *re;

const gchar *errmsg;

1303

gboolean optimize = FALSE;

1304

static volatile gsize initialised = 0;

1305

1306

g_return_val_if_fail (pattern != NULL, NULL);

1307

g_return_val_if_fail (error == NULL || *error == NULL, NULL);

1308

g_return_val_if_fail ((compile_options & ~G_REGEX_COMPILE_MASK) == 0, NULL);

1309

g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);

1310

1311

if (g_once_init_enter (&initialised))

1312

{

1313

int supports_utf8, supports_ucp;

1314

1315

pcre_config (PCRE_CONFIG_UTF8, &supports_utf8);

1316

if (!supports_utf8)

1317

g_critical (_("PCRE library is compiled without UTF8 support"));

1318

1319

pcre_config (PCRE_CONFIG_UNICODE_PROPERTIES, &supports_ucp);

1320

if (!supports_ucp)

1321

g_critical (_("PCRE library is compiled without UTF8 properties support"));

1322

1323

g_once_init_leave (&initialised, supports_utf8 && supports_ucp ? 1 : 2);

1324

}

1325

1326

if (G_UNLIKELY (initialised != 1))

1327

{

1328

g_set_error_literal (error, G_REGEX_ERROR, G_REGEX_ERROR_COMPILE,

1329

_("PCRE library is compiled with incompatible options"));

return NULL;

}

/* G_REGEX_OPTIMIZE has the same numeric value of PCRE_NO_UTF8_CHECK,

1334

* as we do not need to wrap PCRE_NO_UTF8_CHECK. */

1335

if (compile_options & G_REGEX_OPTIMIZE)

1336

optimize = TRUE;

1337

1338

re = regex_compile (pattern, compile_options, &compile_options,

1339

&match_options, error);

if (re == NULL)

return NULL;

regex = g_new0 (GRegex, 1);

1345

regex->ref_count = 1;

1346

regex->pattern = g_strdup (pattern);

1347

regex->pcre_re = re;

1348

regex->compile_opts = compile_options;

1349

regex->match_opts = match_options;

if (optimize)

{

regex->extra = pcre_study (regex->pcre_re, 0, &errmsg);

1354

if (errmsg != NULL)

1355

{

1356

GError *tmp_error = g_error_new (G_REGEX_ERROR,

1357

G_REGEX_ERROR_OPTIMIZE,

1358

_("Error while optimizing "

1359

"regular expression %s: %s"),

1360

regex->pattern,

1361

errmsg);

1362

g_propagate_error (error, tmp_error);

1363

1364

g_regex_unref (regex);

return NULL;

}

}

return regex;

}

static pcre *

regex_compile (const gchar *pattern,

1374

GRegexCompileFlags compile_options,

1375

GRegexCompileFlags *compile_options_out,

1376

GRegexMatchFlags *match_options,

GError **error)

{

pcre *re;

const gchar *errmsg;

1381

gint erroffset;

1382

gint errcode;

1383

GRegexCompileFlags nonpcre_compile_options;

1384

unsigned long int pcre_compile_options;

1385

1386

nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK;

1387

1388

/* In GRegex the string are, by default, UTF-8 encoded. PCRE

1389

* instead uses UTF-8 only if required with PCRE_UTF8. */

1390

if (compile_options & G_REGEX_RAW)

1391

{

1392

/* disable utf-8 */

1393

compile_options &= ~G_REGEX_RAW;

}

else

{

/* enable utf-8 */

compile_options |= PCRE_UTF8 | PCRE_NO_UTF8_CHECK;

1399

1400

if (match_options != NULL)

1401

*match_options |= PCRE_NO_UTF8_CHECK;

1402

}

1403

1404

/* PCRE_NEWLINE_ANY is the default for the internal PCRE but

1405

* not for the system one. */

1406

if (!(compile_options & G_REGEX_NEWLINE_CR) &&

1407

!(compile_options & G_REGEX_NEWLINE_LF))

1408

{

1409

compile_options |= PCRE_NEWLINE_ANY;

1410

}

1411

1412

compile_options |= PCRE_UCP;

1413

1414

/* PCRE_BSR_UNICODE is the default for the internal PCRE but

1415

* possibly not for the system one.

1416

*/

1417

if (~compile_options & G_REGEX_BSR_ANYCRLF)

1418

compile_options |= PCRE_BSR_UNICODE;

1419

1420

/* compile the pattern */

1421

re = pcre_compile2 (pattern, compile_options, &errcode,

1422

&errmsg, &erroffset, NULL);

1423

1424

/* if the compilation failed, set the error member and return

* immediately */

if (re == NULL)

{

GError *tmp_error;

/* Translate the PCRE error code to GRegexError and use a translated

1431

* error message if possible */

1432

translate_compile_error (&errcode, &errmsg);

1433

1434

/* PCRE uses byte offsets but we want to show character offsets */

1435

erroffset = g_utf8_pointer_to_offset (pattern, &pattern[erroffset]);

1436

1437

tmp_error = g_error_new (G_REGEX_ERROR, errcode,

1438

_("Error while compiling regular "

1439

"expression %s at char %d: %s"),

1440

pattern, erroffset, errmsg);

1441

g_propagate_error (error, tmp_error);

return NULL;

}

/* For options set at the beginning of the pattern, pcre puts them into

1447

* compile options, e.g. "(?i)foo" will make the pcre structure store

1448

* PCRE_CASELESS even though it wasn't explicitly given for compilation. */

1449

pcre_fullinfo (re, NULL, PCRE_INFO_OPTIONS, &pcre_compile_options);

1450

compile_options = pcre_compile_options & G_REGEX_COMPILE_PCRE_MASK;

1451

1452

/* Don't leak PCRE_NEWLINE_ANY, which is part of PCRE_NEWLINE_ANYCRLF */

1453

if ((pcre_compile_options & PCRE_NEWLINE_ANYCRLF) != PCRE_NEWLINE_ANYCRLF)

1454

compile_options &= ~PCRE_NEWLINE_ANY;

1455

1456

compile_options |= nonpcre_compile_options;

1457

1458

if (!(compile_options & G_REGEX_DUPNAMES))

1459

{

1460

gboolean jchanged = FALSE;

1461

pcre_fullinfo (re, NULL, PCRE_INFO_JCHANGED, &jchanged);

1462

if (jchanged)

1463

compile_options |= G_REGEX_DUPNAMES;

1464

}

1465

1466

if (compile_options_out != 0)

1467

*compile_options_out = compile_options;

return re;

}

/**

* g_regex_get_pattern:

1474

* @regex: a #GRegex structure

1475

*

1476

* Gets the pattern string associated with @regex, i.e. a copy of

1477

* the string passed to g_regex_new().

1478

*

1479

* Returns: the pattern of @regex

*

* Since: 2.14

*/

const gchar *

g_regex_get_pattern (const GRegex *regex)

1485

{

1486

g_return_val_if_fail (regex != NULL, NULL);

1487

1488

return regex->pattern;

}

/**

* g_regex_get_max_backref:

1493

* @regex: a #GRegex

1494

*

1495

* Returns the number of the highest back reference

1496

* in the pattern, or 0 if the pattern does not contain

1497

* back references.

1498

*

1499

* Returns: the number of the highest back reference

*

* Since: 2.14

*/

gint

g_regex_get_max_backref (const GRegex *regex)

{

gint value;

pcre_fullinfo (regex->pcre_re, regex->extra,

1509

PCRE_INFO_BACKREFMAX, &value);

return value;

}

/**

* g_regex_get_capture_count:

1516

* @regex: a #GRegex

1517

*

1518

* Returns the number of capturing subpatterns in the pattern.

1519

*

1520

* Returns: the number of capturing subpatterns

*

* Since: 2.14

*/

gint

g_regex_get_capture_count (const GRegex *regex)

{

gint value;

pcre_fullinfo (regex->pcre_re, regex->extra,

1530

PCRE_INFO_CAPTURECOUNT, &value);

return value;

}

/**

* g_regex_get_has_cr_or_lf:

1537

* @regex: a #GRegex structure

1538

*

1539

* Checks whether the pattern contains explicit CR or LF references.

1540

*

1541

* Returns: %TRUE if the pattern contains explicit CR or LF references

*

* Since: 2.34

*/

gboolean

g_regex_get_has_cr_or_lf (const GRegex *regex)

{

gint value;

pcre_fullinfo (regex->pcre_re, regex->extra,

1551

PCRE_INFO_HASCRORLF, &value);

return !!value;

}

/**

* g_regex_get_max_lookbehind:

1558

* @regex: a #GRegex structure

1559

*

1560

* Gets the number of characters in the longest lookbehind assertion in the

1561

* pattern. This information is useful when doing multi-segment matching using

1562

* the partial matching facilities.

1563

*

1564

* Returns: the number of characters in the longest lookbehind assertion.

*

* Since: 2.38

*/

gint

g_regex_get_max_lookbehind (const GRegex *regex)

1570

{

1571

gint max_lookbehind;

1572

1573

pcre_fullinfo (regex->pcre_re, regex->extra,

1574

PCRE_INFO_MAXLOOKBEHIND, &max_lookbehind);

1575

1576

return max_lookbehind;

}

/**

* g_regex_get_compile_flags:

1581

* @regex: a #GRegex

1582

*

1583

* Returns the compile options that @regex was created with.

1584

*

1585

* Returns: flags from #GRegexCompileFlags

*

* Since: 2.26

*/

GRegexCompileFlags

g_regex_get_compile_flags (const GRegex *regex)

1591

{

1592

g_return_val_if_fail (regex != NULL, 0);

1593

1594

return regex->compile_opts;

}

/**

* g_regex_get_match_flags:

1599

* @regex: a #GRegex

1600

*

1601

* Returns the match options that @regex was created with.

1602

*

1603

* Returns: flags from #GRegexMatchFlags

*

* Since: 2.26

*/

GRegexMatchFlags

g_regex_get_match_flags (const GRegex *regex)

1609

{

1610

g_return_val_if_fail (regex != NULL, 0);

1611

1612

return regex->match_opts & G_REGEX_MATCH_MASK;

}

/**

* g_regex_match_simple:

1617

* @pattern: the regular expression

1618

* @string: the string to scan for matches

1619

* @compile_options: compile options for the regular expression, or 0

1620

* @match_options: match options, or 0

1621

*

1622

* Scans for a match in @string for @pattern.

1623

*

1624

* This function is equivalent to g_regex_match() but it does not

1625

* require to compile the pattern with g_regex_new(), avoiding some

1626

* lines of code when you need just to do a match without extracting

1627

* substrings, capture counts, and so on.

1628

*

1629

* If this function is to be called on the same @pattern more than

1630

* once, it's more efficient to compile the pattern once with

1631

* g_regex_new() and then use g_regex_match().

1632

*

1633

* Returns: %TRUE if the string matched, %FALSE otherwise

*

* Since: 2.14

*/

gboolean

g_regex_match_simple (const gchar *pattern,

1639

const gchar *string,

1640

GRegexCompileFlags compile_options,

1641

GRegexMatchFlags match_options)

{

GRegex *regex;

gboolean result;

regex = g_regex_new (pattern, compile_options, 0, NULL);

1647

if (!regex)

1648

return FALSE;

1649

result = g_regex_match_full (regex, string, -1, 0, match_options, NULL, NULL);

1650

g_regex_unref (regex);

return result;

}

/**

* g_regex_match:

* @regex: a #GRegex structure from g_regex_new()

1657

* @string: the string to scan for matches

1658

* @match_options: match options

1659

* @match_info: (out) (allow-none): pointer to location where to store

1660

* the #GMatchInfo, or %NULL if you do not need it

1661

*

1662

* Scans for a match in string for the pattern in @regex.

1663

* The @match_options are combined with the match options specified

1664

* when the @regex structure was created, letting you have more

1665

* flexibility in reusing #GRegex structures.

1666

*

1667

* A #GMatchInfo structure, used to get information on the match,

1668

* is stored in @match_info if not %NULL. Note that if @match_info

1669

* is not %NULL then it is created even if the function returns %FALSE,

1670

* i.e. you must free it regardless if regular expression actually matched.

1671

*

1672

* To retrieve all the non-overlapping matches of the pattern in

1673

* string you can use g_match_info_next().

1674

*

1675

* |[

1676

* static void

1677

* print_uppercase_words (const gchar *string)

1678

* {

1679

* // Print all uppercase-only words.

1680

* GRegex *regex;

1681

* GMatchInfo *match_info;

1682

*

1683

* regex = g_regex_new ("[A-Z]+", 0, 0, NULL);

1684

* g_regex_match (regex, string, 0, &match_info);

1685

* while (g_match_info_matches (match_info))

1686

* {

1687

* gchar *word = g_match_info_fetch (match_info, 0);

1688

* g_print ("Found: %s\n", word);

1689

* g_free (word);

1690

* g_match_info_next (match_info, NULL);

1691

* }

1692

* g_match_info_free (match_info);

1693

* g_regex_unref (regex);

* }

* ]|

*

* @string is not copied and is used in #GMatchInfo internally. If

1698

* you use any #GMatchInfo method (except g_match_info_free()) after

1699

* freeing or modifying @string then the behaviour is undefined.

1700

*

1701

* Returns: %TRUE if the string matched, %FALSE otherwise

*

* Since: 2.14

*/

gboolean

g_regex_match (const GRegex *regex,

1707

const gchar *string,

1708

GRegexMatchFlags match_options,

1709

GMatchInfo **match_info)

1710

{

1711

return g_regex_match_full (regex, string, -1, 0, match_options,

match_info, NULL);

}

/**

* g_regex_match_full:

1717

* @regex: a #GRegex structure from g_regex_new()

1718

* @string: (array length=string_len): the string to scan for matches

1719

* @string_len: the length of @string, or -1 if @string is nul-terminated

1720

* @start_position: starting index of the string to match, in bytes

1721

* @match_options: match options

1722

* @match_info: (out) (allow-none): pointer to location where to store

1723

* the #GMatchInfo, or %NULL if you do not need it

1724

* @error: location to store the error occurring, or %NULL to ignore errors

1725

*

1726

* Scans for a match in string for the pattern in @regex.

1727

* The @match_options are combined with the match options specified

1728

* when the @regex structure was created, letting you have more

1729

* flexibility in reusing #GRegex structures.

1730

*

1731

* Setting @start_position differs from just passing over a shortened

1732

* string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern

1733

* that begins with any kind of lookbehind assertion, such as "\b".

1734

*

1735

* A #GMatchInfo structure, used to get information on the match, is

1736

* stored in @match_info if not %NULL. Note that if @match_info is

1737

* not %NULL then it is created even if the function returns %FALSE,

1738

* i.e. you must free it regardless if regular expression actually

1739

* matched.

1740

*

1741

* @string is not copied and is used in #GMatchInfo internally. If

1742

* you use any #GMatchInfo method (except g_match_info_free()) after

1743

* freeing or modifying @string then the behaviour is undefined.

1744

*

1745

* To retrieve all the non-overlapping matches of the pattern in

1746

* string you can use g_match_info_next().

1747

*

1748

* |[

1749

* static void

1750

* print_uppercase_words (const gchar *string)

1751

* {

1752

* // Print all uppercase-only words.

1753

* GRegex *regex;

1754

* GMatchInfo *match_info;

1755

* GError *error = NULL;

1756

*

1757

* regex = g_regex_new ("[A-Z]+", 0, 0, NULL);

1758

* g_regex_match_full (regex, string, -1, 0, 0, &match_info, &error);

1759

* while (g_match_info_matches (match_info))

1760

* {

1761

* gchar *word = g_match_info_fetch (match_info, 0);

1762

* g_print ("Found: %s\n", word);

1763

* g_free (word);

1764

* g_match_info_next (match_info, &error);

1765

* }

1766

* g_match_info_free (match_info);

1767

* g_regex_unref (regex);

1768

* if (error != NULL)

1769

* {

1770

* g_printerr ("Error while matching: %s\n", error->message);

1771

* g_error_free (error);

* }

* }

* ]|

*

* Returns: %TRUE if the string matched, %FALSE otherwise

*

* Since: 2.14

*/

gboolean

g_regex_match_full (const GRegex *regex,

1782

const gchar *string,

1783

gssize string_len,

1784

gint start_position,

1785

GRegexMatchFlags match_options,

1786

GMatchInfo **match_info,

GError **error)

{

GMatchInfo *info;

gboolean match_ok;

g_return_val_if_fail (regex != NULL, FALSE);

1793

g_return_val_if_fail (string != NULL, FALSE);

1794

g_return_val_if_fail (start_position >= 0, FALSE);

1795

g_return_val_if_fail (error == NULL || *error == NULL, FALSE);

1796

g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);

1797

1798

info = match_info_new (regex, string, string_len, start_position,

1799

match_options, FALSE);

1800

match_ok = g_match_info_next (info, error);

1801

if (match_info != NULL)

1802

*match_info = info;

1803

else

1804

g_match_info_free (info);

return match_ok;

}

/**

* g_regex_match_all:

1811

* @regex: a #GRegex structure from g_regex_new()

1812

* @string: the string to scan for matches

1813

* @match_options: match options

1814

* @match_info: (out) (allow-none): pointer to location where to store

1815

* the #GMatchInfo, or %NULL if you do not need it

1816

*

1817

* Using the standard algorithm for regular expression matching only

1818

* the longest match in the string is retrieved. This function uses

1819

* a different algorithm so it can retrieve all the possible matches.

1820

* For more documentation see g_regex_match_all_full().

1821

*

1822

* A #GMatchInfo structure, used to get information on the match, is

1823

* stored in @match_info if not %NULL. Note that if @match_info is

1824

* not %NULL then it is created even if the function returns %FALSE,

1825

* i.e. you must free it regardless if regular expression actually

1826

* matched.

1827

*

1828

* @string is not copied and is used in #GMatchInfo internally. If

1829

* you use any #GMatchInfo method (except g_match_info_free()) after

1830

* freeing or modifying @string then the behaviour is undefined.

1831

*

1832

* Returns: %TRUE if the string matched, %FALSE otherwise

*

* Since: 2.14

*/

gboolean

g_regex_match_all (const GRegex *regex,

1838

const gchar *string,

1839

GRegexMatchFlags match_options,

1840

GMatchInfo **match_info)

1841

{

1842

return g_regex_match_all_full (regex, string, -1, 0, match_options,

match_info, NULL);

}

/**

* g_regex_match_all_full:

1848

* @regex: a #GRegex structure from g_regex_new()

1849

* @string: (array length=string_len): the string to scan for matches

1850

* @string_len: the length of @string, or -1 if @string is nul-terminated

1851

* @start_position: starting index of the string to match, in bytes

1852

* @match_options: match options

1853

* @match_info: (out) (allow-none): pointer to location where to store

1854

* the #GMatchInfo, or %NULL if you do not need it

1855

* @error: location to store the error occurring, or %NULL to ignore errors

1856

*

1857

* Using the standard algorithm for regular expression matching only

1858

* the longest match in the string is retrieved, it is not possible

1859

* to obtain all the available matches. For instance matching

1860

* "<a> <b> <c>" against the pattern "<.*>"
* you get "<a> <b> <c>".

1862

*

1863

* This function uses a different algorithm (called DFA, i.e. deterministic

1864

* finite automaton), so it can retrieve all the possible matches, all

1865

* starting at the same point in the string. For instance matching

1866

* "<a> <b> <c>" against the pattern "<.*>"
* you would obtain three matches: "<a> <b> <c>",
* "<a> <b>" and "<a>".

1869

*

1870

* The number of matched strings is retrieved using

1871

* g_match_info_get_match_count(). To obtain the matched strings and

1872

* their position you can use, respectively, g_match_info_fetch() and

1873

* g_match_info_fetch_pos(). Note that the strings are returned in

1874

* reverse order of length; that is, the longest matching string is

1875

* given first.

1876

*

1877

* Note that the DFA algorithm is slower than the standard one and it

1878

* is not able to capture substrings, so backreferences do not work.

1879

*

1880

* Setting @start_position differs from just passing over a shortened

1881

* string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern

1882

* that begins with any kind of lookbehind assertion, such as "\b".

1883

*

1884

* A #GMatchInfo structure, used to get information on the match, is

1885

* stored in @match_info if not %NULL. Note that if @match_info is

1886

* not %NULL then it is created even if the function returns %FALSE,

1887

* i.e. you must free it regardless if regular expression actually

1888

* matched.

1889

*

1890

* @string is not copied and is used in #GMatchInfo internally. If

1891

* you use any #GMatchInfo method (except g_match_info_free()) after

1892

* freeing or modifying @string then the behaviour is undefined.

1893

*

1894

* Returns: %TRUE if the string matched, %FALSE otherwise

*

* Since: 2.14

*/

gboolean

g_regex_match_all_full (const GRegex *regex,

1900

const gchar *string,

1901

gssize string_len,

1902

gint start_position,

1903

GRegexMatchFlags match_options,

1904

GMatchInfo **match_info,

GError **error)

{

GMatchInfo *info;

gboolean done;

pcre *pcre_re;

pcre_extra *extra;

g_return_val_if_fail (regex != NULL, FALSE);

1913

g_return_val_if_fail (string != NULL, FALSE);

1914

g_return_val_if_fail (start_position >= 0, FALSE);

1915

g_return_val_if_fail (error == NULL || *error == NULL, FALSE);

1916

g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE);

1917

1918

#ifdef PCRE_NO_AUTO_POSSESS

1919

/* For PCRE >= 8.34 we need to turn off PCRE_NO_AUTO_POSSESS, which

1920

* is an optimization for normal regex matching, but results in omitting

1921

* some shorter matches here, and an observable behaviour change.

1922

*

1923

* DFA matching is rather niche, and very rarely used according to

1924

* codesearch.debian.net, so don't bother caching the recompiled RE. */

1925

pcre_re = regex_compile (regex->pattern,

1926

regex->compile_opts | PCRE_NO_AUTO_POSSESS,

1927

NULL, NULL, error);

1928

1929

if (pcre_re == NULL)

1930

return FALSE;

1931

1932

/* Not bothering to cache the optimization data either, with similar

* reasoning */

extra = NULL;

#else

/* For PCRE < 8.33 the precompiled regex is fine. */

1937

pcre_re = regex->pcre_re;

1938

extra = regex->extra;

1939

#endif

1940

1941

info = match_info_new (regex, string, string_len, start_position,

1942

match_options, TRUE);

done = FALSE;

while (!done)

{

done = TRUE;

info->matches = pcre_dfa_exec (pcre_re, extra,

1949

info->string, info->string_len,

1950

info->pos,

1951

regex->match_opts | match_options,

1952

info->offsets, info->n_offsets,

1953

info->workspace, info->n_workspace);

1954

if (info->matches == PCRE_ERROR_DFA_WSSIZE)

1955

{

1956

/* info->workspace is too small. */

1957

info->n_workspace *= 2;

1958

info->workspace = g_realloc (info->workspace,

1959

info->n_workspace * sizeof (gint));

1960

done = FALSE;

1961

}

1962

else if (info->matches == 0)

1963

{

1964

/* info->offsets is too small. */

1965

info->n_offsets *= 2;

1966

info->offsets = g_realloc (info->offsets,

1967

info->n_offsets * sizeof (gint));

1968

done = FALSE;

1969

}

1970

else if (IS_PCRE_ERROR (info->matches))

1971

{

1972

g_set_error (error, G_REGEX_ERROR, G_REGEX_ERROR_MATCH,

1973

_("Error while matching regular expression %s: %s"),

1974

regex->pattern, match_error (info->matches));

}

}

#ifdef PCRE_NO_AUTO_POSSESS

1979

pcre_free (pcre_re);

1980

#endif

1981

1982

/* set info->pos to -1 so that a call to g_match_info_next() fails. */

1983

info->pos = -1;

1984

1985

if (match_info != NULL)

1986

*match_info = info;

1987

else

1988

g_match_info_free (info);

1989

1990

return info->matches >= 0;

}

/**

* g_regex_get_string_number:

1995

* @regex: #GRegex structure

1996

* @name: name of the subexpression

1997

*

1998

* Retrieves the number of the subexpression named @name.

1999

*

2000

* Returns: The number of the subexpression or -1 if @name

* does not exists

*

* Since: 2.14

*/

gint

g_regex_get_string_number (const GRegex *regex,

const gchar *name)

{

gint num;

g_return_val_if_fail (regex != NULL, -1);

2012

g_return_val_if_fail (name != NULL, -1);

2013

2014

num = pcre_get_stringnumber (regex->pcre_re, name);

2015

if (num == PCRE_ERROR_NOSUBSTRING)

num = -1;

return num;

}

/**

* g_regex_split_simple:

2023

* @pattern: the regular expression

2024

* @string: the string to scan for matches

2025

* @compile_options: compile options for the regular expression, or 0

2026

* @match_options: match options, or 0

2027

*

2028

* Breaks the string on the pattern, and returns an array of

2029

* the tokens. If the pattern contains capturing parentheses,

2030

* then the text for each of the substrings will also be returned.

2031

* If the pattern does not match anywhere in the string, then the

2032

* whole string is returned as the first token.

2033

*

2034

* This function is equivalent to g_regex_split() but it does

2035

* not require to compile the pattern with g_regex_new(), avoiding

2036

* some lines of code when you need just to do a split without

2037

* extracting substrings, capture counts, and so on.

2038

*

2039

* If this function is to be called on the same @pattern more than

2040

* once, it's more efficient to compile the pattern once with

2041

* g_regex_new() and then use g_regex_split().

2042

*

2043

* As a special case, the result of splitting the empty string ""

2044

* is an empty vector, not a vector containing a single string.

2045

* The reason for this special case is that being able to represent

2046

* a empty vector is typically more useful than consistent handling

2047

* of empty elements. If you do need to represent empty elements,

2048

* you'll need to check for the empty string before calling this

2049

* function.

2050

*

2051

* A pattern that can match empty strings splits @string into

2052

* separate characters wherever it matches the empty string between

2053

* characters. For example splitting "ab c" using as a separator

2054

* "\s*", you will get "a", "b" and "c".

2055

*

2056

* Returns: (transfer full): a %NULL-terminated array of strings. Free

2057

* it using g_strfreev()

*

* Since: 2.14

**/

gchar **

g_regex_split_simple (const gchar *pattern,

2063

const gchar *string,

2064

GRegexCompileFlags compile_options,

2065

GRegexMatchFlags match_options)

{

GRegex *regex;

gchar **result;

regex = g_regex_new (pattern, compile_options, 0, NULL);

if (!regex)

return NULL;

result = g_regex_split_full (regex, string, -1, 0, match_options, 0, NULL);

2075

g_regex_unref (regex);

return result;

}

/**

* g_regex_split:

* @regex: a #GRegex structure

2082

* @string: the string to split with the pattern

2083

* @match_options: match time option flags

2084

*

2085

* Breaks the string on the pattern, and returns an array of the tokens.

2086

* If the pattern contains capturing parentheses, then the text for each

2087

* of the substrings will also be returned. If the pattern does not match

2088

* anywhere in the string, then the whole string is returned as the first

2089

* token.

2090

*

2091

* As a special case, the result of splitting the empty string "" is an

2092

* empty vector, not a vector containing a single string. The reason for

2093

* this special case is that being able to represent a empty vector is

2094

* typically more useful than consistent handling of empty elements. If

2095

* you do need to represent empty elements, you'll need to check for the

2096

* empty string before calling this function.

2097

*

2098

* A pattern that can match empty strings splits @string into separate

2099

* characters wherever it matches the empty string between characters.

2100

* For example splitting "ab c" using as a separator "\s*", you will get

2101

* "a", "b" and "c".

2102

*

2103

* Returns: (transfer full): a %NULL-terminated gchar ** array. Free

2104

* it using g_strfreev()

*

* Since: 2.14

**/

gchar **

g_regex_split (const GRegex *regex,

2110

const gchar *string,

2111

GRegexMatchFlags match_options)

2112

{

2113

return g_regex_split_full (regex, string, -1, 0,

2114

match_options, 0, NULL);

}

/**

* g_regex_split_full:

2119

* @regex: a #GRegex structure

2120

* @string: (array length=string_len): the string to split with the pattern

2121

* @string_len: the length of @string, or -1 if @string is nul-terminated

2122

* @start_position: starting index of the string to match, in bytes

2123

* @match_options: match time option flags

2124

* @max_tokens: the maximum number of tokens to split @string into.

2125

* If this is less than 1, the string is split completely

2126

* @error: return location for a #GError

2127

*

2128

* Breaks the string on the pattern, and returns an array of the tokens.

2129

* If the pattern contains capturing parentheses, then the text for each

2130

* of the substrings will also be returned. If the pattern does not match

2131

* anywhere in the string, then the whole string is returned as the first

2132

* token.

2133

*

2134

* As a special case, the result of splitting the empty string "" is an

2135

* empty vector, not a vector containing a single string. The reason for

2136

* this special case is that being able to represent a empty vector is

2137

* typically more useful than consistent handling of empty elements. If

2138

* you do need to represent empty elements, you'll need to check for the

2139

* empty string before calling this function.

2140

*

2141

* A pattern that can match empty strings splits @string into separate

2142

* characters wherever it matches the empty string between characters.

2143

* For example splitting "ab c" using as a separator "\s*", you will get

2144

* "a", "b" and "c".

2145

*

2146

* Setting @start_position differs from just passing over a shortened

2147

* string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern

2148

* that begins with any kind of lookbehind assertion, such as "\b".

2149

*

2150

* Returns: (transfer full): a %NULL-terminated gchar ** array. Free

2151

* it using g_strfreev()

*

* Since: 2.14

**/

gchar **

g_regex_split_full (const GRegex *regex,

2157

const gchar *string,

2158

gssize string_len,

2159

gint start_position,

2160

GRegexMatchFlags match_options,

gint max_tokens,

GError **error)

{

GError *tmp_error = NULL;

2165

GMatchInfo *match_info;

2166

GList *list, *last;

gint i;

gint token_count;

gboolean match_ok;

/* position of the last separator. */

2171

gint last_separator_end;

2172

/* was the last match 0 bytes long? */

2173

gboolean last_match_is_empty;

2174

/* the returned array of char **s */

2175

gchar **string_list;

2176

2177

g_return_val_if_fail (regex != NULL, NULL);

2178

g_return_val_if_fail (string != NULL, NULL);

2179

g_return_val_if_fail (start_position >= 0, NULL);

2180

g_return_val_if_fail (error == NULL || *error == NULL, NULL);

2181

g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);

2182

2183

if (max_tokens <= 0)

2184

max_tokens = G_MAXINT;

2185

2186

if (string_len < 0)

2187

string_len = strlen (string);

2188

2189

/* zero-length string */

2190

if (string_len - start_position == 0)

2191

return g_new0 (gchar *, 1);

2192

2193

if (max_tokens == 1)

2194

{

2195

string_list = g_new0 (gchar *, 2);

2196

string_list[0] = g_strndup (&string[start_position],

2197

string_len - start_position);

2198

return string_list;

}

list = NULL;

token_count = 0;

last_separator_end = start_position;

2204

last_match_is_empty = FALSE;

2205

2206

match_ok = g_regex_match_full (regex, string, string_len, start_position,

2207

match_options, &match_info, &tmp_error);

2208

2209

while (tmp_error == NULL)

{

if (match_ok)

{

last_match_is_empty =

2214

(match_info->offsets[0] == match_info->offsets[1]);

2215

2216

/* we need to skip empty separators at the same position of the end

2217

* of another separator. e.g. the string is "a b" and the separator

2218

* is " *", so from 1 to 2 we have a match and at position 2 we have

2219

* an empty match. */

2220

if (last_separator_end != match_info->offsets[1])

{

gchar *token;

gint match_count;

token = g_strndup (string + last_separator_end,

2226

match_info->offsets[0] - last_separator_end);

2227

list = g_list_prepend (list, token);

2228

token_count++;

2229

2230

/* if there were substrings, these need to be added to

2231

* the list. */

2232

match_count = g_match_info_get_match_count (match_info);

2233

if (match_count > 1)

2234

{

2235

for (i = 1; i < match_count; i++)

2236

list = g_list_prepend (list, g_match_info_fetch (match_info, i));

}

}

}

else

{

/* if there was no match, copy to end of string. */

2243

if (!last_match_is_empty)

2244

{

2245

gchar *token = g_strndup (string + last_separator_end,

2246

match_info->string_len - last_separator_end);

2247

list = g_list_prepend (list, token);

2248

}

2249

/* no more tokens, end the loop. */

break;

}

/* -1 to leave room for the last part. */

2254

if (token_count >= max_tokens - 1)

2255

{

2256

/* we have reached the maximum number of tokens, so we copy

2257

* the remaining part of the string. */

2258

if (last_match_is_empty)

2259

{

2260

/* the last match was empty, so we have moved one char

2261

* after the real position to avoid empty matches at the

2262

* same position. */

2263

match_info->pos = PREV_CHAR (regex, &string[match_info->pos]) - string;

2264

}

2265

/* the if is needed in the case we have terminated the available

2266

* tokens, but we are at the end of the string, so there are no

2267

* characters left to copy. */

2268

if (string_len > match_info->pos)

2269

{

2270

gchar *token = g_strndup (string + match_info->pos,

2271

string_len - match_info->pos);

2272

list = g_list_prepend (list, token);

2273

}

2274

/* end the loop. */

break;

}

last_separator_end = match_info->pos;

2279

if (last_match_is_empty)

2280

/* if the last match was empty, g_match_info_next() has moved

2281

* forward to avoid infinite loops, but we still need to copy that

2282

* character. */

2283

last_separator_end = PREV_CHAR (regex, &string[last_separator_end]) - string;

2284

2285

match_ok = g_match_info_next (match_info, &tmp_error);

2286

}

2287

g_match_info_free (match_info);

2288

if (tmp_error != NULL)

2289

{

2290

g_propagate_error (error, tmp_error);

2291

g_list_free_full (list, g_free);

2292

match_info->pos = -1;

return NULL;

}

string_list = g_new (gchar *, g_list_length (list) + 1);

2297

i = 0;

2298

for (last = g_list_last (list); last; last = g_list_previous (last))

2299

string_list[i++] = last->data;

2300

string_list[i] = NULL;

2301

g_list_free (list);

2302

2303

return string_list;

}

/* Kinds of element a parsed replacement string is made of; stored in
 * the type field of InterpolationData. */
enum
{
  REPL_TYPE_STRING             = 0,
  REPL_TYPE_CHARACTER          = 1,
  REPL_TYPE_SYMBOLIC_REFERENCE = 2,
  REPL_TYPE_NUMERIC_REFERENCE  = 3,
  REPL_TYPE_CHANGE_CASE        = 4
};

/* Case-conversion modes selected by the replacement escapes; one bit per
 * mode, plus masks grouping the modes that share a property. */
typedef enum
{
  CHANGE_CASE_NONE         = 0x01,
  CHANGE_CASE_UPPER        = 0x02,
  CHANGE_CASE_LOWER        = 0x04,
  CHANGE_CASE_UPPER_SINGLE = 0x08,
  CHANGE_CASE_LOWER_SINGLE = 0x10,
  /* modes that affect a single character only */
  CHANGE_CASE_SINGLE_MASK  = CHANGE_CASE_UPPER_SINGLE | CHANGE_CASE_LOWER_SINGLE,
  /* modes that convert to lower case */
  CHANGE_CASE_LOWER_MASK   = CHANGE_CASE_LOWER | CHANGE_CASE_LOWER_SINGLE,
  /* modes that convert to upper case */
  CHANGE_CASE_UPPER_MASK   = CHANGE_CASE_UPPER | CHANGE_CASE_UPPER_SINGLE
} ChangeCase;

2326

2327

/* One parsed element of a replacement string. The type field says which
 * of the other fields carries the payload. */
struct _InterpolationData
{
  /* literal text (REPL_TYPE_STRING) or group name
   * (REPL_TYPE_SYMBOLIC_REFERENCE); released with g_free() by
   * free_interpolation_data() */
  gchar *text;
  /* one of the REPL_TYPE_* constants */
  gint type;
  /* group number, set for REPL_TYPE_NUMERIC_REFERENCE */
  gint num;
  /* the character, set for REPL_TYPE_CHARACTER */
  gchar c;
  /* the requested mode, set for REPL_TYPE_CHANGE_CASE */
  ChangeCase change_case;
};

/* Releases an InterpolationData together with the text it owns. */
static void
free_interpolation_data (InterpolationData *data)
{
  gchar *owned_text = data->text;

  g_free (owned_text);
  g_free (data);
}

static const gchar *

2344

expand_escape (const gchar *replacement,

2345

const gchar *p,

2346

InterpolationData *data,

2347

GError **error)

2348

{

2349

const gchar *q, *r;

2350

gint x, d, h, i;

2351

const gchar *error_detail;

2352

gint base = 0;

2353

GError *tmp_error = NULL;

p++;

switch (*p)

{

case 't':

p++;

data->c = '\t';

data->type = REPL_TYPE_CHARACTER;

break;

case 'n':

p++;

data->c = '\n';

data->type = REPL_TYPE_CHARACTER;

break;

case 'v':

p++;

data->c = '\v';

data->type = REPL_TYPE_CHARACTER;

break;

case 'r':

p++;

data->c = '\r';

data->type = REPL_TYPE_CHARACTER;

break;

case 'f':

p++;

data->c = '\f';

data->type = REPL_TYPE_CHARACTER;

break;

case 'a':

p++;

data->c = '\a';

data->type = REPL_TYPE_CHARACTER;

break;

case 'b':

p++;

data->c = '\b';

data->type = REPL_TYPE_CHARACTER;

break;

case '\\':

p++;

data->c = '\\';

data->type = REPL_TYPE_CHARACTER;

break;

case 'x':

p++;

x = 0;

if (*p == '{')

{

p++;

do

{

h = g_ascii_xdigit_value (*p);

2407

if (h < 0)

2408

{

2409

error_detail = _("hexadecimal digit or '}' expected");

goto error;

}

x = x * 16 + h;

p++;

}

while (*p != '}');

p++;

}

else

{

for (i = 0; i < 2; i++)

2421

{

2422

h = g_ascii_xdigit_value (*p);

2423

if (h < 0)

2424

{

2425

error_detail = _("hexadecimal digit expected");

goto error;

}

x = x * 16 + h;

p++;

}

}

data->type = REPL_TYPE_STRING;

2433

data->text = g_new0 (gchar, 8);

2434

g_unichar_to_utf8 (x, data->text);

break;

case 'l':

p++;

data->type = REPL_TYPE_CHANGE_CASE;

2439

data->change_case = CHANGE_CASE_LOWER_SINGLE;

break;

case 'u':

p++;

data->type = REPL_TYPE_CHANGE_CASE;

2444

data->change_case = CHANGE_CASE_UPPER_SINGLE;

break;

case 'L':

p++;

data->type = REPL_TYPE_CHANGE_CASE;

2449

data->change_case = CHANGE_CASE_LOWER;

break;

case 'U':

p++;

data->type = REPL_TYPE_CHANGE_CASE;

2454

data->change_case = CHANGE_CASE_UPPER;

break;

case 'E':

p++;

data->type = REPL_TYPE_CHANGE_CASE;

2459

data->change_case = CHANGE_CASE_NONE;

break;

case 'g':

p++;

if (*p != '<')

{

error_detail = _("missing '<' in symbolic reference");

goto error;

}

q = p + 1;

do

{

p++;

if (!*p)

{

error_detail = _("unfinished symbolic reference");

goto error;

}

}

while (*p != '>');

if (p - q == 0)

{

error_detail = _("zero-length symbolic reference");

2482

goto error;

2483

}

2484

if (g_ascii_isdigit (*q))

{

x = 0;

do

{

h = g_ascii_digit_value (*q);

2490

if (h < 0)

2491

{

2492

error_detail = _("digit expected");

p = q;

goto error;

}

x = x * 10 + h;

q++;

}

while (q != p);

data->num = x;

data->type = REPL_TYPE_NUMERIC_REFERENCE;

}

else

{

r = q;

do

{

if (!g_ascii_isalnum (*r))

2509

{

2510

error_detail = _("illegal symbolic reference");

p = r;

goto error;

}

r++;

}

while (r != p);

data->text = g_strndup (q, p - q);

2518

data->type = REPL_TYPE_SYMBOLIC_REFERENCE;

}

p++;

break;

case '0':

/* if \0 is followed by a number is an octal number representing a

2524

* character, else it is a numeric reference. */

2525

if (g_ascii_digit_value (*g_utf8_next_char (p)) >= 0)

2526

{

2527

base = 8;

2528

p = g_utf8_next_char (p);

}

case '1':

case '2':

case '3':

case '4':

case '5':

case '6':

case '7':

case '8':

case '9':

x = 0;

d = 0;

for (i = 0; i < 3; i++)

2542

{

2543

h = g_ascii_digit_value (*p);

if (h < 0)

break;

if (h > 7)

{

if (base == 8)

break;

else

base = 10;

}

if (i == 2 && base == 10)

break;

x = x * 8 + h;

d = d * 10 + h;

p++;

}

if (base == 8 || i == 3)

2560

{

2561

data->type = REPL_TYPE_STRING;

2562

data->text = g_new0 (gchar, 8);

2563

g_unichar_to_utf8 (x, data->text);

}

else

{

data->type = REPL_TYPE_NUMERIC_REFERENCE;

data->num = d;

}

break;

case 0:

error_detail = _("stray final '\\'");

goto error;

break;

default:

error_detail = _("unknown escape sequence");

goto error;

}

return p;

error:

/* G_GSSIZE_FORMAT doesn't work with gettext, so we use %lu */

2584

tmp_error = g_error_new (G_REGEX_ERROR,

2585

G_REGEX_ERROR_REPLACE,

2586

_("Error while parsing replacement "

2587

"text \"%s\" at char %lu: %s"),

2588

replacement,

2589

(gulong)(p - replacement),

2590

error_detail);

2591

g_propagate_error (error, tmp_error);

return NULL;

}

static GList *

split_replacement (const gchar *replacement,

2598

GError **error)

2599

{

2600

GList *list = NULL;

2601

InterpolationData *data;

2602

const gchar *p, *start;

2603

2604

start = p = replacement;

while (*p)

{

if (*p == '\\')

{

data = g_new0 (InterpolationData, 1);

2610

start = p = expand_escape (replacement, p, data, error);

2611

if (p == NULL)

2612

{

2613

g_list_free_full (list, (GDestroyNotify) free_interpolation_data);

2614

free_interpolation_data (data);

return NULL;

}

list = g_list_prepend (list, data);

}

else

{

p++;

if (*p == '\\' || *p == '\0')

{

if (p - start > 0)

{

data = g_new0 (InterpolationData, 1);

2628

data->text = g_strndup (start, p - start);

2629

data->type = REPL_TYPE_STRING;

2630

list = g_list_prepend (list, data);

}

}

}

}

return g_list_reverse (list);

2637

}

2638

2639

/* Change the case of c based on change_case: the result is
 * g_unichar_tolower (c) when any bit of CHANGE_CASE_LOWER_MASK is set in
 * change_case and g_unichar_toupper (c) in every other case. Note that
 * CHANGE_CASE_NONE therefore selects the upper-case branch. */
#define CHANGE_CASE(c, change_case) \
        (((change_case) & CHANGE_CASE_LOWER_MASK) ? \
                g_unichar_tolower (c) : \
                g_unichar_toupper (c))

2644

2645

static void

2646

string_append (GString *string,

2647

const gchar *text,

2648

ChangeCase *change_case)

{

gunichar c;

if (text[0] == '\0')

2653

return;

2654

2655

if (*change_case == CHANGE_CASE_NONE)

2656

{

2657

g_string_append (string, text);

2658

}

2659

else if (*change_case & CHANGE_CASE_SINGLE_MASK)

2660

{

2661

c = g_utf8_get_char (text);

2662

g_string_append_unichar (string, CHANGE_CASE (c, *change_case));

2663

g_string_append (string, g_utf8_next_char (text));

2664

*change_case = CHANGE_CASE_NONE;

}

else

{

while (*text != '\0')

2669

{

2670

c = g_utf8_get_char (text);

2671

g_string_append_unichar (string, CHANGE_CASE (c, *change_case));

2672

text = g_utf8_next_char (text);

}

}

}

/* Evaluation callback used by g_regex_replace(): @data is the list
 * produced by split_replacement(). Each element is expanded against
 * @match_info and appended to @result, honouring the case-change
 * elements met along the way. Always returns FALSE. */
static gboolean
interpolate_replacement (const GMatchInfo *match_info,
                         GString          *result,
                         gpointer          data)
{
  GList *node;
  ChangeCase mode = CHANGE_CASE_NONE;

  for (node = data; node != NULL; node = node->next)
    {
      InterpolationData *piece = node->data;
      gchar *captured = NULL;

      switch (piece->type)
        {
        case REPL_TYPE_CHANGE_CASE:
          mode = piece->change_case;
          continue;
        case REPL_TYPE_STRING:
          string_append (result, piece->text, &mode);
          continue;
        case REPL_TYPE_CHARACTER:
          g_string_append_c (result, CHANGE_CASE (piece->c, mode));
          if (mode & CHANGE_CASE_SINGLE_MASK)
            mode = CHANGE_CASE_NONE;
          continue;
        case REPL_TYPE_NUMERIC_REFERENCE:
          captured = g_match_info_fetch (match_info, piece->num);
          break;
        case REPL_TYPE_SYMBOLIC_REFERENCE:
          captured = g_match_info_fetch_named (match_info, piece->text);
          break;
        default:
          continue;
        }

      /* both kinds of reference end up here; a reference that yields
       * no string appends nothing */
      if (captured != NULL)
        {
          string_append (result, captured, &mode);
          g_free (captured);
        }
    }

  return FALSE;
}

/* whether actual match_info is needed for replacement, i.e.

2726

* whether there are references

2727

*/

2728

static gboolean

2729

interpolation_list_needs_match (GList *list)

2730

{

2731

while (list != NULL)

2732

{

2733

InterpolationData *data = list->data;

2734

2735

if (data->type == REPL_TYPE_SYMBOLIC_REFERENCE ||

2736

data->type == REPL_TYPE_NUMERIC_REFERENCE)

{

return TRUE;

}

list = list->next;

}

return FALSE;

}

/**

* g_regex_replace:

* @regex: a #GRegex structure

2750

* @string: (array length=string_len): the string to perform matches against

2751

* @string_len: the length of @string, or -1 if @string is nul-terminated

2752

* @start_position: starting index of the string to match, in bytes

2753

* @replacement: text to replace each match with

2754

* @match_options: options for the match

2755

* @error: location to store the error occurring, or %NULL to ignore errors

2756

*

2757

* Replaces all occurrences of the pattern in @regex with the

2758

* replacement text. Backreferences of the form '\number' or

2759

* '\g<number>' in the replacement text are interpolated by the

2760

* number-th captured subexpression of the match, '\g<name>' refers

2761

* to the captured subexpression with the given name. '\0' refers

2762

* to the complete match, but '\0' followed by a number is the octal

2763

* representation of a character. To include a literal '\' in the

2764

* replacement, write '\\'.

2765

*

2766

* There are also escapes that changes the case of the following text:

2767

*

2768

* - \l: Convert to lower case the next character

2769

* - \u: Convert to upper case the next character

2770

* - \L: Convert to lower case till \E

2771

* - \U: Convert to upper case till \E

2772

* - \E: End case modification

2773

*

2774

* If you do not need to use backreferences use g_regex_replace_literal().

2775

*

2776

* The @replacement string must be UTF-8 encoded even if #G_REGEX_RAW was

2777

* passed to g_regex_new(). If you want to use not UTF-8 encoded stings

2778

* you can use g_regex_replace_literal().

2779

*

2780

* Setting @start_position differs from just passing over a shortened

2781

* string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that

2782

* begins with any kind of lookbehind assertion, such as "\b".

2783

*

2784

* Returns: a newly allocated string containing the replacements

*

* Since: 2.14

*/

gchar *

g_regex_replace (const GRegex *regex,

2790

const gchar *string,

2791

gssize string_len,

2792

gint start_position,

2793

const gchar *replacement,

2794

GRegexMatchFlags match_options,

GError **error)

{

gchar *result;

GList *list;

GError *tmp_error = NULL;

2800

2801

g_return_val_if_fail (regex != NULL, NULL);

2802

g_return_val_if_fail (string != NULL, NULL);

2803

g_return_val_if_fail (start_position >= 0, NULL);

2804

g_return_val_if_fail (replacement != NULL, NULL);

2805

g_return_val_if_fail (error == NULL || *error == NULL, NULL);

2806

g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);

2807

2808

list = split_replacement (replacement, &tmp_error);

2809

if (tmp_error != NULL)

2810

{

2811

g_propagate_error (error, tmp_error);

return NULL;

}

result = g_regex_replace_eval (regex,

2816

string, string_len, start_position,

2817

match_options,

2818

interpolate_replacement,

2819

(gpointer)list,

2820

&tmp_error);

2821

if (tmp_error != NULL)

2822

g_propagate_error (error, tmp_error);

2823

2824

g_list_free_full (list, (GDestroyNotify) free_interpolation_data);

return result;

}

/* Evaluation callback used by g_regex_replace_literal(): @data is the
 * replacement text, appended to @result unchanged. Always returns FALSE. */
static gboolean
literal_replacement (const GMatchInfo *match_info,
                     GString          *result,
                     gpointer          data)
{
  const gchar *literal = data;

  g_string_append (result, literal);

  return FALSE;
}

/**

* g_regex_replace_literal:

2840

* @regex: a #GRegex structure

2841

* @string: (array length=string_len): the string to perform matches against

2842

* @string_len: the length of @string, or -1 if @string is nul-terminated

2843

* @start_position: starting index of the string to match, in bytes

2844

* @replacement: text to replace each match with

2845

* @match_options: options for the match

2846

* @error: location to store the error occurring, or %NULL to ignore errors

2847

*

2848

* Replaces all occurrences of the pattern in @regex with the

2849

* replacement text. @replacement is replaced literally, to

2850

* include backreferences use g_regex_replace().

2851

*

2852

* Setting @start_position differs from just passing over a

2853

* shortened string and setting #G_REGEX_MATCH_NOTBOL in the

2854

* case of a pattern that begins with any kind of lookbehind

2855

* assertion, such as "\b".

2856

*

2857

* Returns: a newly allocated string containing the replacements

*

* Since: 2.14

*/

gchar *

g_regex_replace_literal (const GRegex *regex,

2863

const gchar *string,

2864

gssize string_len,

2865

gint start_position,

2866

const gchar *replacement,

2867

GRegexMatchFlags match_options,

2868

GError **error)

2869

{

2870

g_return_val_if_fail (replacement != NULL, NULL);

2871

g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);

2872

2873

return g_regex_replace_eval (regex,

2874

string, string_len, start_position,

2875

match_options,

2876

literal_replacement,

2877

(gpointer)replacement,

error);

}

/**

* g_regex_replace_eval:

2883

* @regex: a #GRegex structure from g_regex_new()

2884

* @string: (array length=string_len): string to perform matches against

2885

* @string_len: the length of @string, or -1 if @string is nul-terminated

2886

* @start_position: starting index of the string to match, in bytes

2887

* @match_options: options for the match

2888

* @eval: a function to call for each match

2889

* @user_data: user data to pass to the function

2890

* @error: location to store the error occurring, or %NULL to ignore errors

2891

*

2892

* Replaces occurrences of the pattern in regex with the output of

2893

* @eval for that occurrence.

2894

*

2895

* Setting @start_position differs from just passing over a shortened

2896

* string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern

2897

* that begins with any kind of lookbehind assertion, such as "\b".

2898

*

2899

* The following example uses g_regex_replace_eval() to replace multiple

2900

* strings at once:

2901

* |[

2902

* static gboolean

2903

* eval_cb (const GMatchInfo *info,

* GString *res,

* gpointer data)

* {

* gchar *match;

* gchar *r;

*

* match = g_match_info_fetch (info, 0);

2911

* r = g_hash_table_lookup ((GHashTable *)data, match);

2912

* g_string_append (res, r);

* g_free (match);

*

* return FALSE;

* }

*

* ...

*

* GRegex *reg;

* GHashTable *h;

* gchar *res;

*

* h = g_hash_table_new (g_str_hash, g_str_equal);

2925

*

2926

* g_hash_table_insert (h, "1", "ONE");

2927

* g_hash_table_insert (h, "2", "TWO");

2928

* g_hash_table_insert (h, "3", "THREE");

2929

* g_hash_table_insert (h, "4", "FOUR");

2930

*

2931

* reg = g_regex_new ("1|2|3|4", 0, 0, NULL);

2932

* res = g_regex_replace_eval (reg, text, -1, 0, 0, eval_cb, h, NULL);

2933

* g_hash_table_destroy (h);

*

* ...

* ]|

*

* Returns: a newly allocated string containing the replacements

*

* Since: 2.14

*/

gchar *

g_regex_replace_eval (const GRegex *regex,

2944

const gchar *string,

2945

gssize string_len,

2946

gint start_position,

2947

GRegexMatchFlags match_options,

2948

GRegexEvalCallback eval,

2949

gpointer user_data,

2950

GError **error)

2951

{

2952

GMatchInfo *match_info;

2953

GString *result;

2954

gint str_pos = 0;

2955

gboolean done = FALSE;

2956

GError *tmp_error = NULL;

2957

2958

g_return_val_if_fail (regex != NULL, NULL);

2959

g_return_val_if_fail (string != NULL, NULL);

2960

g_return_val_if_fail (start_position >= 0, NULL);

2961

g_return_val_if_fail (eval != NULL, NULL);

2962

g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL);

2963

2964

if (string_len < 0)

2965

string_len = strlen (string);

2966

2967

result = g_string_sized_new (string_len);

2968

2969

/* run down the string making matches. */

2970

g_regex_match_full (regex, string, string_len, start_position,

2971

match_options, &match_info, &tmp_error);

2972

while (!done && g_match_info_matches (match_info))

2973

{

2974

g_string_append_len (result,

2975

string + str_pos,

2976

match_info->offsets[0] - str_pos);

2977

done = (*eval) (match_info, result, user_data);

2978

str_pos = match_info->offsets[1];

2979

g_match_info_next (match_info, &tmp_error);

2980

}

2981

g_match_info_free (match_info);

2982

if (tmp_error != NULL)

2983

{

2984

g_propagate_error (error, tmp_error);

2985

g_string_free (result, TRUE);

return NULL;

}

g_string_append_len (result, string + str_pos, string_len - str_pos);

2990

return g_string_free (result, FALSE);

}

/**
 * g_regex_check_replacement:
 * @replacement: the replacement string
 * @has_references: (out) (allow-none): location to store information about
 *   references in @replacement or %NULL
 * @error: location to store error
 *
 * Checks whether @replacement is a valid replacement string
 * (see g_regex_replace()), i.e. that every escape sequence in
 * it is valid.
 *
 * If @has_references is not %NULL then @replacement is also checked
 * for pattern references. For instance, the replacement text 'foo\n'
 * contains no references and can be evaluated without information
 * about an actual match, whereas '\0\1' (whole match followed by the
 * first subpattern) requires a valid #GMatchInfo object.
 *
 * Returns: whether @replacement is a valid replacement string
 *
 * Since: 2.14
 */
gboolean
g_regex_check_replacement (const gchar  *replacement,
                           gboolean     *has_references,
                           GError      **error)
{
  GError *parse_error = NULL;
  GList *pieces;

  pieces = split_replacement (replacement, &parse_error);
  if (parse_error != NULL)
    {
      g_propagate_error (error, parse_error);
      return FALSE;
    }

  if (has_references != NULL)
    *has_references = interpolation_list_needs_match (pieces);

  g_list_free_full (pieces, (GDestroyNotify) free_interpolation_data);

  return TRUE;
}

/**
 * g_regex_escape_nul:
 * @string: the string to escape
 * @length: the length of @string
 *
 * Escapes the nul characters in @string to "\x00". This makes it
 * possible to compile a regex with embedded nul characters.
 *
 * For completeness, @length can be -1 for a nul-terminated string.
 * In that case the output string is simply a copy of @string.
 *
 * Returns: a newly-allocated escaped string
 *
 * Since: 2.30
 */
gchar *
g_regex_escape_nul (const gchar *string,
                    gint         length)
{
  GString *out;
  const gchar *cursor, *chunk, *limit;
  /* number of consecutive backslashes immediately before cursor */
  gint run = 0;

  g_return_val_if_fail (string != NULL, NULL);

  if (length < 0)
    return g_strdup (string);

  limit = string + length;
  chunk = cursor = string;
  out = g_string_sized_new (length + 1);

  while (cursor < limit)
    {
      if (*cursor == '\\')
        {
          run++;
          cursor++;
        }
      else if (*cursor != '\0')
        {
          run = 0;
          cursor = g_utf8_next_char (cursor);
        }
      else
        {
          /* flush the text gathered since the previous nul */
          if (cursor != chunk)
            g_string_append_len (out, chunk, cursor - chunk);

          /* after an odd run of backslashes the last one already
           * serves as the escape's backslash, so none is added */
          if ((run & 1) == 0)
            g_string_append_c (out, '\\');
          g_string_append_c (out, 'x');
          g_string_append_c (out, '0');
          g_string_append_c (out, '0');

          cursor++;
          chunk = cursor;
          run = 0;
        }
    }

  /* copy whatever follows the last nul */
  if (chunk < limit)
    g_string_append_len (out, chunk, limit - chunk);

  return g_string_free (out, FALSE);
}

/**
 * g_regex_escape_string:
 * @string: (array length=length): the string to escape
 * @length: the length of @string, or -1 if @string is nul-terminated
 *
 * Escapes the special characters used for regular expressions
 * in @string, for instance "a.b*c" becomes "a\.b\*c". This
 * function is useful to dynamically generate regular expressions.
 *
 * @string can contain nul characters that are replaced with "\0",
 * in this case remember to specify the correct length of @string
 * in @length. A nul that is directly followed by one of the digits
 * 0-7 is written as "\000" instead, so that the digit is not read
 * as part of the octal escape.
 *
 * Returns: a newly-allocated escaped string
 *
 * Since: 2.14
 */
gchar *
g_regex_escape_string (const gchar *string,
                       gint         length)
{
  GString *escaped;
  const char *p, *piece_start, *end;

  g_return_val_if_fail (string != NULL, NULL);

  if (length < 0)
    length = strlen (string);

  end = string + length;
  p = piece_start = string;
  /* Compute the size hint in gsize so that length == G_MAXINT
   * does not overflow a signed int. */
  escaped = g_string_sized_new ((gsize) length + 1);

  while (p < end)
    {
      switch (*p)
        {
        case '\0':
        case '\\':
        case '|':
        case '(':
        case ')':
        case '[':
        case ']':
        case '{':
        case '}':
        case '^':
        case '$':
        case '*':
        case '+':
        case '?':
        case '.':
          if (p != piece_start)
            /* copy the previous piece. */
            g_string_append_len (escaped, piece_start, p - piece_start);
          g_string_append_c (escaped, '\\');
          if (*p == '\0')
            {
              g_string_append_c (escaped, '0');
              /* "\0" absorbs up to two following octal digits, so a
               * literal digit after the nul would change the meaning
               * of the escape. Pad to the full three-digit form. */
              if (p + 1 < end && p[1] >= '0' && p[1] <= '7')
                g_string_append_len (escaped, "00", 2);
            }
          else
            g_string_append_c (escaped, *p);
          piece_start = ++p;
          break;
        default:
          /* Step one byte at a time. None of the escaped characters
           * occur as UTF-8 continuation bytes, so this gives the same
           * result as stepping by character on valid UTF-8, while on
           * raw or malformed input it can neither skip a special
           * character nor run past end. */
          ++p;
          break;
        }
    }

  if (piece_start < end)
    g_string_append_len (escaped, piece_start, end - piece_start);

  return g_string_free (escaped, FALSE);
}

nexmon – Blame information for rev 1