WebSVN – wasCSharpSQLite – Blame – Rev 4 – /trunk/TCL/src/regexp

    /// <summary>
    /// Symbolic names of the opcodes, indexed by opcode value. <c>ToString</c>
    /// looks names up here for every opcode that is below <c>OPEN</c>.
    /// </summary>
    internal static readonly string[] opnames =
    {
        "END", "BOL", "EOL", "ANY", "ANYOF", "ANYBUT",
        "BRANCH", "BACK", "EXACTLY", "NOTHING", "STAR", "PLUS"
    };

261

262

/*
* A node is one char of opcode followed by one char of "next" pointer.
* The "next" value is a positive offset from the opcode of the node
* containing it. An operand, if any, simply follows the node. (Note that
* much of the code generation knows about this implicit relationship.)
*
* Opcode notes:
*
* BRANCH    The set of branches constituting a single choice are hooked
*           together with their "next" pointers, since precedence prevents
*           anything being concatenated to any individual branch. The
*           "next" pointer of the last BRANCH in a choice points to the
*           thing following the whole choice. This is also where the
*           final "next" pointer of each individual branch points; each
*           branch starts with the operand node of a BRANCH node.
*
* ANYOF, ANYBUT, EXACTLY
*           The format of a string operand is one char of length
*           followed by the characters making up the string.
*
* BACK      Normal "next" pointers all implicitly point forward; BACK
*           exists to make loop structures possible.
*
* STAR, PLUS
*           '?', and complex '*' and '+', are implemented as circular
*           BRANCH structures using BACK. Simple cases (one character
*           per match) are implemented with STAR and PLUS for speed
*           and to minimize recursive plunges.
*
* OPENn, CLOSEn
*           are numbered at compile time.
*/

/// <summary> The compiled program: opcode/next-pointer nodes plus their operands.</summary>
internal char[] program;

/// <summary> Whether matching is case insensitive (set by the two-argument constructor).</summary>
internal bool ignoreCase;

/// <summary>
/// The number of parenthesized subexpressions in the pattern, plus 1 for the
/// match of the whole pattern itself.
/// </summary>
internal int npar;

/// <summary>
/// <c>true</c> if the pattern must match at the beginning of the string, so
/// that only a single starting location has to be tried.
/// </summary>
internal bool anchored;

/// <summary>
/// The character every match must begin with, or <c>-1</c> when no such
/// character is known. Set by <c>compile</c>; used by <c>exec</c> to skip
/// directly to candidate starting locations.
/// </summary>
internal int startChar;

/// <summary>
/// Literal string that must appear in any match. NOTE(review): the code in
/// <c>compile</c> that would compute it is commented out, so in the code
/// visible here this field is never assigned; it is only printed by
/// <c>ToString</c>.
/// </summary>
internal string must;

315

316

/// <summary>
/// Compiles a new case-sensitive Regexp object from the given regular
/// expression pattern.
/// <para>
/// Parsing and validating a pattern takes a certain amount of time. A caller
/// that caches the Regexp object saves that time, because the same Regexp can
/// be used against many different strings.
/// </para>
/// </summary>
/// <param name="pat">The string holding the regular expression pattern.</param>
/// <exception cref="System.ArgumentException">
/// The pattern is malformed. The exception message indicates how.
/// </exception>
public Regexp( string pat )
    : this( pat, false )
{
}

/// <summary>
/// Compiles a new Regexp object from the given regular expression pattern,
/// optionally matching without regard to case.
/// </summary>
/// <param name="pat">The string holding the regular expression pattern.</param>
/// <param name="ignoreCase">
/// If <c>true</c>, this regular expression does case-insensitive matching;
/// if <c>false</c>, matches are case-sensitive. Regular expressions created
/// by the single-argument constructor are case-sensitive.
/// </param>
/// <exception cref="System.ArgumentException">
/// The pattern is malformed. The exception message indicates how.
/// </exception>
public Regexp( string pat, bool ignoreCase )
{
    this.ignoreCase = ignoreCase;

    // Case-insensitivity is implemented by lower-casing the pattern here and
    // the subject string in exec. Both use string.ToLower(), which follows
    // the current culture, so they stay consistent with each other.
    string effectivePattern = ignoreCase ? pat.ToLower() : pat;
    compile( effectivePattern );
}

/// <summary>
/// Returns the number of parenthesized subexpressions in this regular
/// expression, plus one more for the expression itself.
/// </summary>
/// <returns>The value of <c>npar</c> recorded when the pattern was compiled.</returns>
public int subspecs()
{
    return this.npar;
}

/// <summary>
/// Matches the given string against this regular expression.
/// </summary>
/// <param name="str">The string to match.</param>
/// <returns>
/// The substring of <c>str</c> that matched the entire regular expression,
/// or <c>null</c> if the string did not match.
/// </returns>
public string match( string str )
{
    Match found = exec( str, 0, 0 );
    if ( found != null )
    {
        // indices[0] / indices[1] delimit the whole match (end exclusive).
        int begin = found.indices[0];
        int finish = found.indices[1];
        return str.Substring( begin, finish - begin );
    }

    return null;
}

398

399

/// <summary>
/// Matches the given string against this regular expression and extracts
/// the substrings that matched the parenthesized subexpressions.
/// <para>
/// <c>substrs[0]</c> receives the part of <c>str</c> that matched the entire
/// regular expression, <c>substrs[1]</c> the part that matched the first
/// (leftmost) parenthesized subexpression, and <c>substrs[n]</c> the part
/// that matched the <c>n</c>th subexpression.
/// </para>
/// <para>
/// If subexpression <c>n</c> did not take part in the match,
/// <c>substrs[n]</c> is set to <c>null</c>. This is distinct from "", which
/// is the value for a subexpression that matched zero characters.
/// </para>
/// <para>
/// The natural length for <c>substrs</c> is the return value of
/// <c>subspecs</c>. The array may be shorter (only the leading entries are
/// reported), longer (the surplus elements are set to <c>null</c>), or
/// <c>null</c> (subexpressions are ignored).
/// </para>
/// </summary>
/// <param name="str">The string to match.</param>
/// <param name="substrs">
/// Caller-allocated array that is filled in with the matched portions of
/// <c>str</c>. May be <c>null</c>.
/// </param>
/// <returns>
/// <c>true</c> if <c>str</c> matched this regular expression, <c>false</c>
/// otherwise. When <c>false</c> is returned <c>substrs</c> is left unchanged.
/// </returns>
/// <seealso cref="subspecs"/>
public bool match( string str, string[] substrs )
{
    Match m = exec( str, 0, 0 );
    if ( m == null )
    {
        return false;
    }

    if ( substrs == null )
    {
        return true;
    }

    // Entries below "reported" have index pairs in m.indices; everything
    // from "reported" upward is cleared.
    int reported = System.Math.Min( substrs.Length, npar );
    for ( int n = 0; n < substrs.Length; n++ )
    {
        string piece = null;
        if ( n < reported )
        {
            int from = m.indices[2 * n];
            int to = m.indices[2 * n + 1];
            if ( from >= 0 )
            {
                piece = str.Substring( from, to - from );
            }
        }
        substrs[n] = piece;
    }

    return true;
}

/// <summary>
/// Matches the given string against this regular expression and reports the
/// index ranges matched by the parenthesized subexpressions.
/// <para>
/// Each range runs from the character at the starting index up to, but not
/// including, the character at the ending index.
/// </para>
/// <para>
/// <c>indices[0]</c> and <c>indices[1]</c> receive the range of <c>str</c>
/// that matched the entire regular expression; <c>indices[2]</c> and
/// <c>indices[3]</c> the range matched by the first (leftmost) parenthesized
/// subexpression; <c>indices[n * 2]</c> and <c>indices[n * 2 + 1]</c> the
/// range matched by the <c>n</c>th subexpression.
/// </para>
/// <para>
/// If subexpression <c>n</c> did not take part in the match, both of its
/// entries are <c>-1</c>.
/// </para>
/// <para>
/// The natural length for <c>indices</c> is twice the return value of
/// <c>subspecs</c>. The array may be shorter (only the leading entries are
/// reported), longer (the surplus elements are set to <c>-1</c>), or
/// <c>null</c> (subexpressions are ignored).
/// </para>
/// </summary>
/// <param name="str">The string to match.</param>
/// <param name="indices">
/// Caller-allocated array that is filled in with the matched ranges.
/// May be <c>null</c>.
/// </param>
/// <returns>
/// <c>true</c> if the string matched the regular expression, <c>false</c>
/// otherwise. When <c>false</c> is returned <c>indices</c> is left unchanged.
/// </returns>
/// <seealso cref="subspecs"/>
public bool match( string str, int[] indices )
{
    Match m = exec( str, 0, 0 );
    if ( m == null )
    {
        return false;
    }

    if ( indices == null )
    {
        return true;
    }

    // m.indices holds npar * 2 entries; copy as many as fit and mark the
    // rest of the caller's array as "no match".
    int copied = System.Math.Min( indices.Length, npar * 2 );
    for ( int k = 0; k < indices.Length; k++ )
    {
        indices[k] = ( k < copied ) ? m.indices[k] : -1;
    }

    return true;
}

/// <summary>
/// Matches a string against this regular expression and replaces the first
/// match with the string generated from the substitution parameter.
/// </summary>
/// <param name="str">The string to match against this regular expression.</param>
/// <param name="subspec">
/// The substitution parameter, expanded by <c>applySubspec</c>.
/// </param>
/// <returns>
/// The string formed by replacing the first match in <c>str</c> with the
/// string generated from <c>subspec</c>, or <c>null</c> if no match was found.
/// </returns>
public string sub( string str, string subspec )
{
    Regsub rs = new Regsub( this, str );
    if ( !rs.nextMatch() )
    {
        return null;
    }

    // Text before the match, then the expanded replacement, then the text
    // after the match.
    StringBuilder result = new StringBuilder( rs.skipped() );
    applySubspec( rs, subspec, result );
    result.Append( rs.rest() );
    return result.ToString();
}

/// <summary>
/// Matches a string against this regular expression and replaces all
/// matches with the string generated from the substitution parameter.
/// After each substitution is done, the portions of the string already
/// examined, including the newly substituted region, are not checked
/// again for new matches -- only the rest of the string is examined.
/// </summary>
/// <param name="str">The string to match against this regular expression.</param>
/// <param name="subspec">
/// The substitution parameter, expanded by <c>applySubspec</c>.
/// </param>
/// <returns>
/// The string formed by replacing all the matches in <c>str</c> with the
/// strings generated from <c>subspec</c>. If no matches were found,
/// <c>str</c> itself is returned (see <c>sub(string, Filter)</c>).
/// </returns>
public string subAll( string str, string subspec )
{
    // A SubspecFilter constructed with all == true asks sub() to keep
    // going after every match.
    Filter replaceEvery = new SubspecFilter( subspec, true );
    return sub( str, replaceEvery );
}

607

608

/// <summary>
/// Utility method giving access to the standard substitution algorithm used
/// by <c>sub</c> and <c>subAll</c>. Appends to the string builder the string
/// generated by applying the substitution parameter to the matched region.
/// <para>
/// An unescaped ampersand is replaced by the matched text. A backslash
/// followed by a digit <c>n</c> is replaced by submatch <c>n</c> (nothing is
/// appended if that submatch is <c>null</c>). A backslash followed by any
/// other character yields that character literally. Every other character
/// is copied as is.
/// </para>
/// </summary>
/// <param name="rs">Information about the matched region.</param>
/// <param name="subspec">The substitution parameter.</param>
/// <param name="sb">StringBuilder to which the generated string is appended.</param>
public static void applySubspec( Regsub rs, string subspec, StringBuilder sb )
{
    try
    {
        int len = subspec.Length;
        int i = 0;
        while ( i < len )
        {
            char ch = subspec[i];
            bool escaped = ( ch == '\\' );
            if ( escaped )
            {
                // For a trailing backslash this index is out of range; the
                // resulting exception is handled by the catch below.
                i++;
                ch = subspec[i];
            }

            if ( !escaped && ch == '&' )
            {
                sb.Append( rs.matched() );
            }
            else if ( escaped && ch >= '0' && ch <= '9' )
            {
                string match = rs.submatch( ch - '0' );
                if ( (object)match != null )
                {
                    sb.Append( match );
                }
            }
            else
            {
                sb.Append( ch );
            }

            i++;
        }
    }
    catch ( System.IndexOutOfRangeException )
    {
        /*
        * Ignore malformed substitution pattern.
        * Return string matched so far.
        */
    }
}

/// <summary>
/// Matches a string against this regular expression and lets a
/// <c>Filter</c> generate the replacement for each match.
/// </summary>
/// <param name="str">The string to match against this regular expression.</param>
/// <param name="rf">
/// Called once per match to append the replacement text; returning
/// <c>false</c> stops the search for further matches.
/// </param>
/// <returns>
/// <c>str</c> itself if there is no match at all; otherwise the skipped
/// text and filter output for every processed match, followed by the
/// remainder of the string.
/// </returns>
public string sub( string str, Filter rf )
{
    Regsub rs = new Regsub( this, str );
    bool haveMatch = rs.nextMatch();
    if ( !haveMatch )
    {
        return str;
    }

    StringBuilder result = new StringBuilder();
    while ( haveMatch )
    {
        result.Append( rs.skipped() );
        if ( !rf.filter( rs, result ) )
        {
            break;
        }
        haveMatch = rs.nextMatch();
    }

    result.Append( rs.rest() );
    return result.ToString();
}

698

699

/// <summary>
/// This interface is used by the <c>Regexp</c> class to generate the
/// replacement string for each pattern match found in the source string.
/// </summary>
/// <author> Colin Stevens (colin.stevens@sun.com)
/// </author>
/// <version> 1.7, 99/10/14
/// </version>
public interface Filter
{
    /// <summary>
    /// Given the current state of the match, generate the replacement
    /// string. This method is called for each match found in the source
    /// string, unless this filter decides not to handle any more matches.
    /// <para>
    /// The implementation can use whatever rules it chooses to generate the
    /// replacement string. For example, here is a filter that replaces the
    /// first 5 occurrences of "%XX" in a string with the character
    /// represented by the hex digits "XX":
    /// </para>
    /// <code>
    /// class HexFilter : Regexp.Filter
    /// {
    ///     int count = 5;
    ///
    ///     public bool filter( Regsub rs, StringBuilder sb )
    ///     {
    ///         string match = rs.matched();
    ///         int value = System.Convert.ToInt32( match.Substring( 1, 2 ), 16 );
    ///         sb.Append( (char)value );
    ///         return ( --count > 0 );
    ///     }
    /// }
    ///
    /// Regexp re = new Regexp( "%[a-fA-F0-9][a-fA-F0-9]" );
    /// string result = re.sub( str, new HexFilter() );
    /// </code>
    /// </summary>
    /// <param name="rs">
    /// <c>Regsub</c> containing the state of the current match.
    /// </param>
    /// <param name="sb">
    /// The string builder that this filter should append the generated
    /// string to. It already contains the results the calling <c>Regexp</c>
    /// has generated up to this point.
    /// </param>
    /// <returns>
    /// <c>false</c> if no further matches should be considered in this
    /// string, <c>true</c> to allow <c>Regexp</c> to continue looking for
    /// further matches.
    /// </returns>
    bool filter( Regsub rs, StringBuilder sb );
}

759

760

/// <summary>
/// Filter that expands a fixed substitution parameter with
/// <c>applySubspec</c> for every match it is given.
/// </summary>
private class SubspecFilter : Filter
{
    /// <summary> The substitution parameter applied to each match.</summary>
    internal string subspec;

    /// <summary> Value returned by <c>filter</c>: whether the caller should continue with further matches.</summary>
    internal bool all;

    public SubspecFilter( string subspec, bool all )
    {
        this.all = all;
        this.subspec = subspec;
    }

    /// <summary> Appends the expansion of <c>subspec</c> for the current match, then returns <c>all</c>.</summary>
    public bool filter( Regsub rs, StringBuilder sb )
    {
        Regexp.applySubspec( rs, this.subspec, sb );
        return this.all;
    }
}

/// <summary>
/// Returns a string representation of this compiled regular expression.
/// The format is a symbolic dump of the bytecodes: a short header (number of
/// subexpressions, anchor flag, start character, "must" string) followed by
/// one line per program node.
/// </summary>
/// <returns>A string representation of this regular expression.</returns>
public override string ToString()
{
    StringBuilder sb = new StringBuilder();

    sb.Append( "# subs: " + npar + "\n" );
    sb.Append( "anchor: " + anchored + "\n" );

    // startChar is -1 when no start character is known. Casting that to
    // char would write U+FFFF (or throw in a checked build), so the
    // character is only printed when there is one.
    sb.Append( "start: " );
    if ( startChar >= 0 )
    {
        sb.Append( (char)startChar );
    }
    sb.Append( "\n" );
    sb.Append( "must: " + must + "\n" );

    for ( int i = 0; i < program.Length; )
    {
        sb.Append( i + ":\t" );

        // Opcode name: OPENn / CLOSEn carry their group number in the
        // opcode value; everything else is looked up in opnames.
        int op = program[i];
        if ( op >= CLOSE )
        {
            sb.Append( "CLOSE" + ( op - CLOSE ) );
        }
        else if ( op >= OPEN )
        {
            sb.Append( "OPEN" + ( op - OPEN ) );
        }
        else
        {
            sb.Append( opnames[op] );
        }

        // "next" pointer: relative offset and absolute target. BACK nodes
        // point backwards, all other nodes forwards; 0 means no successor.
        int offset = (int)program[i + 1];
        if ( offset == 0 )
        {
            sb.Append( '\t' );
        }
        else if ( op == BACK )
        {
            sb.Append( "\t-" + offset + "," + ( i - offset ) );
        }
        else
        {
            sb.Append( "\t+" + offset + "," + ( i + offset ) );
        }

        // String operand: one char of length followed by the characters.
        if ( ( op == ANYOF ) || ( op == ANYBUT ) || ( op == EXACTLY ) )
        {
            sb.Append( "\t'" );
            sb.Append( program, i + 3, program[i + 2] );
            sb.Append( "'" );
            i += 3 + program[i + 2];
        }
        else
        {
            i += 2;
        }
        sb.Append( '\n' );
    }

    return sb.ToString();
}

/// <summary>
/// Compiles <paramref name="exp"/> into <c>program</c> and records
/// <c>npar</c>, <c>anchored</c> and <c>startChar</c> for later matching.
/// </summary>
/// <param name="exp">The regular expression pattern.</param>
/// <exception cref="System.ArgumentException">
/// Thrown by the <c>Compiler</c> when the pattern is malformed.
/// </exception>
private void compile( string exp )
{
    Compiler rcstate = new Compiler();
    rcstate.code = new StringBuilder();
    rcstate.parse = exp.ToCharArray();
    rcstate.off = 0;
    rcstate.npar = 1; // slot 0 is the match of the whole pattern

    rcstate.reg( false );

    program = rcstate.code.ToString().ToCharArray();
    npar = rcstate.npar;
    startChar = -1;

    /* optimize */
    // Only when the top-level BRANCH leads straight to END, i.e. the
    // pattern has a single top-level alternative, inspect its first node
    // (at offset 2, right after the BRANCH node).
    bool singleAlternative = ( program[rcstate.regnext( 0 )] == END );
    if ( singleAlternative )
    {
        char firstOp = program[2];
        if ( firstOp == BOL )
        {
            anchored = true;
        }
        else if ( firstOp == EXACTLY )
        {
            // Node layout: opcode, next, length, characters...
            startChar = (int)program[5];
        }
    }

    /*
    * If there's something expensive in the r.e., find the
    * longest literal string that must appear and make it the
    * regmust. Resolve ties in favor of later strings, since
    * the regstart check works with the beginning of the r.e.
    * and avoiding duplication strengthens checking. Not a
    * strong reason, but sufficient in the absence of others.
    */
    /*
    if ((rcstate.flagp & Compiler.SPSTART) != 0) {
        int index = -1;
        int longest = 0;
        for (scan = 0; scan < program.length; ) {
            switch (program[scan]) {
            case EXACTLY:
                int length = program[scan + 2];
                if (length > longest) {
                    index = scan;
                    longest = length;
                }
                // fall through;
            case ANYOF:
            case ANYBUT:
                scan += 3 + program[scan + 2];
                break;
            default:
                scan += 2;
                break;
            }
        }
        if (longest > 0) {
            must = new String(program, index + 3, longest);
        }
    }*/
}

/// <summary>
/// Runs the compiled program against <paramref name="str"/>.
/// </summary>
/// <param name="str">
/// The subject string. It is lower-cased first when <c>ignoreCase</c> is set.
/// </param>
/// <param name="start">
/// Stored as <c>Match.bol</c>, the position treated as beginning of line for "^".
/// </param>
/// <param name="off">The first position at which a match is attempted.</param>
/// <returns>
/// The <c>Match</c> holding the matched index ranges, or <c>null</c> if no
/// attempt succeeded.
/// </returns>
internal Match exec( string str, int start, int off )
{
    if ( ignoreCase )
    {
        str = str.ToLower();
    }

    Match match = new Match();
    match.program = program;
    match.indices = new int[npar * 2];

    /* Mark beginning of line for ^ . */
    match.str = str;
    match.bol = start;
    match.length = str.Length;

    if ( anchored )
    {
        /* Simplest case: anchored match need be tried only once. */
        return match.regtry( off ) ? match : null;
    }

    if ( startChar >= 0 )
    {
        /* We know what char it must start with. */
        int candidate = off;
        while ( candidate < match.length
            && ( candidate = str.IndexOf( (System.Char)startChar, candidate ) ) >= 0 )
        {
            if ( match.regtry( candidate ) )
            {
                return match;
            }
            candidate++;
        }
        return null;
    }

    /* Messy cases: unanchored match. */
    // Every position from off up to and including the end of the string is
    // tried; the end position is needed for patterns that match "".
    for ( int at = off; ; at++ )
    {
        if ( match.regtry( at ) )
        {
            return match;
        }
        if ( at >= match.length )
        {
            break;
        }
    }

    return null;
}

internal class Compiler

967

{

968

/// <summary> The pattern being compiled, one element per character.</summary>
internal char[] parse;

/// <summary> Index into <c>parse</c> of the next character to consume.</summary>
internal int off;

/// <summary>
/// Number of capture slots allocated so far. <c>reg</c> increments it for
/// every parenthesized group it opens.
/// </summary>
internal int npar;

/// <summary> The program emitted so far; one char per opcode, pointer or operand character.</summary>
internal StringBuilder code;

/// <summary>
/// Flags (WORST / HASWIDTH / SIMPLE / SPSTART) describing the construct that
/// was parsed last; each parsing routine stores its result here on return.
/// </summary>
internal int flagp;

/// <summary> Characters that end a run of ordinary literal characters in <c>regatom</c>.</summary>
internal const string META = "^$.[()|?+*\\";

/// <summary>
/// The repetition operators. NOTE(review): not referenced by the code
/// visible here; <c>isMult</c> compares against the characters directly.
/// </summary>
internal const string MULT = "*+?";

internal const int WORST = 0; /* Worst case. */
internal const int HASWIDTH = 1; /* Known never to match null string. */
internal const int SIMPLE = 2; /* Simple enough to be STAR/PLUS operand. */
internal const int SPSTART = 4; /* Starts with * or +. */

982

983

/*
- reg - regular expression, i.e. main body or parenthesized thing
*
* Caller must absorb opening parenthesis.
*
* Combining parenthesis handling with the base level of regular expression
* is a trifle forced, but the need to tie the tails of the branches to what
* follows makes it hard to avoid.
*
* Returns the position of the first emitted node and leaves the flags of
* the whole expression in flagp. Throws ArgumentException for too many
* groups, a missing ")" or an unexpected ")".
*/
internal int reg( bool paren )
{
    int netFlags = HASWIDTH;
    int parno = 0;
    int ret = -1;

    if ( paren )
    {
        parno = npar++;
        if ( npar >= Regexp.NSUBEXP )
        {
            throw new System.ArgumentException( "too many ()" );
        }
        ret = regnode( (char)( Regexp.OPEN + parno ) );
    }

    /* Pick up the branches, linking them together. */
    while ( true )
    {
        int br = regbranch();
        if ( ret < 0 )
        {
            // First branch of an unparenthesized expression starts the chain.
            ret = br;
        }
        else
        {
            regtail( ret, br );
        }

        // The expression has width only if every branch has width.
        if ( ( flagp & HASWIDTH ) == 0 )
        {
            netFlags &= ~HASWIDTH;
        }
        netFlags |= ( flagp & SPSTART );

        bool anotherBranch = ( off < parse.Length ) && ( parse[off] == '|' );
        if ( !anotherBranch )
        {
            break;
        }
        off++;
    }

    /* Make a closing node, and hook it on the end. */
    int ender = regnode( paren ? (char)( Regexp.CLOSE + parno ) : Regexp.END );
    regtail( ret, ender );

    /* Hook the tails of the branches to the closing node. */
    int node = ret;
    while ( node >= 0 )
    {
        regoptail( node, ender );
        node = regnext( node );
    }

    /* Check for proper termination. */
    if ( paren )
    {
        if ( ( off >= parse.Length ) || ( parse[off++] != ')' ) )
        {
            throw new System.ArgumentException( "missing )" );
        }
    }
    else if ( off < parse.Length )
    {
        throw new System.ArgumentException( "unexpected )" );
    }

    flagp = netFlags;
    return ret;
}

/*
- regbranch - one alternative of an | operator
*
* Implements the concatenation operator: emits a BRANCH node followed by
* the pieces of the alternative chained through their "next" pointers.
* Returns the position of the BRANCH node.
*/
internal int regbranch()
{
    int head = regnode( Regexp.BRANCH );
    int flags = WORST; /* Tentatively. */
    int previous = -1;

    while ( off < parse.Length )
    {
        char upcoming = parse[off];
        if ( ( upcoming == '|' ) || ( upcoming == ')' ) )
        {
            break;
        }

        int piece = regpiece();
        flags |= flagp & HASWIDTH;
        if ( previous >= 0 )
        {
            regtail( previous, piece );
        }
        else
        {
            /* First piece. */
            flags |= ( flagp & SPSTART );
        }
        previous = piece;
    }

    if ( previous < 0 )
    {
        /* Loop ran zero times. */
        regnode( Regexp.NOTHING );
    }

    flagp = flags;
    return head;
}

/*
- regpiece - something followed by possible [*+?]
*
* Note that the branching code sequences used for ? and the general cases
* of * and + are somewhat optimized: they use the same NOTHING node as
* both the endmarker for their branch list and the body of the last branch.
* It might seem that this node could be dispensed with entirely, but the
* endmarker role is not redundant.
*
* Returns the position of the piece's first node. Throws ArgumentException
* when * or + is applied to an operand that may be empty, or when
* repetition operators are stacked.
*/
internal int regpiece()
{
    int ret = regatom();

    bool repeated = ( off < parse.Length ) && isMult( parse[off] );
    if ( !repeated )
    {
        return ret;
    }

    char op = parse[off];

    // Flags of the atom just parsed; none of the emit helpers below
    // changes flagp, so these can be read once up front.
    bool atomHasWidth = ( flagp & HASWIDTH ) != 0;
    bool atomIsSimple = ( flagp & SIMPLE ) != 0;

    if ( !atomHasWidth && ( op != '?' ) )
    {
        throw new System.ArgumentException( "*+ operand could be empty" );
    }

    int netFlags = ( op == '+' ) ? ( WORST | HASWIDTH ) : ( WORST | SPSTART );

    switch ( op )
    {
        case '*':
            if ( atomIsSimple )
            {
                reginsert( Regexp.STAR, ret );
            }
            else
            {
                /* Emit x* as (x&|), where & means "self". */
                reginsert( Regexp.BRANCH, ret ); /* Either x */
                regoptail( ret, regnode( Regexp.BACK ) ); /* and loop */
                regoptail( ret, ret ); /* back */
                regtail( ret, regnode( Regexp.BRANCH ) ); /* or */
                regtail( ret, regnode( Regexp.NOTHING ) ); /* null. */
            }
            break;

        case '+':
            if ( atomIsSimple )
            {
                reginsert( Regexp.PLUS, ret );
            }
            else
            {
                /* Emit x+ as x(&|), where & means "self". */
                int either = regnode( Regexp.BRANCH ); /* Either */
                regtail( ret, either );
                regtail( regnode( Regexp.BACK ), ret ); /* loop back */
                regtail( either, regnode( Regexp.BRANCH ) ); /* or */
                regtail( ret, regnode( Regexp.NOTHING ) ); /* null. */
            }
            break;

        case '?':
            {
                /* Emit x? as (x|) */
                reginsert( Regexp.BRANCH, ret ); /* Either x */
                regtail( ret, regnode( Regexp.BRANCH ) ); /* or */
                int nothing = regnode( Regexp.NOTHING ); /* null. */
                regtail( ret, nothing );
                regoptail( ret, nothing );
            }
            break;
    }

    off++;
    if ( ( off < parse.Length ) && isMult( parse[off] ) )
    {
        throw new System.ArgumentException( "nested *?+" );
    }

    flagp = netFlags;
    return ret;
}

/*
- regatom - the lowest level
*
* Optimization: gobbles an entire sequence of ordinary characters so that
* it can turn them into a single node, which is smaller to store and
* faster to run. Backslashed characters are exceptions, each becoming a
* separate node; the code is simpler that way and it's not worth fixing.
*
* Returns the position of the emitted node and leaves its flags in flagp.
* Throws ArgumentException for malformed input ("missing ]",
* "invalid [] range", "?+* follows nothing", "trailing \").
*/
internal int regatom()
{
    int netFlags = WORST; /* Tentatively. */
    int ret;

    char lead = parse[off++];
    switch ( lead )
    {
        case '^':
            ret = regnode( Regexp.BOL );
            break;

        case '$':
            ret = regnode( Regexp.EOL );
            break;

        case '.':
            ret = regnode( Regexp.ANY );
            netFlags |= ( HASWIDTH | SIMPLE );
            break;

        case '[':
            {
                // Running off the end of the pattern anywhere in here means
                // the closing bracket is missing; the indexer's exception is
                // translated into the "missing ]" error below.
                try
                {
                    bool negated = ( parse[off] == '^' );
                    if ( negated )
                    {
                        off++;
                    }
                    ret = regnode( negated ? Regexp.ANYBUT : Regexp.ANYOF );

                    // Reserve the length slot; it is patched once the set
                    // is complete.
                    int lengthSlot = reglen();
                    regc( '\0' );

                    // A leading ']' or '-' is an ordinary set member.
                    if ( ( parse[off] == ']' ) || ( parse[off] == '-' ) )
                    {
                        regc( parse[off++] );
                    }

                    while ( parse[off] != ']' )
                    {
                        if ( parse[off] != '-' )
                        {
                            regc( parse[off] );
                            off++;
                            continue;
                        }

                        off++;
                        if ( parse[off] == ']' )
                        {
                            // Trailing '-' is literal.
                            regc( '-' );
                            continue;
                        }

                        // Range: the lower bound was already emitted as an
                        // ordinary member, so emit lower+1 .. upper.
                        int lower = parse[off - 2];
                        int upper = parse[off++];
                        if ( lower > upper )
                        {
                            throw new System.ArgumentException( "invalid [] range" );
                        }
                        for ( int member = lower + 1; member <= upper; member++ )
                        {
                            regc( (char)member );
                        }
                    }

                    regset( lengthSlot, (char)( reglen() - lengthSlot - 1 ) );
                    off++; // step over ']'
                    netFlags |= HASWIDTH | SIMPLE;
                }
                catch ( System.IndexOutOfRangeException )
                {
                    throw new System.ArgumentException( "missing ]" );
                }
                break;
            }

        case '(':
            ret = reg( true );
            netFlags |= ( flagp & ( HASWIDTH | SPSTART ) );
            break;

        case '|':
        case ')':
            throw new System.ArgumentException( "internal urp" );

        case '?':
        case '+':
        case '*':
            throw new System.ArgumentException( "?+* follows nothing" );

        case '\\':
            if ( off >= parse.Length )
            {
                throw new System.ArgumentException( "trailing \\" );
            }
            ret = regnode( Regexp.EXACTLY );
            regc( (char)1 );
            regc( parse[off++] );
            netFlags |= HASWIDTH | SIMPLE;
            break;

        default:
            {
                off--; // the lead character is part of the literal run

                // Find the end of the run of non-meta characters.
                int end = off;
                while ( ( end < parse.Length ) && ( META.IndexOf( (System.Char)parse[end] ) < 0 ) )
                {
                    end++;
                }

                if ( ( end > off + 1 ) && ( end < parse.Length ) && isMult( parse[end] ) )
                {
                    end--; /* Back off clear of ?+* operand. */
                }

                netFlags |= HASWIDTH;
                if ( end == off + 1 )
                {
                    netFlags |= SIMPLE;
                }

                ret = regnode( Regexp.EXACTLY );
                regc( (char)( end - off ) );
                while ( off < end )
                {
                    regc( parse[off] );
                    off++;
                }
                break;
            }
    }

    flagp = netFlags;
    return ret;
}

/*
- regnode - emit a node
*
* Appends the opcode and a zeroed "next" pointer to the program and
* returns the position of the opcode.
*/
internal int regnode( char op )
{
    int position = code.Length;
    code.Append( op ).Append( '\0' );
    return position;
}

/*
- regc - emit (if appropriate) a byte of code
*
* Appends one operand character to the program.
*/
internal void regc( char b )
{
    code.Append( b, 1 );
}

/* reglen - number of chars emitted so far, i.e. the position of the next char */
internal int reglen()
{
    int emitted = code.Length;
    return emitted;
}

1341

1342

/* regset - overwrite the already-emitted char at pos (used to patch a length slot) */
internal void regset( int pos, char ch )
{
    this.code[pos] = ch;
}

/*
- reginsert - insert an operator in front of already-emitted operand
*
* Means relocating the operand: a two-char node (opcode plus zeroed
* "next" pointer) is inserted at pos and everything from pos onwards
* shifts up by two.
*/
internal void reginsert( char op, int pos )
{
    // Insert the pointer first, then the opcode in front of it.
    code.Insert( pos, '\0' );
    code.Insert( pos, op );
}

/*
- regtail - set the next-pointer at the end of a node chain
*
* Follows the "next" pointers starting at pos until a node without a
* successor is found, then points that node at val. The pointer is stored
* as a relative offset (backwards for BACK nodes, forwards otherwise) in a
* single char, so an offset that does not fit into a char cannot be
* represented; in that case an ArgumentException is thrown instead of
* silently storing a truncated offset and producing a corrupt program.
*/
internal void regtail( int pos, int val )
{
    /* Find last node. */
    int scan = pos;
    while ( true )
    {
        int tmp = regnext( scan );
        if ( tmp < 0 )
        {
            break;
        }
        scan = tmp;
    }

    int offset = ( code[scan] == sunlabs.brazil.util.regexp.Regexp.BACK ) ? scan - val : val - scan;
    if ( ( offset < char.MinValue ) || ( offset > char.MaxValue ) )
    {
        throw new System.ArgumentException( "regexp too big" );
    }
    code[scan + 1] = (char)offset;
}

/*
- regoptail - regtail on operand of first argument; nop if operandless
*
* Only BRANCH nodes are treated as having an operand; the operand starts
* right after the two-char node header.
*/
internal void regoptail( int pos, int val )
{
    if ( ( pos >= 0 ) && ( code[pos] == Regexp.BRANCH ) )
    {
        regtail( pos + 2, val );
    }
}

/*
- regnext - dig the "next" pointer out of a node
*
* Returns the absolute position of the node that follows the node at pos,
* or -1 if its pointer is 0 (no successor). BACK nodes point backwards,
* all other nodes forwards.
*/
internal int regnext( int pos )
{
    int offset = code[pos + 1];
    if ( offset == 0 )
    {
        return -1;
    }

    bool pointsBackwards = ( code[pos] == Regexp.BACK );
    return pointsBackwards ? pos - offset : pos + offset;
}

/// <summary>
/// Tells whether a char is one of the repetition operators '*', '+' or '?'.
/// </summary>
/// <param name="ch">The char to classify.</param>
/// <returns><c>true</c> for '*', '+' and '?'; <c>false</c> for any other char.</returns>
internal static bool isMult( char ch )
{
    switch ( ch )
    {
        case '*':
        case '+':
        case '?':
            return true;

        default:
            return false;
    }
}

}

/// <summary>
/// State of one matching attempt: the compiled program, the subject string,
/// the current input position and the recorded sub-match positions.
/// </summary>
internal class Match
{
    /// <summary> The compiled regexp program that is executed.</summary>
    internal char[] program;

    /// <summary> The string the program is matched against.</summary>
    internal string str;

    /// <summary> Input position at which the BOL opcode succeeds.</summary>
    internal int bol;

    /// <summary> Current position in <c>str</c>; advanced as nodes match.</summary>
    internal int input;

    /// <summary> Input position at which the EOL opcode succeeds; no char at or beyond it is consumed.</summary>
    internal int length;

    /// <summary>
    /// Recorded positions: entry 2*n receives the start and entry 2*n+1 the
    /// end of group n, group 0 being the whole match.
    /// </summary>
    internal int[] indices;

    /// <summary>
    /// Attempts a match of the whole program starting at one input position.
    /// </summary>
    /// <param name="off">Input position at which the attempt starts.</param>
    /// <returns>
    /// <c>true</c> when the program matched; <c>indices[0]</c> and
    /// <c>indices[1]</c> then hold the start and end of the match.
    /// </returns>
    internal bool regtry( int off )
    {
        input = off;

        /* Forget everything recorded by a previous attempt. */
        for ( int slot = 0; slot < indices.Length; slot++ )
        {
            indices[slot] = -1;
        }

        if ( !regmatch( 0 ) )
        {
            return false;
        }

        indices[0] = off;
        indices[1] = input;
        return true;
    }

    /*
     - regmatch - main matching routine
     *
     * Conceptually the strategy is simple: check to see whether the current
     * node matches, call self recursively to see whether the rest matches,
     * and then act accordingly. In practice we make some effort to avoid
     * recursion, in particular by going through "ordinary" nodes (that don't
     * need to know whether the rest of the match failed) by a loop instead of
     * by recursion.
     */
    internal bool regmatch( int scan )
    {
        int node = scan;

        while ( true )
        {
            int next = regnext( node );
            int op = program[node];

            switch ( op )
            {
                case sunlabs.brazil.util.regexp.Regexp.BOL:
                    if ( input != bol )
                    {
                        return false;
                    }
                    break;

                case sunlabs.brazil.util.regexp.Regexp.EOL:
                    if ( input != length )
                    {
                        return false;
                    }
                    break;

                case sunlabs.brazil.util.regexp.Regexp.ANY:
                    if ( input >= length )
                    {
                        return false;
                    }
                    input++;
                    break;

                case sunlabs.brazil.util.regexp.Regexp.EXACTLY:
                    if ( !compare( node ) )
                    {
                        return false;
                    }
                    break;

                case sunlabs.brazil.util.regexp.Regexp.ANYOF:
                    if ( ( input >= length ) || !present( node ) )
                    {
                        return false;
                    }
                    input++;
                    break;

                case sunlabs.brazil.util.regexp.Regexp.ANYBUT:
                    if ( ( input >= length ) || present( node ) )
                    {
                        return false;
                    }
                    input++;
                    break;

                case sunlabs.brazil.util.regexp.Regexp.NOTHING:
                case sunlabs.brazil.util.regexp.Regexp.BACK:
                    break;

                case sunlabs.brazil.util.regexp.Regexp.BRANCH:
                    if ( program[next] == sunlabs.brazil.util.regexp.Regexp.BRANCH )
                    {
                        return matchBranches( node );
                    }

                    /* Only one alternative: no need to recurse, just step into its operand. */
                    next = node + 2;
                    break;

                case sunlabs.brazil.util.regexp.Regexp.STAR:
                case sunlabs.brazil.util.regexp.Regexp.PLUS:
                    return matchRepeat( op, node, next );

                case sunlabs.brazil.util.regexp.Regexp.END:
                    return true;

                default:
                    return matchParen( op, next );
            }

            node = next;
        }
    }

    /// <summary>
    /// Tries each BRANCH of a chain in turn, restoring the input position
    /// after every failed alternative.
    /// </summary>
    private bool matchBranches( int branch )
    {
        do
        {
            int mark = input;
            if ( regmatch( branch + 2 ) )
            {
                return true;
            }

            input = mark;
            branch = regnext( branch );
        }
        while ( ( branch >= 0 ) && ( program[branch] == sunlabs.brazil.util.regexp.Regexp.BRANCH ) );

        return false;
    }

    /// <summary>
    /// Handles STAR and PLUS: consumes as many repetitions as possible, then
    /// gives them back one at a time until the rest of the program matches.
    /// </summary>
    private bool matchRepeat( int op, int scan, int next )
    {
        /*
         * Lookahead to avoid useless match attempts
         * when we know what character comes next.
         */
        int lookahead = -1;
        if ( program[next] == sunlabs.brazil.util.regexp.Regexp.EXACTLY )
        {
            lookahead = program[next + 3];
        }

        int fewest = ( op == sunlabs.brazil.util.regexp.Regexp.STAR ) ? 0 : 1;
        int start = input;

        /* After each failed attempt, back up by one repetition. */
        for ( int taken = regrepeat( scan + 2 ); taken >= fewest; taken--, input = start + taken )
        {
            bool worthTrying = ( lookahead < 0 )
                || ( ( input < length ) && ( str[input] == lookahead ) );

            if ( worthTrying && regmatch( next ) )
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Handles the numbered OPEN/CLOSE opcodes: matches the rest of the
    /// program and, on success, records the position at which the
    /// parenthesis was seen. Any other opcode fails.
    /// </summary>
    private bool matchParen( int op, int next )
    {
        bool closing = op >= sunlabs.brazil.util.regexp.Regexp.CLOSE;
        if ( !closing && ( op < sunlabs.brazil.util.regexp.Regexp.OPEN ) )
        {
            return false;
        }

        int slot = closing
            ? ( op - sunlabs.brazil.util.regexp.Regexp.CLOSE ) * 2 + 1
            : ( op - sunlabs.brazil.util.regexp.Regexp.OPEN ) * 2;
        int mark = input;

        if ( !regmatch( next ) )
        {
            return false;
        }

        /*
         * Don't record the position if some later invocation of the same
         * parentheses already has. Note that a stored 0 is treated like
         * the -1 "unset" marker here.
         */
        if ( indices[slot] <= 0 )
        {
            indices[slot] = mark;
        }

        return true;
    }

    /// <summary>
    /// Matches the string operand of the node at <paramref name="scan"/>
    /// against the input, advancing <c>input</c> for every char compared.
    /// </summary>
    /// <param name="scan">Position of the node; its operand is a length char followed by the chars.</param>
    /// <returns><c>true</c> when every operand char matched.</returns>
    internal bool compare( int scan )
    {
        int wanted = program[scan + 2];
        if ( input + wanted > length )
        {
            return false;
        }

        int cursor = scan + 3;
        int stop = cursor + wanted;
        while ( cursor < stop )
        {
            if ( str[input++] != program[cursor++] )
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Tells whether the char at the current input position occurs in the
    /// string operand of the node at <paramref name="scan"/>.
    /// The input position is not changed.
    /// </summary>
    /// <param name="scan">Position of the node; its operand is a length char followed by the chars.</param>
    /// <returns><c>true</c> when the current input char is among the operand chars.</returns>
    internal bool present( int scan )
    {
        char candidate = str[input];

        int setSize = program[scan + 2];
        int first = scan + 3;

        for ( int k = 0; k < setSize; k++ )
        {
            if ( program[first + k] == candidate )
            {
                return true;
            }
        }

        return false;
    }

    /*
     - regrepeat - repeatedly match something simple, report how many
     */
    internal int regrepeat( int scan )
    {
        int op = program[scan];
        int origin = input;

        switch ( op )
        {
            case sunlabs.brazil.util.regexp.Regexp.ANY:
                /* Takes everything that is left. */
                input = length;
                break;

            case sunlabs.brazil.util.regexp.Regexp.EXACTLY:
            {
                // 'g*' matches all the following 'g' characters.
                char repeated = program[scan + 3];
                while ( ( input < length ) && ( str[input] == repeated ) )
                {
                    input++;
                }
                break;
            }

            case sunlabs.brazil.util.regexp.Regexp.ANYOF:
                while ( ( input < length ) && present( scan ) )
                {
                    input++;
                }
                break;

            case sunlabs.brazil.util.regexp.Regexp.ANYBUT:
                while ( ( input < length ) && !present( scan ) )
                {
                    input++;
                }
                break;
        }

        /* Every repetition consumed exactly one char, so the distance travelled is the count. */
        return input - origin;
    }

    /*
     - regnext - dig the "next" pointer out of a node
     */
    internal int regnext( int scan )
    {
        int distance = program[scan + 1];

        return ( program[scan] == sunlabs.brazil.util.regexp.Regexp.BACK )
            ? scan - distance
            : scan + distance;
    }
}

}

}

wasCSharpSQLite – Blame information for rev 4