wasCSharpSQLite – Blame information for rev 4

Subversion Repositories:
Rev:
Rev Author Line No. Line
1 office 1 /*
2 * Regexp.java
3 *
4 * Copyright (c) 1999 Sun Microsystems, Inc.
5 *
6 * See the file "license.terms" for information on usage and
7 * redistribution of this file, and for a DISCLAIMER OF ALL
8 * WARRANTIES.
9 *
10 * SCCS: %Z% %M% %I% %E% %U%
11 */
12 // Included in SQLite3 port to C# for use in testharness only; 2008 Noah B Hart
13 //$Header$
14  
15 using System;
16 using System.Text;
17 namespace sunlabs.brazil.util.regexp
18 {
19  
20 /// <summary> The <code>Regexp</code> class can be used to match a pattern against a
21 /// string and optionally replace the matched parts with new strings.
22 /// <p>
23 /// Regular expressions were implemented by translating Henry Spencer's
24 /// regular expression package for <a href="http://www.scriptics.com">tcl8.0</a>.
25 /// Much of the description below is copied verbatim from the tcl8.0 regsub
26 /// manual entry.
27 /// <hr>
28 /// REGULAR EXPRESSIONS
29 /// <p>
30 /// A regular expression is zero or more <code>branches</code>, separated by
31 /// "|". It matches anything that matches one of the branches.
32 /// <p>
33 /// A branch is zero or more <code>pieces</code>, concatenated.
34 /// It matches a match for the first piece, followed by a match for the
35 /// second piece, etc.
36 /// <p>
37 /// A piece is an <code>atom</code>, possibly followed by "*", "+", or
38 /// "?". <ul>
39 /// <li> An atom followed by "*" matches a sequence of 0 or more matches of
40 /// the atom.
41 /// <li> An atom followed by "+" matches a sequence of 1 or more matches of
42 /// the atom.
43 /// <li> An atom followed by "?" matches either 0 or 1 matches of the atom.
44 /// </ul>
45 /// <p>
46 /// An atom is <ul>
47 /// <li> a regular expression in parentheses (matching a match for the
48 /// regular expression)
49 /// <li> a <code>range</code> (see below)
50 /// <li> "." (matching any single character)
51 /// <li> "^" (matching the null string at the beginning of the input string)
52 /// <li> "$" (matching the null string at the end of the input string)
53 /// <li> a "\" followed by a single character (matching that character)
54 /// <li> a single character with no other significance (matching that
55 /// character).
56 /// </ul>
57 /// <p>
58 /// A <code>range</code> is a sequence of characters enclosed in "[]".
59 /// The range normally matches any single character from the sequence.
60 /// If the sequence begins with "^", the range matches any single character
61 /// <b>not</b> from the rest of the sequence.
62 /// If two characters in the sequence are separated by "-", this is shorthand
63 /// for the full list of characters between them (e.g. "[0-9]" matches any
64 /// decimal digit). To include a literal "]" in the sequence, make it the
65 /// first character (following a possible "^"). To include a literal "-",
66 /// make it the first or last character.
67 /// <p>
68 /// In general there may be more than one way to match a regular expression
69 /// to an input string. For example, consider the command
70 /// <pre>
71 /// String[] match = new String[2];
72 /// Regexp.match("(a*)b*", "aabaaabb", match);
73 /// </pre>
74 /// Considering only the rules given so far, <code>match[0]</code> and
75 /// <code>match[1]</code> could end up with the values <ul>
76 /// <li> "aabb" and "aa"
77 /// <li> "aaab" and "aaa"
78 /// <li> "ab" and "a"
79 /// </ul>
80 /// or any of several other combinations. To resolve this potential ambiguity,
81 /// Regexp chooses among alternatives using the rule "first then longest".
82 /// In other words, it considers the possible matches in order working
83 /// from left to right across the input string and the pattern, and it
84 /// attempts to match longer pieces of the input string before shorter
85 /// ones. More specifically, the following rules apply in decreasing
86 /// order of priority: <ol>
87 /// <li> If a regular expression could match two different parts of an input
88 /// string then it will match the one that begins earliest.
89 /// <li> If a regular expression contains "|" operators then the
90 /// leftmost matching sub-expression is chosen.
91 /// <li> In "*", "+", and "?" constructs, longer matches are chosen in
92 /// preference to shorter ones.
93 /// <li>
94 /// In sequences of expression components the components are considered
95 /// from left to right.
96 /// </ol>
97 /// <p>
98 /// In the example from above, "(a*)b*" therefore matches exactly "aab"; the
99 /// "(a*)" portion of the pattern is matched first and it consumes the leading
100 /// "aa", then the "b*" portion of the pattern consumes the next "b". Or,
101 /// consider the following example:
102 /// <pre>
103 /// String match = new String[3];
104 /// Regexp.match("(ab|a)(b*)c", "abc", match);
105 /// </pre>
106 /// After this command, <code>match[0]</code> will be "abc",
107 /// <code>match[1]</code> will be "ab", and <code>match[2]</code> will be an
108 /// empty string.
109 /// Rule 4 specifies that the "(ab|a)" component gets first shot at the input
110 /// string and Rule 2 specifies that the "ab" sub-expression
111 /// is checked before the "a" sub-expression.
112 /// Thus the "b" has already been claimed before the "(b*)"
113 /// component is checked and therefore "(b*)" must match an empty string.
114 /// <hr>
115 /// <a name=regsub></a>
116 /// REGULAR EXPRESSION SUBSTITUTION
117 /// <p>
118 /// Regular expression substitution matches a string against a regular
119 /// expression, transforming the string by replacing the matched region(s)
120 /// with new substring(s).
121 /// <p>
122 /// What gets substituted into the result is controlled by a
123 /// <code>subspec</code>. The subspec is a formatting string that specifies
124 /// what portions of the matched region should be substituted into the
125 /// result.
126 /// <ul>
127 /// <li> "&" or "\0" is replaced with a copy of the entire matched region.
128 /// <li> "\<code>n</code>", where <code>n</code> is a digit from 1 to 9,
129 /// is replaced with a copy of the <code>n</code><i>th</i> subexpression.
130 /// <li> "\&" or "\\" are replaced with just "&" or "\" to escape their
131 /// special meaning.
132 /// <li> any other character is passed through.
133 /// </ul>
134 /// In the above, strings like "\2" represents the two characters
135 /// <code>backslash</code> and "2", not the Unicode character 0002.
136 /// <hr>
137 /// Here is an example of how to use Regexp
138 /// <pre>
139 ///
140 /// public static void
141 /// main(String[] args)
142 /// throws Exception
143 /// {
144 /// Regexp re;
145 /// String[] matches;
146 /// String s;
147 ///
148 /// &#47;*
149 /// * A regular expression to match the first line of a HTTP request.
150 /// *
151 /// * 1. ^ - starting at the beginning of the line
152 /// * 2. ([A-Z]+) - match and remember some upper case characters
153 /// * 3. [ \t]+ - skip blank space
154 /// * 4. ([^ \t]*) - match and remember up to the next blank space
155 /// * 5. [ \t]+ - skip more blank space
156 /// * 6. (HTTP/1\\.[01]) - match and remember HTTP/1.0 or HTTP/1.1
157 /// * 7. $ - end of string - no chars left.
158 /// *&#47;
159 ///
160 /// s = "GET http://a.b.com:1234/index.html HTTP/1.1";
161 ///
162 /// Regexp re = new Regexp("^([A-Z]+)[ \t]+([^ \t]+)[ \t]+(HTTP/1\\.[01])$");
163 /// String[] matches = new String[4];
164 /// if (re.match(s, matches)) {
165 /// System.out.println("METHOD " + matches[1]);
166 /// System.out.println("URL " + matches[2]);
167 /// System.out.println("VERSION " + matches[3]);
168 /// }
169 ///
170 /// &#47;*
171 /// * A regular expression to extract some simple comma-separated data,
172 /// * reorder some of the columns, and discard column 2.
173 /// *&#47;
174 ///
175 /// s = "abc,def,ghi,klm,nop,pqr";
176 ///
177 /// Regexp re = new Regexp("^([^,]+),([^,]+),([^,]+),(.*)");
178 /// System.out.println(re.sub(s, "\\3,\\1,\\4"));
179 /// }
180 /// </pre>
181 ///
182 /// </summary>
183 /// <author> Colin Stevens (colin.stevens@sun.com)
184 /// </author>
185 /// <version> 1.7, 99/10/14
186 /// </version>
187 /// <seealso cref="Regsub">
188 /// </seealso>
189  
190 public class Regexp
191 {
192 //[STAThread]
193 //public static void Main(string[] args)
194 //{
195 // if ((args.Length == 2) && (args[0].Equals("compile")))
196 // {
197 // System.Diagnostics.Debug.WriteLine(new Regexp(args[1]));
198 // }
199 // else if ((args.Length == 3) && (args[0].Equals("match")))
200 // {
201 // Regexp r = new Regexp(args[1]);
202 // string[] substrs = new string[r.subspecs()];
203 // bool match = r.match(args[2], substrs);
204 // System.Diagnostics.Debug.WriteLine("match:\t" + match);
205 // for (int i = 0; i < substrs.Length; i++)
206 // {
207 // System.Diagnostics.Debug.WriteLine((i + 1) + ":\t" + substrs[i]);
208 // }
209 // }
210 // else if ((args.Length == 4) && (args[0].Equals("sub")))
211 // {
212 // Regexp r = new Regexp(args[1]);
213 // System.Diagnostics.Debug.WriteLine(r.subAll(args[2], args[3]));
214 // }
215 // else
216 // {
217 // System.Diagnostics.Debug.WriteLine("usage:");
218 // System.Diagnostics.Debug.WriteLine("\tRegexp match <pattern> <string>");
219 // System.Diagnostics.Debug.WriteLine("\tRegexp sub <pattern> <string> <subspec>");
220 // System.Diagnostics.Debug.WriteLine("\tRegexp compile <pattern>");
221 // }
222 //}
223  
224 /*
225 * Structure for regexp "program". This is essentially a linear encoding
226 * of a nondeterministic finite-state machine (aka syntax charts or
227 * "railroad normal form" in parsing technology). Each node is an opcode
228 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
229 * all nodes except BRANCH implement concatenation; a "next" pointer with
230 * a BRANCH on both ends of it is connecting two alternatives. (Here we
231 * have one of the subtle syntax dependencies: an individual BRANCH (as
232 * opposed to a collection of them) is never concatenated with anything
233 * because of operator precedence.) The operand of some types of node is
234 * a literal string; for others, it is a node leading into a sub-FSM. In
235 * particular, the operand of a BRANCH node is the first node of the branch.
236 * (NB this is *not* a tree structure: the tail of the branch connects
237 * to the thing following the set of BRANCHes.) The opcodes are:
238 */
239  
240 internal const int NSUBEXP = 100;
241  
242 /* definition number opnd? meaning */
243  
244 internal const char END = (char)( 0 ); /* no End of program. */
245 internal const char BOL = (char)( 1 ); /* no Match "" at beginning of line. */
246 internal const char EOL = (char)( 2 ); /* no Match "" at end of line. */
247 internal const char ANY = (char)( 3 ); /* no Match any one character. */
248 internal const char ANYOF = (char)( 4 ); /* str Match any character in this string. */
249 internal const char ANYBUT = (char)( 5 ); /* str Match any character not in this string. */
250 internal const char BRANCH = (char)( 6 ); /* node Match this alternative, or the next... */
251 internal const char BACK = (char)( 7 ); /* no Match "", "next" ptr points backward. */
252 internal const char EXACTLY = (char)( 8 ); /* str Match this string. */
253 internal const char NOTHING = (char)( 9 ); /* no Match empty string. */
254 internal const char STAR = (char)( 10 ); /* node Match this (simple) thing 0 or more times. */
255 internal const char PLUS = (char)( 11 ); /* node Match this (simple) thing 1 or more times. */
256 internal const char OPEN = (char)( 20 ); /* no Mark this point in input as start of #n. */
257 /* OPEN+1 is number 1, etc. */
258 internal static readonly char CLOSE = (char)( OPEN + NSUBEXP );
259 /* no Analogous to OPEN. */
260 internal static readonly string[] opnames = new string[] { "END", "BOL", "EOL", "ANY", "ANYOF", "ANYBUT", "BRANCH", "BACK", "EXACTLY", "NOTHING", "STAR", "PLUS" };
261  
262 /*
263 * A node is one char of opcode followed by one char of "next" pointer.
264 * The value is a positive offset from the opcode of the node containing
265 * it. An operand, if any, simply follows the node. (Note that much of
266 * the code generation knows about this implicit relationship.)
267 *
268 * Opcode notes:
269 *
270 * BRANCH The set of branches constituting a single choice are hooked
271 * together with their "next" pointers, since precedence prevents
272 * anything being concatenated to any individual branch. The
273 * "next" pointer of the last BRANCH in a choice points to the
274 * thing following the whole choice. This is also where the
275 * final "next" pointer of each individual branch points; each
276 * branch starts with the operand node of a BRANCH node.
277 *
278 * ANYOF, ANYBUT, EXACTLY
279 * The format of a string operand is one char of length
280 * followed by the characters making up the string.
281 *
282 * BACK Normal "next" pointers all implicitly point forward; BACK
283 * exists to make loop structures possible.
284 *
285 * STAR, PLUS
286 * '?', and complex '*' and '+' are implemented as circular
287 * BRANCH structures using BACK. Simple cases (one character
288 * per match) are implemented with STAR and PLUS for speed
289 * and to minimize recursive plunges.
290 *
291 * OPENn, CLOSEn
292 * are numbered at compile time.
293 */
294  
295  
296 /// <summary> The bytecodes making up the regexp program.</summary>
297 internal char[] program;
298  
299 /// <summary> Whether the regexp matching should be case insensitive.</summary>
300 internal bool ignoreCase;
301  
302 /// <summary> The number of parenthesized subexpressions in the regexp pattern,
303 /// plus 1 for the match of the whole pattern itself.
304 /// </summary>
305 internal int npar;
306  
307 /// <summary> <code>true</code> if the pattern must match the beginning of the
308 /// string, so we don't have to waste time matching against all possible
309 /// starting locations in the string.
310 /// </summary>
311 internal bool anchored;
312  
313 internal int startChar;
314 internal string must;
315  
316 /// <summary> Compiles a new Regexp object from the given regular expression
317 /// pattern.
318 /// <p>
319 /// It takes a certain amount of time to parse and validate a regular
320 /// expression pattern before it can be used to perform matches
321 /// or substitutions. If the caller caches the new Regexp object, that
322 /// parsing time will be saved because the same Regexp can be used with
323 /// respect to many different strings.
324 ///
325 /// </summary>
326 /// <param name="">pat
327 /// The string holding the regular expression pattern.
328 ///
329 /// @throws IllegalArgumentException if the pattern is malformed.
330 /// The detail message for the exception will be set to a
331 /// string indicating how the pattern was malformed.
332 /// </param>
333 public Regexp( string pat )
334 {
335 compile( pat );
336 }
337  
338 /// <summary> Compiles a new Regexp object from the given regular expression
339 /// pattern.
340 ///
341 /// </summary>
342 /// <param name="">pat
343 /// The string holding the regular expression pattern.
344 ///
345 /// </param>
346 /// <param name="">ignoreCase
347 /// If <code>true</code> then this regular expression will
348 /// do case-insensitive matching. If <code>false</code>, then
349 /// the matches are case-sensitive. Regular expressions
350 /// generated by <code>Regexp(String)</code> are case-sensitive.
351 ///
352 /// @throws IllegalArgumentException if the pattern is malformed.
353 /// The detail message for the exception will be set to a
354 /// string indicating how the pattern was malformed.
355 /// </param>
356 public Regexp( string pat, bool ignoreCase )
357 {
358 this.ignoreCase = ignoreCase;
359 if ( ignoreCase )
360 {
361 pat = pat.ToLower();
362 }
363 compile( pat );
364 }
365  
366 /// <summary> Returns the number of parenthesized subexpressions in this regular
367 /// expression, plus one more for this expression itself.
368 ///
369 /// </summary>
370 /// <returns> The number.
371 /// </returns>
372 public int subspecs()
373 {
374 return npar;
375 }
376  
377 /// <summary> Matches the given string against this regular expression.
378 ///
379 /// </summary>
380 /// <param name="">str
381 /// The string to match.
382 ///
383 /// </param>
384 /// <returns> The substring of <code>str</code> that matched the entire
385 /// regular expression, or <code>null</code> if the string did not
386 /// match this regular expression.
387 /// </returns>
388 public string match( string str )
389 {
390 Match m = exec( str, 0, 0 );
391  
392 if ( m == null )
393 {
394 return null;
395 }
396 return str.Substring( m.indices[0], ( m.indices[1] ) - ( m.indices[0] ) );
397 }
398  
399 /// <summary> Matches the given string against this regular expression, and computes
400 /// the set of substrings that matched the parenthesized subexpressions.
401 /// <p>
402 /// <code>substrs[0]</code> is set to the range of <code>str</code>
403 /// that matched the entire regular expression.
404 /// <p>
405 /// <code>substrs[1]</code> is set to the range of <code>str</code>
406 /// that matched the first (leftmost) parenthesized subexpression.
407 /// <code>substrs[n]</code> is set to the range that matched the
408 /// <code>n</code><i>th</i> subexpression, and so on.
409 /// <p>
410 /// If subexpression <code>n</code> did not match, then
411 /// <code>substrs[n]</code> is set to <code>null</code>. Not to
412 /// be confused with "", which is a valid value for a
413 /// subexpression that matched 0 characters.
414 /// <p>
415 /// The length that the caller should use when allocating the
416 /// <code>substr</code> array is the return value of
417 /// <code>Regexp.subspecs</code>. The array
418 /// can be shorter (in which case not all the information will
419 /// be returned), or longer (in which case the remainder of the
420 /// elements are initialized to <code>null</code>), or
421 /// <code>null</code> (to ignore the subexpressions).
422 ///
423 /// </summary>
424 /// <param name="">str
425 /// The string to match.
426 ///
427 /// </param>
428 /// <param name="">substrs
429 /// An array of strings allocated by the caller, and filled in
430 /// with information about the portions of <code>str</code> that
431 /// matched the regular expression. May be <code>null</code>.
432 ///
433 /// </param>
434 /// <returns> <code>true</code> if <code>str</code> that matched this
435 /// regular expression, <code>false</code> otherwise.
436 /// If <code>false</code> is returned, then the contents of
437 /// <code>substrs</code> are unchanged.
438 ///
439 /// </returns>
440 /// <seealso cref="#subspecs">
441 /// </seealso>
442 public bool match( string str, string[] substrs )
443 {
444 Match m = exec( str, 0, 0 );
445  
446 if ( m == null )
447 {
448 return false;
449 }
450 if ( substrs != null )
451 {
452 int max = System.Math.Min( substrs.Length, npar );
453 int i;
454 int j = 0;
455 for ( i = 0; i < max; i++ )
456 {
457 int start = m.indices[j++];
458 int end = m.indices[j++];
459 if ( start < 0 )
460 {
461 substrs[i] = null;
462 }
463 else
464 {
465 substrs[i] = str.Substring( start, ( end ) - ( start ) );
466 }
467 }
468 for ( ; i < substrs.Length; i++ )
469 {
470 substrs[i] = null;
471 }
472 }
473 return true;
474 }
475  
476 /// <summary> Matches the given string against this regular expression, and computes
477 /// the set of substrings that matched the parenthesized subexpressions.
478 /// <p>
479 /// For the indices specified below, the range extends from the character
480 /// at the starting index up to, but not including, the character at the
481 /// ending index.
482 /// <p>
483 /// <code>indices[0]</code> and <code>indices[1]</code> are set to
484 /// starting and ending indices of the range of <code>str</code>
485 /// that matched the entire regular expression.
486 /// <p>
487 /// <code>indices[2]</code> and <code>indices[3]</code> are set to the
488 /// starting and ending indices of the range of <code>str</code> that
489 /// matched the first (leftmost) parenthesized subexpression.
490 /// <code>indices[n * 2]</code> and <code>indices[n * 2 + 1]</code>
491 /// are set to the range that matched the <code>n</code><i>th</i>
492 /// subexpression, and so on.
493 /// <p>
494 /// If subexpression <code>n</code> did not match, then
495 /// <code>indices[n * 2]</code> and <code>indices[n * 2 + 1]</code>
496 /// are both set to <code>-1</code>.
497 /// <p>
498 /// The length that the caller should use when allocating the
499 /// <code>indices</code> array is twice the return value of
500 /// <code>Regexp.subspecs</code>. The array
501 /// can be shorter (in which case not all the information will
502 /// be returned), or longer (in which case the remainder of the
503 /// elements are initialized to <code>-1</code>), or
504 /// <code>null</code> (to ignore the subexpressions).
505 ///
506 /// </summary>
507 /// <param name="">str
508 /// The string to match.
509 ///
510 /// </param>
511 /// <param name="">indices
512 /// An array of integers allocated by the caller, and filled in
513 /// with information about the portions of <code>str</code> that
514 /// matched all the parts of the regular expression.
515 /// May be <code>null</code>.
516 ///
517 /// </param>
518 /// <returns> <code>true</code> if the string matched the regular expression,
519 /// <code>false</code> otherwise. If <code>false</code> is
520 /// returned, then the contents of <code>indices</code> are
521 /// unchanged.
522 ///
523 /// </returns>
524 /// <seealso cref="#subspecs">
525 /// </seealso>
526 public bool match( string str, int[] indices )
527 {
528 Match m = exec( str, 0, 0 );
529  
530 if ( m == null )
531 {
532 return false;
533 }
534 if ( indices != null )
535 {
536 int max = System.Math.Min( indices.Length, npar * 2 );
537 Array.Copy( (System.Array)m.indices, 0, (System.Array)indices, 0, max );
538  
539 for ( int i = max; i < indices.Length; i++ )
540 {
541 indices[i] = -1;
542 }
543 }
544 return true;
545 }
546  
547 /// <summary> Matches a string against a regular expression and replaces the first
548 /// match with the string generated from the substitution parameter.
549 ///
550 /// </summary>
551 /// <param name="">str
552 /// The string to match against this regular expression.
553 ///
554 /// </param>
555 /// <param name="">subspec
556 /// The substitution parameter, described in <a href=#regsub>
557 /// REGULAR EXPRESSION SUBSTITUTION</a>.
558 ///
559 /// </param>
560 /// <returns> The string formed by replacing the first match in
561 /// <code>str</code> with the string generated from
562 /// <code>subspec</code>. If no matches were found, then
563 /// the return value is <code>null</code>.
564 /// </returns>
565 public string sub( string str, string subspec )
566 {
567 Regsub rs = new Regsub( this, str );
568 if ( rs.nextMatch() )
569 {
570 StringBuilder sb = new StringBuilder( rs.skipped() );
571 applySubspec( rs, subspec, sb );
572 sb.Append( rs.rest() );
573  
574 return sb.ToString();
575 }
576 else
577 {
578 return null;
579 }
580 }
581  
582 /// <summary> Matches a string against a regular expression and replaces all
583 /// matches with the string generated from the substitution parameter.
584 /// After each substutition is done, the portions of the string already
585 /// examined, including the newly substituted region, are <b>not</b> checked
586 /// again for new matches -- only the rest of the string is examined.
587 ///
588 /// </summary>
589 /// <param name="">str
590 /// The string to match against this regular expression.
591 ///
592 /// </param>
593 /// <param name="">subspec
594 /// The substitution parameter, described in <a href=#regsub>
595 /// REGULAR EXPRESSION SUBSTITUTION</a>.
596 ///
597 /// </param>
598 /// <returns> The string formed by replacing all the matches in
599 /// <code>str</code> with the strings generated from
600 /// <code>subspec</code>. If no matches were found, then
601 /// the return value is a copy of <code>str</code>.
602 /// </returns>
603 public string subAll( string str, string subspec )
604 {
605 return sub( str, new SubspecFilter( subspec, true ) );
606 }
607  
608 /// <summary> Utility method to give access to the standard substitution algorithm
609 /// used by <code>sub</code> and <code>subAll</code>. Appends to the
610 /// string buffer the string generated by applying the substitution
611 /// parameter to the matched region.
612 ///
613 /// </summary>
614 /// <param name="">rs
615 /// Information about the matched region.
616 ///
617 /// </param>
618 /// <param name="">subspec
619 /// The substitution parameter.
620 ///
621 /// </param>
622 /// <param name="">sb
623 /// StringBuffer to which the generated string is appended.
624 /// </param>
625 public static void applySubspec( Regsub rs, string subspec, StringBuilder sb )
626 {
627 try
628 {
629 int len = subspec.Length;
630 for ( int i = 0; i < len; i++ )
631 {
632 char ch = subspec[i];
633 switch ( ch )
634 {
635  
636 case '&':
637 {
638 sb.Append( rs.matched() );
639 break;
640 }
641  
642 case '\\':
643 {
644 i++;
645 ch = subspec[i];
646 if ( ( ch >= '0' ) && ( ch <= '9' ) )
647 {
648 string match = rs.submatch( ch - '0' );
649 if ( (System.Object)match != null )
650 {
651 sb.Append( match );
652 }
653 break;
654 }
655 // fall through.
656 }
657 goto default;
658  
659 default:
660 {
661 sb.Append( ch );
662 }
663 break;
664  
665 }
666 }
667 }
668 catch ( System.IndexOutOfRangeException e )
669 {
670 /*
671 * Ignore malformed substitution pattern.
672 * Return string matched so far.
673 */
674 }
675 }
676  
677 public string sub( string str, Filter rf )
678 {
679 Regsub rs = new Regsub( this, str );
680 if ( rs.nextMatch() == false )
681 {
682 return str;
683 }
684  
685 StringBuilder sb = new StringBuilder();
686 do
687 {
688 sb.Append( rs.skipped() );
689 if ( rf.filter( rs, sb ) == false )
690 {
691 break;
692 }
693 }
694 while ( rs.nextMatch() );
695 sb.Append( rs.rest() );
696 return sb.ToString();
697 }
698  
699 /// <summary> This interface is used by the <code>Regexp</code> class to generate
700 /// the replacement string for each pattern match found in the source
701 /// string.
702 ///
703 /// </summary>
704 /// <author> Colin Stevens (colin.stevens@sun.com)
705 /// </author>
706 /// <version> 1.7, 99/10/14
707 /// </version>
708 public interface Filter
709 {
710 /// <summary> Given the current state of the match, generate the replacement
711 /// string. This method will be called for each match found in
712 /// the source string, unless this filter decides not to handle any
713 /// more matches.
714 /// <p>
715 /// The implementation can use whatever rules it chooses
716 /// to generate the replacement string. For example, here is an
717 /// example of a filter that replaces the first <b>5</b>
718 /// occurrences of "%XX" in a string with the ASCII character
719 /// represented by the hex digits "XX":
720 /// <pre>
721 /// String str = ...;
722 ///
723 /// Regexp re = new Regexp("%[a-fA-F0-9][a-fA-F0-9]");
724 ///
725 /// Regexp.Filter rf = new Regexp.Filter() {
726 /// int count = 5;
727 /// public boolean filter(Regsub rs, StringBuffer sb) {
728 /// String match = rs.matched();
729 /// int hi = Character.digit(match.charAt(1), 16);
730 /// int lo = Character.digit(match.charAt(2), 16);
731 /// sb.append((char) ((hi &lt;&lt; 4) | lo));
732 /// return (--count > 0);
733 /// }
734 /// }
735 ///
736 /// String result = re.sub(str, rf);
737 /// </pre>
738 ///
739 /// </summary>
740 /// <param name="">rs
741 /// <code>Regsub</code> containing the state of the current
742 /// match.
743 ///
744 /// </param>
745 /// <param name="">sb
746 /// The string buffer that this filter should append the
747 /// generated string to. This string buffer actually
748 /// contains the results the calling <code>Regexp</code> has
749 /// generated up to this point.
750 ///
751 /// </param>
752 /// <returns> <code>false</code> if no further matches should be
753 /// considered in this string, <code>true</code> to allow
754 /// <code>Regexp</code> to continue looking for further
755 /// matches.
756 /// </returns>
757 bool filter( Regsub rs, StringBuilder sb );
758 }
759  
760 private class SubspecFilter : Filter
761 {
762 internal string subspec;
763 internal bool all;
764  
765 public SubspecFilter( string subspec, bool all )
766 {
767 this.subspec = subspec;
768 this.all = all;
769 }
770  
771 public bool filter( Regsub rs, StringBuilder sb )
772 {
773 sunlabs.brazil.util.regexp.Regexp.applySubspec( rs, subspec, sb );
774 return all;
775 }
776 }
777  
778 /// <summary> Returns a string representation of this compiled regular
779 /// expression. The format of the string representation is a
780 /// symbolic dump of the bytecodes.
781 ///
782 /// </summary>
783 /// <returns> A string representation of this regular expression.
784 /// </returns>
785 public override string ToString()
786 {
787 StringBuilder sb = new StringBuilder();
788  
789 sb.Append( "# subs: " + npar + "\n" );
790 sb.Append( "anchor: " + anchored + "\n" );
791 sb.Append( "start: " + (char)startChar + "\n" );
792 sb.Append( "must: " + must + "\n" );
793  
794 for ( int i = 0; i < program.Length; )
795 {
796 sb.Append( i + ":\t" );
797 int op = program[i];
798 if ( op >= CLOSE )
799 {
800 sb.Append( "CLOSE" + ( op - CLOSE ) );
801 }
802 else if ( op >= OPEN )
803 {
804 sb.Append( "OPEN" + ( op - OPEN ) );
805 }
806 else
807 {
808 sb.Append( opnames[op] );
809 }
810 int line;
811 int offset = (int)program[i + 1];
812 if ( offset == 0 )
813 {
814 sb.Append( '\t' );
815 }
816 else if ( op == BACK )
817 {
818 sb.Append( "\t-" + offset + "," + ( i - offset ) );
819 }
820 else
821 {
822 sb.Append( "\t+" + offset + "," + ( i + offset ) );
823 }
824  
825 if ( ( op == ANYOF ) || ( op == ANYBUT ) || ( op == EXACTLY ) )
826 {
827 sb.Append( "\t'" );
828 sb.Append( program, i + 3, program[i + 2] );
829 sb.Append( "'" );
830 i += 3 + program[i + 2];
831 }
832 else
833 {
834 i += 2;
835 }
836 sb.Append( '\n' );
837 }
838 return sb.ToString();
839 }
840  
841  
842 private void compile( string exp )
843 {
844 Compiler rcstate = new Compiler();
845 rcstate.parse = exp.ToCharArray();
846 rcstate.off = 0;
847 rcstate.npar = 1;
848 rcstate.code = new StringBuilder();
849  
850 rcstate.reg( false );
851  
852 program = rcstate.code.ToString().ToCharArray();
853 npar = rcstate.npar;
854 startChar = -1;
855  
856 /* optimize */
857 if ( program[rcstate.regnext( 0 )] == END )
858 {
859 if ( program[2] == BOL )
860 {
861 anchored = true;
862 }
863 else if ( program[2] == EXACTLY )
864 {
865 startChar = (int)program[5];
866 }
867 }
868  
869 /*
870 * If there's something expensive in the r.e., find the
871 * longest literal string that must appear and make it the
872 * regmust. Resolve ties in favor of later strings, since
873 * the regstart check works with the beginning of the r.e.
874 * and avoiding duplication strengthens checking. Not a
875 * strong reason, but sufficient in the absence of others.
876 */
877 /*
878 if ((rcstate.flagp & Compiler.SPSTART) != 0) {
879 int index = -1;
880 int longest = 0;
881  
882 for (scan = 0; scan < program.length; ) {
883 switch (program[scan]) {
884 case EXACTLY:
885 int length = program[scan + 2];
886 if (length > longest) {
887 index = scan;
888 longest = length;
889 }
890 // fall through;
891  
892 case ANYOF:
893 case ANYBUT:
894 scan += 3 + program[scan + 2];
895 break;
896  
897 default:
898 scan += 2;
899 break;
900 }
901 }
902 if (longest > 0) {
903 must = new String(program, index + 3, longest);
904 }
905 }*/
906 }
907  
908 internal Match exec( string str, int start, int off )
909 {
910 if ( ignoreCase )
911 {
912 str = str.ToLower();
913 }
914  
915 Match match = new Match();
916  
917 match.program = program;
918  
919 /* Mark beginning of line for ^ . */
920 match.str = str;
921 match.bol = start;
922 match.length = str.Length;
923  
924 match.indices = new int[npar * 2];
925  
926 if ( anchored )
927 {
928 /* Simplest case: anchored match need be tried only once. */
929 if ( match.regtry( off ) )
930 {
931 return match;
932 }
933 }
934 else if ( startChar >= 0 )
935 {
936 /* We know what char it must start with. */
937 while ( off < match.length )
938 {
939 off = str.IndexOf( (System.Char)startChar, off );
940 if ( off < 0 )
941 {
942 break;
943 }
944 if ( match.regtry( off ) )
945 {
946 return match;
947 }
948 off++;
949 }
950 }
951 else
952 {
953 /* Messy cases: unanchored match. */
954 do
955 {
956 if ( match.regtry( off ) )
957 {
958 return match;
959 }
960 }
961 while ( off++ < match.length );
962 }
963 return null;
964 }
965  
966 internal class Compiler
967 {
968 internal char[] parse;
969 internal int off;
970 internal int npar;
971 internal StringBuilder code;
972 internal int flagp;
973  
974  
975 internal const string META = "^$.[()|?+*\\";
976 internal const string MULT = "*+?";
977  
978 internal const int WORST = 0; /* Worst case. */
979 internal const int HASWIDTH = 1; /* Known never to match null string. */
980 internal const int SIMPLE = 2; /* Simple enough to be STAR/PLUS operand. */
981 internal const int SPSTART = 4; /* Starts with * or +. */
982  
983 /*
984 - reg - regular expression, i.e. main body or parenthesized thing
985 *
986 * Caller must absorb opening parenthesis.
987 *
988 * Combining parenthesis handling with the base level of regular expression
989 * is a trifle forced, but the need to tie the tails of the branches to what
990 * follows makes it hard to avoid.
991 */
992 internal int reg( bool paren )
993 {
994 int netFlags = HASWIDTH;
995 int parno = 0;
996  
997 int ret = -1;
998 if ( paren )
999 {
1000 parno = npar++;
1001 if ( npar >= sunlabs.brazil.util.regexp.Regexp.NSUBEXP )
1002 {
1003 throw new System.ArgumentException( "too many ()" );
1004 }
1005 ret = regnode( (char)( sunlabs.brazil.util.regexp.Regexp.OPEN + parno ) );
1006 }
1007  
1008 /* Pick up the branches, linking them together. */
1009 int br = regbranch();
1010 if ( ret >= 0 )
1011 {
1012 regtail( ret, br );
1013 }
1014 else
1015 {
1016 ret = br;
1017 }
1018  
1019 if ( ( flagp & HASWIDTH ) == 0 )
1020 {
1021 netFlags &= ~HASWIDTH;
1022 }
1023 netFlags |= ( flagp & SPSTART );
1024 while ( ( off < parse.Length ) && ( parse[off] == '|' ) )
1025 {
1026 off++;
1027 br = regbranch();
1028 regtail( ret, br );
1029 if ( ( flagp & HASWIDTH ) == 0 )
1030 {
1031 netFlags &= ~HASWIDTH;
1032 }
1033 netFlags |= ( flagp & SPSTART );
1034 }
1035  
1036 /* Make a closing node, and hook it on the end. */
1037 int ender = regnode( ( paren ) ? (char)( sunlabs.brazil.util.regexp.Regexp.CLOSE + parno ) : sunlabs.brazil.util.regexp.Regexp.END );
1038 regtail( ret, ender );
1039  
1040 /* Hook the tails of the branches to the closing node. */
1041 for ( br = ret; br >= 0; br = regnext( br ) )
1042 {
1043 regoptail( br, ender );
1044 }
1045  
1046 /* Check for proper termination. */
1047 if ( paren && ( ( off >= parse.Length ) || ( parse[off++] != ')' ) ) )
1048 {
1049 throw new System.ArgumentException( "missing )" );
1050 }
1051 else if ( ( paren == false ) && ( off < parse.Length ) )
1052 {
1053 throw new System.ArgumentException( "unexpected )" );
1054 }
1055  
1056 flagp = netFlags;
1057 return ret;
1058 }
1059  
1060 /*
1061 - regbranch - one alternative of an | operator
1062 *
1063 * Implements the concatenation operator.
1064 */
1065 internal int regbranch()
1066 {
1067 int netFlags = WORST; /* Tentatively. */
1068  
1069 int ret = regnode( sunlabs.brazil.util.regexp.Regexp.BRANCH );
1070 int chain = -1;
1071 while ( ( off < parse.Length ) && ( parse[off] != '|' ) && ( parse[off] != ')' ) )
1072 {
1073 int latest = regpiece();
1074 netFlags |= flagp & HASWIDTH;
1075 if ( chain < 0 )
1076 {
1077 /* First piece. */
1078 netFlags |= ( flagp & SPSTART );
1079 }
1080 else
1081 {
1082 regtail( chain, latest );
1083 }
1084 chain = latest;
1085 }
1086 if ( chain < 0 )
1087 {
1088 /* Loop ran zero times. */
1089 regnode( sunlabs.brazil.util.regexp.Regexp.NOTHING );
1090 }
1091  
1092 flagp = netFlags;
1093 return ret;
1094 }
1095  
1096 /*
1097 - regpiece - something followed by possible [*+?]
1098 *
1099 * Note that the branching code sequences used for ? and the general cases
1100 * of * and + are somewhat optimized: they use the same NOTHING node as
1101 * both the endmarker for their branch list and the body of the last branch.
1102 * It might seem that this node could be dispensed with entirely, but the
1103 * endmarker role is not redundant.
1104 */
1105 internal int regpiece()
1106 {
1107 int netFlags;
1108  
1109 int ret = regatom();
1110  
1111 if ( ( off >= parse.Length ) || ( isMult( parse[off] ) == false ) )
1112 {
1113 return ret;
1114 }
1115 char op = parse[off];
1116  
1117 if ( ( ( flagp & HASWIDTH ) == 0 ) && ( op != '?' ) )
1118 {
1119 throw new System.ArgumentException( "*+ operand could be empty" );
1120 }
1121 netFlags = ( op != '+' ) ? ( WORST | SPSTART ) : ( WORST | HASWIDTH );
1122  
1123 if ( ( op == '*' ) && ( ( flagp & SIMPLE ) != 0 ) )
1124 {
1125 reginsert( sunlabs.brazil.util.regexp.Regexp.STAR, ret );
1126 }
1127 else if ( op == '*' )
1128 {
1129 /* Emit x* as (x&|), where & means "self". */
1130 reginsert( sunlabs.brazil.util.regexp.Regexp.BRANCH, ret ); /* Either x */
1131 regoptail( ret, regnode( sunlabs.brazil.util.regexp.Regexp.BACK ) ); /* and loop */
1132 regoptail( ret, ret ); /* back */
1133 regtail( ret, regnode( sunlabs.brazil.util.regexp.Regexp.BRANCH ) ); /* or */
1134 regtail( ret, regnode( sunlabs.brazil.util.regexp.Regexp.NOTHING ) ); /* null. */
1135 }
1136 else if ( ( op == '+' ) && ( ( flagp & SIMPLE ) != 0 ) )
1137 {
1138 reginsert( sunlabs.brazil.util.regexp.Regexp.PLUS, ret );
1139 }
1140 else if ( op == '+' )
1141 {
1142 /* Emit x+ as x(&|), where & means "self". */
1143 int next = regnode( sunlabs.brazil.util.regexp.Regexp.BRANCH ); /* Either */
1144 regtail( ret, next );
1145 regtail( regnode( sunlabs.brazil.util.regexp.Regexp.BACK ), ret ); /* loop back */
1146 regtail( next, regnode( sunlabs.brazil.util.regexp.Regexp.BRANCH ) ); /* or */
1147 regtail( ret, regnode( sunlabs.brazil.util.regexp.Regexp.NOTHING ) ); /* null. */
1148 }
1149 else if ( op == '?' )
1150 {
1151 /* Emit x? as (x|) */
1152 reginsert( sunlabs.brazil.util.regexp.Regexp.BRANCH, ret ); /* Either x */
1153 regtail( ret, regnode( sunlabs.brazil.util.regexp.Regexp.BRANCH ) ); /* or */
1154 int next = regnode( sunlabs.brazil.util.regexp.Regexp.NOTHING ); /* null. */
1155 regtail( ret, next );
1156 regoptail( ret, next );
1157 }
1158 off++;
1159 if ( ( off < parse.Length ) && isMult( parse[off] ) )
1160 {
1161 throw new System.ArgumentException( "nested *?+" );
1162 }
1163  
1164 flagp = netFlags;
1165 return ret;
1166 }
1167  
1168 /*
1169 - regatom - the lowest level
1170 *
1171 * Optimization: gobbles an entire sequence of ordinary characters so that
1172 * it can turn them into a single node, which is smaller to store and
1173 * faster to run. Backslashed characters are exceptions, each becoming a
1174 * separate node; the code is simpler that way and it's not worth fixing.
1175 */
1176 internal int regatom()
1177 {
1178 int netFlags = WORST; /* Tentatively. */
1179 int ret;
1180  
1181 switch ( parse[off++] )
1182 {
1183  
1184 case '^':
1185 ret = regnode( sunlabs.brazil.util.regexp.Regexp.BOL );
1186 break;
1187  
1188 case '$':
1189 ret = regnode( sunlabs.brazil.util.regexp.Regexp.EOL );
1190 break;
1191  
1192 case '.':
1193 ret = regnode( sunlabs.brazil.util.regexp.Regexp.ANY );
1194 netFlags |= ( HASWIDTH | SIMPLE );
1195 break;
1196  
1197 case '[':
1198 {
1199 try
1200 {
1201 if ( parse[off] == '^' )
1202 {
1203 ret = regnode( sunlabs.brazil.util.regexp.Regexp.ANYBUT );
1204 off++;
1205 }
1206 else
1207 {
1208 ret = regnode( sunlabs.brazil.util.regexp.Regexp.ANYOF );
1209 }
1210  
1211 int pos = reglen();
1212 regc( '\x0000' );
1213  
1214 if ( ( parse[off] == ']' ) || ( parse[off] == '-' ) )
1215 {
1216 regc( parse[off++] );
1217 }
1218 while ( parse[off] != ']' )
1219 {
1220 if ( parse[off] == '-' )
1221 {
1222 off++;
1223 if ( parse[off] == ']' )
1224 {
1225 regc( '-' );
1226 }
1227 else
1228 {
1229 int start = parse[off - 2];
1230 int end = parse[off++];
1231 if ( start > end )
1232 {
1233 throw new System.ArgumentException( "invalid [] range" );
1234 }
1235 for ( int i = start + 1; i <= end; i++ )
1236 {
1237 regc( (char)i );
1238 }
1239 }
1240 }
1241 else
1242 {
1243 regc( parse[off++] );
1244 }
1245 }
1246 regset( pos, (char)( reglen() - pos - 1 ) );
1247 off++;
1248 netFlags |= HASWIDTH | SIMPLE;
1249 }
1250 catch ( System.IndexOutOfRangeException e )
1251 {
1252 throw new System.ArgumentException( "missing ]" );
1253 }
1254 break;
1255 }
1256  
1257 case '(':
1258 ret = reg( true );
1259 netFlags |= ( flagp & ( HASWIDTH | SPSTART ) );
1260 break;
1261  
1262 case '|':
1263 case ')':
1264 throw new System.ArgumentException( "internal urp" );
1265  
1266 case '?':
1267 case '+':
1268 case '*':
1269 throw new System.ArgumentException( "?+* follows nothing" );
1270  
1271 case '\\':
1272 if ( off >= parse.Length )
1273 {
1274 throw new System.ArgumentException( "trailing \\" );
1275 }
1276 ret = regnode( sunlabs.brazil.util.regexp.Regexp.EXACTLY );
1277 regc( (char)1 );
1278 regc( parse[off++] );
1279 netFlags |= HASWIDTH | SIMPLE;
1280 break;
1281  
1282 default:
1283 {
1284 off--;
1285 int end;
1286 for ( end = off; end < parse.Length; end++ )
1287 {
1288 if ( META.IndexOf( (System.Char)parse[end] ) >= 0 )
1289 {
1290 break;
1291 }
1292 }
1293 if ( ( end > off + 1 ) && ( end < parse.Length ) && isMult( parse[end] ) )
1294 {
1295 end--; /* Back off clear of ?+* operand. */
1296 }
1297 netFlags |= HASWIDTH;
1298 if ( end == off + 1 )
1299 {
1300 netFlags |= SIMPLE;
1301 }
1302 ret = regnode( sunlabs.brazil.util.regexp.Regexp.EXACTLY );
1303 regc( (char)( end - off ) );
1304 for ( ; off < end; off++ )
1305 {
1306 regc( parse[off] );
1307 }
1308 }
1309 break;
1310  
1311 }
1312  
1313 flagp = netFlags;
1314 return ret;
1315 }
1316  
1317 /*
1318 - regnode - emit a node
1319 */
1320 internal int regnode( char op )
1321 {
1322 int ret = code.Length;
1323 code.Append( op );
1324 code.Append( '\x0000' );
1325  
1326 return ret;
1327 }
1328  
1329 /*
1330 - regc - emit (if appropriate) a byte of code
1331 */
1332 internal void regc( char b )
1333 {
1334 code.Append( b );
1335 }
1336  
1337 internal int reglen()
1338 {
1339 return code.Length;
1340 }
1341  
1342 internal void regset( int pos, char ch )
1343 {
1344 code[pos] = ch;
1345 }
1346  
1347  
1348 /*
1349 - reginsert - insert an operator in front of already-emitted operand
1350 *
1351 * Means relocating the operand.
1352 */
1353 internal void reginsert( char op, int pos )
1354 {
1355 char[] tmp = new char[] { op, '\x0000' };
1356 code.Insert( pos, tmp );
1357 }
1358  
1359 /*
1360 - regtail - set the next-pointer at the end of a node chain
1361 */
1362 internal void regtail( int pos, int val )
1363 {
1364 /* Find last node. */
1365  
1366 int scan = pos;
1367 while ( true )
1368 {
1369 int tmp = regnext( scan );
1370 if ( tmp < 0 )
1371 {
1372 break;
1373 }
1374 scan = tmp;
1375 }
1376  
1377 int offset = ( code[scan] == sunlabs.brazil.util.regexp.Regexp.BACK ) ? scan - val : val - scan;
1378 code[scan + 1] = (char)offset;
1379 }
1380  
1381 /*
1382 - regoptail - regtail on operand of first argument; nop if operandless
1383 */
1384 internal void regoptail( int pos, int val )
1385 {
1386 if ( ( pos < 0 ) || ( code[pos] != sunlabs.brazil.util.regexp.Regexp.BRANCH ) )
1387 {
1388 return;
1389 }
1390 regtail( pos + 2, val );
1391 }
1392  
1393  
1394 /*
1395 - regnext - dig the "next" pointer out of a node
1396 */
1397 internal int regnext( int pos )
1398 {
1399 int offset = code[pos + 1];
1400 if ( offset == 0 )
1401 {
1402 return -1;
1403 }
1404 if ( code[pos] == sunlabs.brazil.util.regexp.Regexp.BACK )
1405 {
1406 return pos - offset;
1407 }
1408 else
1409 {
1410 return pos + offset;
1411 }
1412 }
1413  
1414 internal static bool isMult( char ch )
1415 {
1416 return ( ch == '*' ) || ( ch == '+' ) || ( ch == '?' );
1417 }
1418 }
1419  
1420 internal class Match
1421 {
1422 internal char[] program;
1423  
1424 internal string str;
1425 internal int bol;
1426 internal int input;
1427 internal int length;
1428  
1429 internal int[] indices;
1430  
1431 internal bool regtry( int off )
1432 {
1433 this.input = off;
1434  
1435 for ( int i = 0; i < indices.Length; i++ )
1436 {
1437 indices[i] = -1;
1438 }
1439  
1440 if ( regmatch( 0 ) )
1441 {
1442 indices[0] = off;
1443 indices[1] = input;
1444 return true;
1445 }
1446 else
1447 {
1448 return false;
1449 }
1450 }
1451  
1452 /*
1453 - regmatch - main matching routine
1454 *
1455 * Conceptually the strategy is simple: check to see whether the current
1456 * node matches, call self recursively to see whether the rest matches,
1457 * and then act accordingly. In practice we make some effort to avoid
1458 * recursion, in particular by going through "ordinary" nodes (that don't
1459 * need to know whether the rest of the match failed) by a loop instead of
1460 * by recursion.
1461 */
1462 internal bool regmatch( int scan )
1463 {
1464 while ( true )
1465 {
1466 int next = regnext( scan );
1467 int op = program[scan];
1468 switch ( op )
1469 {
1470  
1471 case sunlabs.brazil.util.regexp.Regexp.BOL:
1472 if ( input != bol )
1473 {
1474 return false;
1475 }
1476 break;
1477  
1478  
1479 case sunlabs.brazil.util.regexp.Regexp.EOL:
1480 if ( input != length )
1481 {
1482 return false;
1483 }
1484 break;
1485  
1486  
1487 case sunlabs.brazil.util.regexp.Regexp.ANY:
1488 if ( input >= length )
1489 {
1490 return false;
1491 }
1492 input++;
1493 break;
1494  
1495  
1496 case sunlabs.brazil.util.regexp.Regexp.EXACTLY:
1497 {
1498 if ( compare( scan ) == false )
1499 {
1500 return false;
1501 }
1502 break;
1503 }
1504  
1505  
1506 case sunlabs.brazil.util.regexp.Regexp.ANYOF:
1507 if ( input >= length )
1508 {
1509 return false;
1510 }
1511 if ( present( scan ) == false )
1512 {
1513 return false;
1514 }
1515 input++;
1516 break;
1517  
1518  
1519 case sunlabs.brazil.util.regexp.Regexp.ANYBUT:
1520 if ( input >= length )
1521 {
1522 return false;
1523 }
1524 if ( present( scan ) )
1525 {
1526 return false;
1527 }
1528 input++;
1529 break;
1530  
1531  
1532 case sunlabs.brazil.util.regexp.Regexp.NOTHING:
1533 case sunlabs.brazil.util.regexp.Regexp.BACK:
1534 break;
1535  
1536  
1537 case sunlabs.brazil.util.regexp.Regexp.BRANCH:
1538 {
1539 if ( program[next] != sunlabs.brazil.util.regexp.Regexp.BRANCH )
1540 {
1541 next = scan + 2;
1542 }
1543 else
1544 {
1545 do
1546 {
1547 int save = input;
1548 if ( regmatch( scan + 2 ) )
1549 {
1550 return true;
1551 }
1552 input = save;
1553 scan = regnext( scan );
1554 }
1555 while ( ( scan >= 0 ) && ( program[scan] == sunlabs.brazil.util.regexp.Regexp.BRANCH ) );
1556 return false;
1557 }
1558 break;
1559 }
1560  
1561  
1562 case sunlabs.brazil.util.regexp.Regexp.STAR:
1563 case sunlabs.brazil.util.regexp.Regexp.PLUS:
1564 {
1565 /*
1566 * Lookahead to avoid useless match attempts
1567 * when we know what character comes next.
1568 */
1569  
1570 int ch = -1;
1571 if ( program[next] == sunlabs.brazil.util.regexp.Regexp.EXACTLY )
1572 {
1573 ch = program[next + 3];
1574 }
1575  
1576 int min = ( op == sunlabs.brazil.util.regexp.Regexp.STAR ) ? 0 : 1;
1577 int save = input;
1578 int no = regrepeat( scan + 2 );
1579  
1580 while ( no >= min )
1581 {
1582 /* If it could work, try it. */
1583 if ( ( ch < 0 ) || ( ( input < length ) && ( str[input] == ch ) ) )
1584 {
1585 if ( regmatch( next ) )
1586 {
1587 return true;
1588 }
1589 }
1590 /* Couldn't or didn't -- back up. */
1591 no--;
1592 input = save + no;
1593 }
1594 return false;
1595 }
1596  
1597  
1598 case sunlabs.brazil.util.regexp.Regexp.END:
1599 return true;
1600  
1601  
1602 default:
1603 if ( op >= sunlabs.brazil.util.regexp.Regexp.CLOSE )
1604 {
1605 int no = op - sunlabs.brazil.util.regexp.Regexp.CLOSE;
1606 int save = input;
1607  
1608 if ( regmatch( next ) )
1609 {
1610 /*
1611 * Don't set endp if some later
1612 * invocation of the same parentheses
1613 * already has.
1614 */
1615 if ( indices[no * 2 + 1] <= 0 )
1616 {
1617 indices[no * 2 + 1] = save;
1618 }
1619 return true;
1620 }
1621 }
1622 else if ( op >= sunlabs.brazil.util.regexp.Regexp.OPEN )
1623 {
1624 int no = op - sunlabs.brazil.util.regexp.Regexp.OPEN;
1625 int save = input;
1626  
1627 if ( regmatch( next ) )
1628 {
1629 /*
1630 * Don't set startp if some later invocation of the
1631 * same parentheses already has.
1632 */
1633 if ( indices[no * 2] <= 0 )
1634 {
1635 indices[no * 2] = save;
1636 }
1637 return true;
1638 }
1639 }
1640 return false;
1641  
1642 }
1643 scan = next;
1644 }
1645 }
1646  
1647 internal bool compare( int scan )
1648 {
1649 int count = program[scan + 2];
1650 if ( input + count > length )
1651 {
1652 return false;
1653 }
1654 int start = scan + 3;
1655 int end = start + count;
1656 for ( int i = start; i < end; i++ )
1657 {
1658 if ( str[input++] != program[i] )
1659 {
1660 return false;
1661 }
1662 }
1663 return true;
1664 }
1665  
1666 internal bool present( int scan )
1667 {
1668 char ch = str[input];
1669  
1670 int count = program[scan + 2];
1671 int start = scan + 3;
1672 int end = start + count;
1673  
1674 for ( int i = start; i < end; i++ )
1675 {
1676 if ( program[i] == ch )
1677 {
1678 return true;
1679 }
1680 }
1681 return false;
1682 }
1683  
1684  
1685 /*
1686 - regrepeat - repeatedly match something simple, report how many
1687 */
1688 internal int regrepeat( int scan )
1689 {
1690 int op = program[scan];
1691 int count = 0;
1692  
1693 switch ( op )
1694 {
1695  
1696 case sunlabs.brazil.util.regexp.Regexp.ANY:
1697  
1698 count = length - input;
1699 input = length;
1700 break;
1701  
1702  
1703 case sunlabs.brazil.util.regexp.Regexp.EXACTLY:
1704 {
1705 // 'g*' matches all the following 'g' characters.
1706  
1707 char ch = program[scan + 3];
1708 while ( ( input < length ) && ( str[input] == ch ) )
1709 {
1710 input++;
1711 count++;
1712 }
1713 break;
1714 }
1715  
1716  
1717 case sunlabs.brazil.util.regexp.Regexp.ANYOF:
1718  
1719 while ( ( input < length ) && present( scan ) )
1720 {
1721 input++;
1722 count++;
1723 }
1724 break;
1725  
1726  
1727  
1728 case sunlabs.brazil.util.regexp.Regexp.ANYBUT:
1729 while ( ( input < length ) && !present( scan ) )
1730 {
1731 input++;
1732 count++;
1733 }
1734 break;
1735 }
1736 return count;
1737 }
1738  
1739 /*
1740 - regnext - dig the "next" pointer out of a node
1741 */
1742 internal int regnext( int scan )
1743 {
1744 int offset = program[scan + 1];
1745 if ( program[scan] == sunlabs.brazil.util.regexp.Regexp.BACK )
1746 {
1747 return scan - offset;
1748 }
1749 else
1750 {
1751 return scan + offset;
1752 }
1753 }
1754 }
1755 }
1756 }