wasCSharpSQLite – Blame information for rev 4
?pathlinks?
Rev | Author | Line No. | Line |
---|---|---|---|
1 | office | 1 | /* |
2 | * Regexp.java |
||
3 | * |
||
4 | * Copyright (c) 1999 Sun Microsystems, Inc. |
||
5 | * |
||
6 | * See the file "license.terms" for information on usage and |
||
7 | * redistribution of this file, and for a DISCLAIMER OF ALL |
||
8 | * WARRANTIES. |
||
9 | * |
||
10 | * SCCS: %Z% %M% %I% %E% %U% |
||
11 | */ |
||
12 | // Included in SQLite3 port to C# for use in testharness only; 2008 Noah B Hart |
||
13 | //$Header$ |
||
14 | |||
15 | using System; |
||
16 | using System.Text; |
||
17 | namespace sunlabs.brazil.util.regexp |
||
18 | { |
||
19 | |||
20 | /// <summary> The <code>Regexp</code> class can be used to match a pattern against a |
||
21 | /// string and optionally replace the matched parts with new strings. |
||
22 | /// <p> |
||
23 | /// Regular expressions were implemented by translating Henry Spencer's |
||
24 | /// regular expression package for <a href="http://www.scriptics.com">tcl8.0</a>. |
||
25 | /// Much of the description below is copied verbatim from the tcl8.0 regsub |
||
26 | /// manual entry. |
||
27 | /// <hr> |
||
28 | /// REGULAR EXPRESSIONS |
||
29 | /// <p> |
||
30 | /// A regular expression is zero or more <code>branches</code>, separated by |
||
31 | /// "|". It matches anything that matches one of the branches. |
||
32 | /// <p> |
||
33 | /// A branch is zero or more <code>pieces</code>, concatenated. |
||
34 | /// It matches a match for the first piece, followed by a match for the |
||
35 | /// second piece, etc. |
||
36 | /// <p> |
||
37 | /// A piece is an <code>atom</code>, possibly followed by "*", "+", or |
||
38 | /// "?". <ul> |
||
39 | /// <li> An atom followed by "*" matches a sequence of 0 or more matches of |
||
40 | /// the atom. |
||
41 | /// <li> An atom followed by "+" matches a sequence of 1 or more matches of |
||
42 | /// the atom. |
||
43 | /// <li> An atom followed by "?" matches either 0 or 1 matches of the atom. |
||
44 | /// </ul> |
||
45 | /// <p> |
||
46 | /// An atom is <ul> |
||
47 | /// <li> a regular expression in parentheses (matching a match for the |
||
48 | /// regular expression) |
||
49 | /// <li> a <code>range</code> (see below) |
||
50 | /// <li> "." (matching any single character) |
||
51 | /// <li> "^" (matching the null string at the beginning of the input string) |
||
52 | /// <li> "$" (matching the null string at the end of the input string) |
||
53 | /// <li> a "\" followed by a single character (matching that character) |
||
54 | /// <li> a single character with no other significance (matching that |
||
55 | /// character). |
||
56 | /// </ul> |
||
57 | /// <p> |
||
58 | /// A <code>range</code> is a sequence of characters enclosed in "[]". |
||
59 | /// The range normally matches any single character from the sequence. |
||
60 | /// If the sequence begins with "^", the range matches any single character |
||
61 | /// <b>not</b> from the rest of the sequence. |
||
62 | /// If two characters in the sequence are separated by "-", this is shorthand |
||
63 | /// for the full list of characters between them (e.g. "[0-9]" matches any |
||
64 | /// decimal digit). To include a literal "]" in the sequence, make it the |
||
65 | /// first character (following a possible "^"). To include a literal "-", |
||
66 | /// make it the first or last character. |
||
67 | /// <p> |
||
68 | /// In general there may be more than one way to match a regular expression |
||
69 | /// to an input string. For example, consider the command |
||
70 | /// <pre> |
||
71 | /// String[] match = new String[2]; |
||
72 | /// Regexp.match("(a*)b*", "aabaaabb", match); |
||
73 | /// </pre> |
||
74 | /// Considering only the rules given so far, <code>match[0]</code> and |
||
75 | /// <code>match[1]</code> could end up with the values <ul> |
||
76 | /// <li> "aabb" and "aa" |
||
77 | /// <li> "aaab" and "aaa" |
||
78 | /// <li> "ab" and "a" |
||
79 | /// </ul> |
||
80 | /// or any of several other combinations. To resolve this potential ambiguity, |
||
81 | /// Regexp chooses among alternatives using the rule "first then longest". |
||
82 | /// In other words, it considers the possible matches in order working |
||
83 | /// from left to right across the input string and the pattern, and it |
||
84 | /// attempts to match longer pieces of the input string before shorter |
||
85 | /// ones. More specifically, the following rules apply in decreasing |
||
86 | /// order of priority: <ol> |
||
87 | /// <li> If a regular expression could match two different parts of an input |
||
88 | /// string then it will match the one that begins earliest. |
||
89 | /// <li> If a regular expression contains "|" operators then the |
||
90 | /// leftmost matching sub-expression is chosen. |
||
91 | /// <li> In "*", "+", and "?" constructs, longer matches are chosen in |
||
92 | /// preference to shorter ones. |
||
93 | /// <li> |
||
94 | /// In sequences of expression components the components are considered |
||
95 | /// from left to right. |
||
96 | /// </ol> |
||
97 | /// <p> |
||
98 | /// In the example from above, "(a*)b*" therefore matches exactly "aab"; the |
||
99 | /// "(a*)" portion of the pattern is matched first and it consumes the leading |
||
100 | /// "aa", then the "b*" portion of the pattern consumes the next "b". Or, |
||
101 | /// consider the following example: |
||
102 | /// <pre> |
||
103 | /// String match = new String[3]; |
||
104 | /// Regexp.match("(ab|a)(b*)c", "abc", match); |
||
105 | /// </pre> |
||
106 | /// After this command, <code>match[0]</code> will be "abc", |
||
107 | /// <code>match[1]</code> will be "ab", and <code>match[2]</code> will be an |
||
108 | /// empty string. |
||
109 | /// Rule 4 specifies that the "(ab|a)" component gets first shot at the input |
||
110 | /// string and Rule 2 specifies that the "ab" sub-expression |
||
111 | /// is checked before the "a" sub-expression. |
||
112 | /// Thus the "b" has already been claimed before the "(b*)" |
||
113 | /// component is checked and therefore "(b*)" must match an empty string. |
||
114 | /// <hr> |
||
115 | /// <a name=regsub></a> |
||
116 | /// REGULAR EXPRESSION SUBSTITUTION |
||
117 | /// <p> |
||
118 | /// Regular expression substitution matches a string against a regular |
||
119 | /// expression, transforming the string by replacing the matched region(s) |
||
120 | /// with new substring(s). |
||
121 | /// <p> |
||
122 | /// What gets substituted into the result is controlled by a |
||
123 | /// <code>subspec</code>. The subspec is a formatting string that specifies |
||
124 | /// what portions of the matched region should be substituted into the |
||
125 | /// result. |
||
126 | /// <ul> |
||
127 | /// <li> "&" or "\0" is replaced with a copy of the entire matched region. |
||
128 | /// <li> "\<code>n</code>", where <code>n</code> is a digit from 1 to 9, |
||
129 | /// is replaced with a copy of the <code>n</code><i>th</i> subexpression. |
||
130 | /// <li> "\&" or "\\" are replaced with just "&" or "\" to escape their |
||
131 | /// special meaning. |
||
132 | /// <li> any other character is passed through. |
||
133 | /// </ul> |
||
134 | /// In the above, strings like "\2" represents the two characters |
||
135 | /// <code>backslash</code> and "2", not the Unicode character 0002. |
||
136 | /// <hr> |
||
137 | /// Here is an example of how to use Regexp |
||
138 | /// <pre> |
||
139 | /// |
||
140 | /// public static void |
||
141 | /// main(String[] args) |
||
142 | /// throws Exception |
||
143 | /// { |
||
144 | /// Regexp re; |
||
145 | /// String[] matches; |
||
146 | /// String s; |
||
147 | /// |
||
148 | /// /* |
||
149 | /// * A regular expression to match the first line of a HTTP request. |
||
150 | /// * |
||
151 | /// * 1. ^ - starting at the beginning of the line |
||
152 | /// * 2. ([A-Z]+) - match and remember some upper case characters |
||
153 | /// * 3. [ \t]+ - skip blank space |
||
154 | /// * 4. ([^ \t]*) - match and remember up to the next blank space |
||
155 | /// * 5. [ \t]+ - skip more blank space |
||
156 | /// * 6. (HTTP/1\\.[01]) - match and remember HTTP/1.0 or HTTP/1.1 |
||
157 | /// * 7. $ - end of string - no chars left. |
||
158 | /// */ |
||
159 | /// |
||
160 | /// s = "GET http://a.b.com:1234/index.html HTTP/1.1"; |
||
161 | /// |
||
162 | /// Regexp re = new Regexp("^([A-Z]+)[ \t]+([^ \t]+)[ \t]+(HTTP/1\\.[01])$"); |
||
163 | /// String[] matches = new String[4]; |
||
164 | /// if (re.match(s, matches)) { |
||
165 | /// System.out.println("METHOD " + matches[1]); |
||
166 | /// System.out.println("URL " + matches[2]); |
||
167 | /// System.out.println("VERSION " + matches[3]); |
||
168 | /// } |
||
169 | /// |
||
170 | /// /* |
||
171 | /// * A regular expression to extract some simple comma-separated data, |
||
172 | /// * reorder some of the columns, and discard column 2. |
||
173 | /// */ |
||
174 | /// |
||
175 | /// s = "abc,def,ghi,klm,nop,pqr"; |
||
176 | /// |
||
177 | /// Regexp re = new Regexp("^([^,]+),([^,]+),([^,]+),(.*)"); |
||
178 | /// System.out.println(re.sub(s, "\\3,\\1,\\4")); |
||
179 | /// } |
||
180 | /// </pre> |
||
181 | /// |
||
182 | /// </summary> |
||
183 | /// <author> Colin Stevens (colin.stevens@sun.com) |
||
184 | /// </author> |
||
185 | /// <version> 1.7, 99/10/14 |
||
186 | /// </version> |
||
187 | /// <seealso cref="Regsub"> |
||
188 | /// </seealso> |
||
189 | |||
190 | public class Regexp |
||
191 | { |
||
192 | //[STAThread] |
||
193 | //public static void Main(string[] args) |
||
194 | //{ |
||
195 | // if ((args.Length == 2) && (args[0].Equals("compile"))) |
||
196 | // { |
||
197 | // System.Diagnostics.Debug.WriteLine(new Regexp(args[1])); |
||
198 | // } |
||
199 | // else if ((args.Length == 3) && (args[0].Equals("match"))) |
||
200 | // { |
||
201 | // Regexp r = new Regexp(args[1]); |
||
202 | // string[] substrs = new string[r.subspecs()]; |
||
203 | // bool match = r.match(args[2], substrs); |
||
204 | // System.Diagnostics.Debug.WriteLine("match:\t" + match); |
||
205 | // for (int i = 0; i < substrs.Length; i++) |
||
206 | // { |
||
207 | // System.Diagnostics.Debug.WriteLine((i + 1) + ":\t" + substrs[i]); |
||
208 | // } |
||
209 | // } |
||
210 | // else if ((args.Length == 4) && (args[0].Equals("sub"))) |
||
211 | // { |
||
212 | // Regexp r = new Regexp(args[1]); |
||
213 | // System.Diagnostics.Debug.WriteLine(r.subAll(args[2], args[3])); |
||
214 | // } |
||
215 | // else |
||
216 | // { |
||
217 | // System.Diagnostics.Debug.WriteLine("usage:"); |
||
218 | // System.Diagnostics.Debug.WriteLine("\tRegexp match <pattern> <string>"); |
||
219 | // System.Diagnostics.Debug.WriteLine("\tRegexp sub <pattern> <string> <subspec>"); |
||
220 | // System.Diagnostics.Debug.WriteLine("\tRegexp compile <pattern>"); |
||
221 | // } |
||
222 | //} |
||
223 | |||
224 | /* |
||
225 | * Structure for regexp "program". This is essentially a linear encoding |
||
226 | * of a nondeterministic finite-state machine (aka syntax charts or |
||
227 | * "railroad normal form" in parsing technology). Each node is an opcode |
||
228 | * plus a "next" pointer, possibly plus an operand. "Next" pointers of |
||
229 | * all nodes except BRANCH implement concatenation; a "next" pointer with |
||
230 | * a BRANCH on both ends of it is connecting two alternatives. (Here we |
||
231 | * have one of the subtle syntax dependencies: an individual BRANCH (as |
||
232 | * opposed to a collection of them) is never concatenated with anything |
||
233 | * because of operator precedence.) The operand of some types of node is |
||
234 | * a literal string; for others, it is a node leading into a sub-FSM. In |
||
235 | * particular, the operand of a BRANCH node is the first node of the branch. |
||
236 | * (NB this is *not* a tree structure: the tail of the branch connects |
||
237 | * to the thing following the set of BRANCHes.) The opcodes are: |
||
238 | */ |
||
239 | |||
240 | internal const int NSUBEXP = 100; |
||
241 | |||
242 | /* definition number opnd? meaning */ |
||
243 | |||
244 | internal const char END = (char)( 0 ); /* no End of program. */ |
||
245 | internal const char BOL = (char)( 1 ); /* no Match "" at beginning of line. */ |
||
246 | internal const char EOL = (char)( 2 ); /* no Match "" at end of line. */ |
||
247 | internal const char ANY = (char)( 3 ); /* no Match any one character. */ |
||
248 | internal const char ANYOF = (char)( 4 ); /* str Match any character in this string. */ |
||
249 | internal const char ANYBUT = (char)( 5 ); /* str Match any character not in this string. */ |
||
250 | internal const char BRANCH = (char)( 6 ); /* node Match this alternative, or the next... */ |
||
251 | internal const char BACK = (char)( 7 ); /* no Match "", "next" ptr points backward. */ |
||
252 | internal const char EXACTLY = (char)( 8 ); /* str Match this string. */ |
||
253 | internal const char NOTHING = (char)( 9 ); /* no Match empty string. */ |
||
254 | internal const char STAR = (char)( 10 ); /* node Match this (simple) thing 0 or more times. */ |
||
255 | internal const char PLUS = (char)( 11 ); /* node Match this (simple) thing 1 or more times. */ |
||
256 | internal const char OPEN = (char)( 20 ); /* no Mark this point in input as start of #n. */ |
||
257 | /* OPEN+1 is number 1, etc. */ |
||
258 | internal static readonly char CLOSE = (char)( OPEN + NSUBEXP ); |
||
259 | /* no Analogous to OPEN. */ |
||
260 | internal static readonly string[] opnames = new string[] { "END", "BOL", "EOL", "ANY", "ANYOF", "ANYBUT", "BRANCH", "BACK", "EXACTLY", "NOTHING", "STAR", "PLUS" }; |
||
261 | |||
262 | /* |
||
263 | * A node is one char of opcode followed by one char of "next" pointer. |
||
264 | * The value is a positive offset from the opcode of the node containing |
||
265 | * it. An operand, if any, simply follows the node. (Note that much of |
||
266 | * the code generation knows about this implicit relationship.) |
||
267 | * |
||
268 | * Opcode notes: |
||
269 | * |
||
270 | * BRANCH The set of branches constituting a single choice are hooked |
||
271 | * together with their "next" pointers, since precedence prevents |
||
272 | * anything being concatenated to any individual branch. The |
||
273 | * "next" pointer of the last BRANCH in a choice points to the |
||
274 | * thing following the whole choice. This is also where the |
||
275 | * final "next" pointer of each individual branch points; each |
||
276 | * branch starts with the operand node of a BRANCH node. |
||
277 | * |
||
278 | * ANYOF, ANYBUT, EXACTLY |
||
279 | * The format of a string operand is one char of length |
||
280 | * followed by the characters making up the string. |
||
281 | * |
||
282 | * BACK Normal "next" pointers all implicitly point forward; BACK |
||
283 | * exists to make loop structures possible. |
||
284 | * |
||
285 | * STAR, PLUS |
||
286 | * '?', and complex '*' and '+' are implemented as circular |
||
287 | * BRANCH structures using BACK. Simple cases (one character |
||
288 | * per match) are implemented with STAR and PLUS for speed |
||
289 | * and to minimize recursive plunges. |
||
290 | * |
||
291 | * OPENn, CLOSEn |
||
292 | * are numbered at compile time. |
||
293 | */ |
||
294 | |||
295 | |||
296 | /// <summary> The bytecodes making up the regexp program.</summary> |
||
297 | internal char[] program; |
||
298 | |||
299 | /// <summary> Whether the regexp matching should be case insensitive.</summary> |
||
300 | internal bool ignoreCase; |
||
301 | |||
302 | /// <summary> The number of parenthesized subexpressions in the regexp pattern, |
||
303 | /// plus 1 for the match of the whole pattern itself. |
||
304 | /// </summary> |
||
305 | internal int npar; |
||
306 | |||
307 | /// <summary> <code>true</code> if the pattern must match the beginning of the |
||
308 | /// string, so we don't have to waste time matching against all possible |
||
309 | /// starting locations in the string. |
||
310 | /// </summary> |
||
311 | internal bool anchored; |
||
312 | |||
313 | internal int startChar; |
||
314 | internal string must; |
||
315 | |||
316 | /// <summary> Compiles a new Regexp object from the given regular expression |
||
317 | /// pattern. |
||
318 | /// <p> |
||
319 | /// It takes a certain amount of time to parse and validate a regular |
||
320 | /// expression pattern before it can be used to perform matches |
||
321 | /// or substitutions. If the caller caches the new Regexp object, that |
||
322 | /// parsing time will be saved because the same Regexp can be used with |
||
323 | /// respect to many different strings. |
||
324 | /// |
||
325 | /// </summary> |
||
326 | /// <param name="">pat |
||
327 | /// The string holding the regular expression pattern. |
||
328 | /// |
||
329 | /// @throws IllegalArgumentException if the pattern is malformed. |
||
330 | /// The detail message for the exception will be set to a |
||
331 | /// string indicating how the pattern was malformed. |
||
332 | /// </param> |
||
333 | public Regexp( string pat ) |
||
334 | { |
||
335 | compile( pat ); |
||
336 | } |
||
337 | |||
338 | /// <summary> Compiles a new Regexp object from the given regular expression |
||
339 | /// pattern. |
||
340 | /// |
||
341 | /// </summary> |
||
342 | /// <param name="">pat |
||
343 | /// The string holding the regular expression pattern. |
||
344 | /// |
||
345 | /// </param> |
||
346 | /// <param name="">ignoreCase |
||
347 | /// If <code>true</code> then this regular expression will |
||
348 | /// do case-insensitive matching. If <code>false</code>, then |
||
349 | /// the matches are case-sensitive. Regular expressions |
||
350 | /// generated by <code>Regexp(String)</code> are case-sensitive. |
||
351 | /// |
||
352 | /// @throws IllegalArgumentException if the pattern is malformed. |
||
353 | /// The detail message for the exception will be set to a |
||
354 | /// string indicating how the pattern was malformed. |
||
355 | /// </param> |
||
356 | public Regexp( string pat, bool ignoreCase ) |
||
357 | { |
||
358 | this.ignoreCase = ignoreCase; |
||
359 | if ( ignoreCase ) |
||
360 | { |
||
361 | pat = pat.ToLower(); |
||
362 | } |
||
363 | compile( pat ); |
||
364 | } |
||
365 | |||
366 | /// <summary> Returns the number of parenthesized subexpressions in this regular |
||
367 | /// expression, plus one more for this expression itself. |
||
368 | /// |
||
369 | /// </summary> |
||
370 | /// <returns> The number. |
||
371 | /// </returns> |
||
372 | public int subspecs() |
||
373 | { |
||
374 | return npar; |
||
375 | } |
||
376 | |||
377 | /// <summary> Matches the given string against this regular expression. |
||
378 | /// |
||
379 | /// </summary> |
||
380 | /// <param name="">str |
||
381 | /// The string to match. |
||
382 | /// |
||
383 | /// </param> |
||
384 | /// <returns> The substring of <code>str</code> that matched the entire |
||
385 | /// regular expression, or <code>null</code> if the string did not |
||
386 | /// match this regular expression. |
||
387 | /// </returns> |
||
388 | public string match( string str ) |
||
389 | { |
||
390 | Match m = exec( str, 0, 0 ); |
||
391 | |||
392 | if ( m == null ) |
||
393 | { |
||
394 | return null; |
||
395 | } |
||
396 | return str.Substring( m.indices[0], ( m.indices[1] ) - ( m.indices[0] ) ); |
||
397 | } |
||
398 | |||
399 | /// <summary> Matches the given string against this regular expression, and computes |
||
400 | /// the set of substrings that matched the parenthesized subexpressions. |
||
401 | /// <p> |
||
402 | /// <code>substrs[0]</code> is set to the range of <code>str</code> |
||
403 | /// that matched the entire regular expression. |
||
404 | /// <p> |
||
405 | /// <code>substrs[1]</code> is set to the range of <code>str</code> |
||
406 | /// that matched the first (leftmost) parenthesized subexpression. |
||
407 | /// <code>substrs[n]</code> is set to the range that matched the |
||
408 | /// <code>n</code><i>th</i> subexpression, and so on. |
||
409 | /// <p> |
||
410 | /// If subexpression <code>n</code> did not match, then |
||
411 | /// <code>substrs[n]</code> is set to <code>null</code>. Not to |
||
412 | /// be confused with "", which is a valid value for a |
||
413 | /// subexpression that matched 0 characters. |
||
414 | /// <p> |
||
415 | /// The length that the caller should use when allocating the |
||
416 | /// <code>substr</code> array is the return value of |
||
417 | /// <code>Regexp.subspecs</code>. The array |
||
418 | /// can be shorter (in which case not all the information will |
||
419 | /// be returned), or longer (in which case the remainder of the |
||
420 | /// elements are initialized to <code>null</code>), or |
||
421 | /// <code>null</code> (to ignore the subexpressions). |
||
422 | /// |
||
423 | /// </summary> |
||
424 | /// <param name="">str |
||
425 | /// The string to match. |
||
426 | /// |
||
427 | /// </param> |
||
428 | /// <param name="">substrs |
||
429 | /// An array of strings allocated by the caller, and filled in |
||
430 | /// with information about the portions of <code>str</code> that |
||
431 | /// matched the regular expression. May be <code>null</code>. |
||
432 | /// |
||
433 | /// </param> |
||
434 | /// <returns> <code>true</code> if <code>str</code> that matched this |
||
435 | /// regular expression, <code>false</code> otherwise. |
||
436 | /// If <code>false</code> is returned, then the contents of |
||
437 | /// <code>substrs</code> are unchanged. |
||
438 | /// |
||
439 | /// </returns> |
||
440 | /// <seealso cref="#subspecs"> |
||
441 | /// </seealso> |
||
442 | public bool match( string str, string[] substrs ) |
||
443 | { |
||
444 | Match m = exec( str, 0, 0 ); |
||
445 | |||
446 | if ( m == null ) |
||
447 | { |
||
448 | return false; |
||
449 | } |
||
450 | if ( substrs != null ) |
||
451 | { |
||
452 | int max = System.Math.Min( substrs.Length, npar ); |
||
453 | int i; |
||
454 | int j = 0; |
||
455 | for ( i = 0; i < max; i++ ) |
||
456 | { |
||
457 | int start = m.indices[j++]; |
||
458 | int end = m.indices[j++]; |
||
459 | if ( start < 0 ) |
||
460 | { |
||
461 | substrs[i] = null; |
||
462 | } |
||
463 | else |
||
464 | { |
||
465 | substrs[i] = str.Substring( start, ( end ) - ( start ) ); |
||
466 | } |
||
467 | } |
||
468 | for ( ; i < substrs.Length; i++ ) |
||
469 | { |
||
470 | substrs[i] = null; |
||
471 | } |
||
472 | } |
||
473 | return true; |
||
474 | } |
||
475 | |||
476 | /// <summary> Matches the given string against this regular expression, and computes |
||
477 | /// the set of substrings that matched the parenthesized subexpressions. |
||
478 | /// <p> |
||
479 | /// For the indices specified below, the range extends from the character |
||
480 | /// at the starting index up to, but not including, the character at the |
||
481 | /// ending index. |
||
482 | /// <p> |
||
483 | /// <code>indices[0]</code> and <code>indices[1]</code> are set to |
||
484 | /// starting and ending indices of the range of <code>str</code> |
||
485 | /// that matched the entire regular expression. |
||
486 | /// <p> |
||
487 | /// <code>indices[2]</code> and <code>indices[3]</code> are set to the |
||
488 | /// starting and ending indices of the range of <code>str</code> that |
||
489 | /// matched the first (leftmost) parenthesized subexpression. |
||
490 | /// <code>indices[n * 2]</code> and <code>indices[n * 2 + 1]</code> |
||
491 | /// are set to the range that matched the <code>n</code><i>th</i> |
||
492 | /// subexpression, and so on. |
||
493 | /// <p> |
||
494 | /// If subexpression <code>n</code> did not match, then |
||
495 | /// <code>indices[n * 2]</code> and <code>indices[n * 2 + 1]</code> |
||
496 | /// are both set to <code>-1</code>. |
||
497 | /// <p> |
||
498 | /// The length that the caller should use when allocating the |
||
499 | /// <code>indices</code> array is twice the return value of |
||
500 | /// <code>Regexp.subspecs</code>. The array |
||
501 | /// can be shorter (in which case not all the information will |
||
502 | /// be returned), or longer (in which case the remainder of the |
||
503 | /// elements are initialized to <code>-1</code>), or |
||
504 | /// <code>null</code> (to ignore the subexpressions). |
||
505 | /// |
||
506 | /// </summary> |
||
507 | /// <param name="">str |
||
508 | /// The string to match. |
||
509 | /// |
||
510 | /// </param> |
||
511 | /// <param name="">indices |
||
512 | /// An array of integers allocated by the caller, and filled in |
||
513 | /// with information about the portions of <code>str</code> that |
||
514 | /// matched all the parts of the regular expression. |
||
515 | /// May be <code>null</code>. |
||
516 | /// |
||
517 | /// </param> |
||
518 | /// <returns> <code>true</code> if the string matched the regular expression, |
||
519 | /// <code>false</code> otherwise. If <code>false</code> is |
||
520 | /// returned, then the contents of <code>indices</code> are |
||
521 | /// unchanged. |
||
522 | /// |
||
523 | /// </returns> |
||
524 | /// <seealso cref="#subspecs"> |
||
525 | /// </seealso> |
||
526 | public bool match( string str, int[] indices ) |
||
527 | { |
||
528 | Match m = exec( str, 0, 0 ); |
||
529 | |||
530 | if ( m == null ) |
||
531 | { |
||
532 | return false; |
||
533 | } |
||
534 | if ( indices != null ) |
||
535 | { |
||
536 | int max = System.Math.Min( indices.Length, npar * 2 ); |
||
537 | Array.Copy( (System.Array)m.indices, 0, (System.Array)indices, 0, max ); |
||
538 | |||
539 | for ( int i = max; i < indices.Length; i++ ) |
||
540 | { |
||
541 | indices[i] = -1; |
||
542 | } |
||
543 | } |
||
544 | return true; |
||
545 | } |
||
546 | |||
547 | /// <summary> Matches a string against a regular expression and replaces the first |
||
548 | /// match with the string generated from the substitution parameter. |
||
549 | /// |
||
550 | /// </summary> |
||
551 | /// <param name="">str |
||
552 | /// The string to match against this regular expression. |
||
553 | /// |
||
554 | /// </param> |
||
555 | /// <param name="">subspec |
||
556 | /// The substitution parameter, described in <a href=#regsub> |
||
557 | /// REGULAR EXPRESSION SUBSTITUTION</a>. |
||
558 | /// |
||
559 | /// </param> |
||
560 | /// <returns> The string formed by replacing the first match in |
||
561 | /// <code>str</code> with the string generated from |
||
562 | /// <code>subspec</code>. If no matches were found, then |
||
563 | /// the return value is <code>null</code>. |
||
564 | /// </returns> |
||
565 | public string sub( string str, string subspec ) |
||
566 | { |
||
567 | Regsub rs = new Regsub( this, str ); |
||
568 | if ( rs.nextMatch() ) |
||
569 | { |
||
570 | StringBuilder sb = new StringBuilder( rs.skipped() ); |
||
571 | applySubspec( rs, subspec, sb ); |
||
572 | sb.Append( rs.rest() ); |
||
573 | |||
574 | return sb.ToString(); |
||
575 | } |
||
576 | else |
||
577 | { |
||
578 | return null; |
||
579 | } |
||
580 | } |
||
581 | |||
582 | /// <summary> Matches a string against a regular expression and replaces all |
||
583 | /// matches with the string generated from the substitution parameter. |
||
584 | /// After each substutition is done, the portions of the string already |
||
585 | /// examined, including the newly substituted region, are <b>not</b> checked |
||
586 | /// again for new matches -- only the rest of the string is examined. |
||
587 | /// |
||
588 | /// </summary> |
||
589 | /// <param name="">str |
||
590 | /// The string to match against this regular expression. |
||
591 | /// |
||
592 | /// </param> |
||
593 | /// <param name="">subspec |
||
594 | /// The substitution parameter, described in <a href=#regsub> |
||
595 | /// REGULAR EXPRESSION SUBSTITUTION</a>. |
||
596 | /// |
||
597 | /// </param> |
||
598 | /// <returns> The string formed by replacing all the matches in |
||
599 | /// <code>str</code> with the strings generated from |
||
600 | /// <code>subspec</code>. If no matches were found, then |
||
601 | /// the return value is a copy of <code>str</code>. |
||
602 | /// </returns> |
||
603 | public string subAll( string str, string subspec ) |
||
604 | { |
||
605 | return sub( str, new SubspecFilter( subspec, true ) ); |
||
606 | } |
||
607 | |||
608 | /// <summary> Utility method to give access to the standard substitution algorithm |
||
609 | /// used by <code>sub</code> and <code>subAll</code>. Appends to the |
||
610 | /// string buffer the string generated by applying the substitution |
||
611 | /// parameter to the matched region. |
||
612 | /// |
||
613 | /// </summary> |
||
614 | /// <param name="">rs |
||
615 | /// Information about the matched region. |
||
616 | /// |
||
617 | /// </param> |
||
618 | /// <param name="">subspec |
||
619 | /// The substitution parameter. |
||
620 | /// |
||
621 | /// </param> |
||
622 | /// <param name="">sb |
||
623 | /// StringBuffer to which the generated string is appended. |
||
624 | /// </param> |
||
625 | public static void applySubspec( Regsub rs, string subspec, StringBuilder sb ) |
||
626 | { |
||
627 | try |
||
628 | { |
||
629 | int len = subspec.Length; |
||
630 | for ( int i = 0; i < len; i++ ) |
||
631 | { |
||
632 | char ch = subspec[i]; |
||
633 | switch ( ch ) |
||
634 | { |
||
635 | |||
636 | case '&': |
||
637 | { |
||
638 | sb.Append( rs.matched() ); |
||
639 | break; |
||
640 | } |
||
641 | |||
642 | case '\\': |
||
643 | { |
||
644 | i++; |
||
645 | ch = subspec[i]; |
||
646 | if ( ( ch >= '0' ) && ( ch <= '9' ) ) |
||
647 | { |
||
648 | string match = rs.submatch( ch - '0' ); |
||
649 | if ( (System.Object)match != null ) |
||
650 | { |
||
651 | sb.Append( match ); |
||
652 | } |
||
653 | break; |
||
654 | } |
||
655 | // fall through. |
||
656 | } |
||
657 | goto default; |
||
658 | |||
659 | default: |
||
660 | { |
||
661 | sb.Append( ch ); |
||
662 | } |
||
663 | break; |
||
664 | |||
665 | } |
||
666 | } |
||
667 | } |
||
668 | catch ( System.IndexOutOfRangeException e ) |
||
669 | { |
||
670 | /* |
||
671 | * Ignore malformed substitution pattern. |
||
672 | * Return string matched so far. |
||
673 | */ |
||
674 | } |
||
675 | } |
||
676 | |||
677 | public string sub( string str, Filter rf ) |
||
678 | { |
||
679 | Regsub rs = new Regsub( this, str ); |
||
680 | if ( rs.nextMatch() == false ) |
||
681 | { |
||
682 | return str; |
||
683 | } |
||
684 | |||
685 | StringBuilder sb = new StringBuilder(); |
||
686 | do |
||
687 | { |
||
688 | sb.Append( rs.skipped() ); |
||
689 | if ( rf.filter( rs, sb ) == false ) |
||
690 | { |
||
691 | break; |
||
692 | } |
||
693 | } |
||
694 | while ( rs.nextMatch() ); |
||
695 | sb.Append( rs.rest() ); |
||
696 | return sb.ToString(); |
||
697 | } |
||
698 | |||
699 | /// <summary> This interface is used by the <code>Regexp</code> class to generate |
||
700 | /// the replacement string for each pattern match found in the source |
||
701 | /// string. |
||
702 | /// |
||
703 | /// </summary> |
||
704 | /// <author> Colin Stevens (colin.stevens@sun.com) |
||
705 | /// </author> |
||
706 | /// <version> 1.7, 99/10/14 |
||
707 | /// </version> |
||
708 | public interface Filter |
||
709 | { |
||
710 | /// <summary> Given the current state of the match, generate the replacement |
||
711 | /// string. This method will be called for each match found in |
||
712 | /// the source string, unless this filter decides not to handle any |
||
713 | /// more matches. |
||
714 | /// <p> |
||
715 | /// The implementation can use whatever rules it chooses |
||
716 | /// to generate the replacement string. For example, here is an |
||
717 | /// example of a filter that replaces the first <b>5</b> |
||
718 | /// occurrences of "%XX" in a string with the ASCII character |
||
719 | /// represented by the hex digits "XX": |
||
720 | /// <pre> |
||
721 | /// String str = ...; |
||
722 | /// |
||
723 | /// Regexp re = new Regexp("%[a-fA-F0-9][a-fA-F0-9]"); |
||
724 | /// |
||
725 | /// Regexp.Filter rf = new Regexp.Filter() { |
||
726 | /// int count = 5; |
||
727 | /// public boolean filter(Regsub rs, StringBuffer sb) { |
||
728 | /// String match = rs.matched(); |
||
729 | /// int hi = Character.digit(match.charAt(1), 16); |
||
730 | /// int lo = Character.digit(match.charAt(2), 16); |
||
731 | /// sb.append((char) ((hi << 4) | lo)); |
||
732 | /// return (--count > 0); |
||
733 | /// } |
||
734 | /// } |
||
735 | /// |
||
736 | /// String result = re.sub(str, rf); |
||
737 | /// </pre> |
||
738 | /// |
||
739 | /// </summary> |
||
740 | /// <param name="">rs |
||
741 | /// <code>Regsub</code> containing the state of the current |
||
742 | /// match. |
||
743 | /// |
||
744 | /// </param> |
||
745 | /// <param name="">sb |
||
746 | /// The string buffer that this filter should append the |
||
747 | /// generated string to. This string buffer actually |
||
748 | /// contains the results the calling <code>Regexp</code> has |
||
749 | /// generated up to this point. |
||
750 | /// |
||
751 | /// </param> |
||
752 | /// <returns> <code>false</code> if no further matches should be |
||
753 | /// considered in this string, <code>true</code> to allow |
||
754 | /// <code>Regexp</code> to continue looking for further |
||
755 | /// matches. |
||
756 | /// </returns> |
||
757 | bool filter( Regsub rs, StringBuilder sb ); |
||
758 | } |
||
759 | |||
760 | private class SubspecFilter : Filter |
||
761 | { |
||
762 | internal string subspec; |
||
763 | internal bool all; |
||
764 | |||
765 | public SubspecFilter( string subspec, bool all ) |
||
766 | { |
||
767 | this.subspec = subspec; |
||
768 | this.all = all; |
||
769 | } |
||
770 | |||
771 | public bool filter( Regsub rs, StringBuilder sb ) |
||
772 | { |
||
773 | sunlabs.brazil.util.regexp.Regexp.applySubspec( rs, subspec, sb ); |
||
774 | return all; |
||
775 | } |
||
776 | } |
||
777 | |||
778 | /// <summary> Returns a string representation of this compiled regular |
||
779 | /// expression. The format of the string representation is a |
||
780 | /// symbolic dump of the bytecodes. |
||
781 | /// |
||
782 | /// </summary> |
||
783 | /// <returns> A string representation of this regular expression. |
||
784 | /// </returns> |
||
785 | public override string ToString() |
||
786 | { |
||
787 | StringBuilder sb = new StringBuilder(); |
||
788 | |||
789 | sb.Append( "# subs: " + npar + "\n" ); |
||
790 | sb.Append( "anchor: " + anchored + "\n" ); |
||
791 | sb.Append( "start: " + (char)startChar + "\n" ); |
||
792 | sb.Append( "must: " + must + "\n" ); |
||
793 | |||
794 | for ( int i = 0; i < program.Length; ) |
||
795 | { |
||
796 | sb.Append( i + ":\t" ); |
||
797 | int op = program[i]; |
||
798 | if ( op >= CLOSE ) |
||
799 | { |
||
800 | sb.Append( "CLOSE" + ( op - CLOSE ) ); |
||
801 | } |
||
802 | else if ( op >= OPEN ) |
||
803 | { |
||
804 | sb.Append( "OPEN" + ( op - OPEN ) ); |
||
805 | } |
||
806 | else |
||
807 | { |
||
808 | sb.Append( opnames[op] ); |
||
809 | } |
||
810 | int line; |
||
811 | int offset = (int)program[i + 1]; |
||
812 | if ( offset == 0 ) |
||
813 | { |
||
814 | sb.Append( '\t' ); |
||
815 | } |
||
816 | else if ( op == BACK ) |
||
817 | { |
||
818 | sb.Append( "\t-" + offset + "," + ( i - offset ) ); |
||
819 | } |
||
820 | else |
||
821 | { |
||
822 | sb.Append( "\t+" + offset + "," + ( i + offset ) ); |
||
823 | } |
||
824 | |||
825 | if ( ( op == ANYOF ) || ( op == ANYBUT ) || ( op == EXACTLY ) ) |
||
826 | { |
||
827 | sb.Append( "\t'" ); |
||
828 | sb.Append( program, i + 3, program[i + 2] ); |
||
829 | sb.Append( "'" ); |
||
830 | i += 3 + program[i + 2]; |
||
831 | } |
||
832 | else |
||
833 | { |
||
834 | i += 2; |
||
835 | } |
||
836 | sb.Append( '\n' ); |
||
837 | } |
||
838 | return sb.ToString(); |
||
839 | } |
||
840 | |||
841 | |||
842 | private void compile( string exp ) |
||
843 | { |
||
844 | Compiler rcstate = new Compiler(); |
||
845 | rcstate.parse = exp.ToCharArray(); |
||
846 | rcstate.off = 0; |
||
847 | rcstate.npar = 1; |
||
848 | rcstate.code = new StringBuilder(); |
||
849 | |||
850 | rcstate.reg( false ); |
||
851 | |||
852 | program = rcstate.code.ToString().ToCharArray(); |
||
853 | npar = rcstate.npar; |
||
854 | startChar = -1; |
||
855 | |||
856 | /* optimize */ |
||
857 | if ( program[rcstate.regnext( 0 )] == END ) |
||
858 | { |
||
859 | if ( program[2] == BOL ) |
||
860 | { |
||
861 | anchored = true; |
||
862 | } |
||
863 | else if ( program[2] == EXACTLY ) |
||
864 | { |
||
865 | startChar = (int)program[5]; |
||
866 | } |
||
867 | } |
||
868 | |||
869 | /* |
||
870 | * If there's something expensive in the r.e., find the |
||
871 | * longest literal string that must appear and make it the |
||
872 | * regmust. Resolve ties in favor of later strings, since |
||
873 | * the regstart check works with the beginning of the r.e. |
||
874 | * and avoiding duplication strengthens checking. Not a |
||
875 | * strong reason, but sufficient in the absence of others. |
||
876 | */ |
||
877 | /* |
||
878 | if ((rcstate.flagp & Compiler.SPSTART) != 0) { |
||
879 | int index = -1; |
||
880 | int longest = 0; |
||
881 | |||
882 | for (scan = 0; scan < program.length; ) { |
||
883 | switch (program[scan]) { |
||
884 | case EXACTLY: |
||
885 | int length = program[scan + 2]; |
||
886 | if (length > longest) { |
||
887 | index = scan; |
||
888 | longest = length; |
||
889 | } |
||
890 | // fall through; |
||
891 | |||
892 | case ANYOF: |
||
893 | case ANYBUT: |
||
894 | scan += 3 + program[scan + 2]; |
||
895 | break; |
||
896 | |||
897 | default: |
||
898 | scan += 2; |
||
899 | break; |
||
900 | } |
||
901 | } |
||
902 | if (longest > 0) { |
||
903 | must = new String(program, index + 3, longest); |
||
904 | } |
||
905 | }*/ |
||
906 | } |
||
907 | |||
908 | internal Match exec( string str, int start, int off ) |
||
909 | { |
||
910 | if ( ignoreCase ) |
||
911 | { |
||
912 | str = str.ToLower(); |
||
913 | } |
||
914 | |||
915 | Match match = new Match(); |
||
916 | |||
917 | match.program = program; |
||
918 | |||
919 | /* Mark beginning of line for ^ . */ |
||
920 | match.str = str; |
||
921 | match.bol = start; |
||
922 | match.length = str.Length; |
||
923 | |||
924 | match.indices = new int[npar * 2]; |
||
925 | |||
926 | if ( anchored ) |
||
927 | { |
||
928 | /* Simplest case: anchored match need be tried only once. */ |
||
929 | if ( match.regtry( off ) ) |
||
930 | { |
||
931 | return match; |
||
932 | } |
||
933 | } |
||
934 | else if ( startChar >= 0 ) |
||
935 | { |
||
936 | /* We know what char it must start with. */ |
||
937 | while ( off < match.length ) |
||
938 | { |
||
939 | off = str.IndexOf( (System.Char)startChar, off ); |
||
940 | if ( off < 0 ) |
||
941 | { |
||
942 | break; |
||
943 | } |
||
944 | if ( match.regtry( off ) ) |
||
945 | { |
||
946 | return match; |
||
947 | } |
||
948 | off++; |
||
949 | } |
||
950 | } |
||
951 | else |
||
952 | { |
||
953 | /* Messy cases: unanchored match. */ |
||
954 | do |
||
955 | { |
||
956 | if ( match.regtry( off ) ) |
||
957 | { |
||
958 | return match; |
||
959 | } |
||
960 | } |
||
961 | while ( off++ < match.length ); |
||
962 | } |
||
963 | return null; |
||
964 | } |
||
965 | |||
966 | internal class Compiler |
||
967 | { |
||
968 | internal char[] parse; |
||
969 | internal int off; |
||
970 | internal int npar; |
||
971 | internal StringBuilder code; |
||
972 | internal int flagp; |
||
973 | |||
974 | |||
975 | internal const string META = "^$.[()|?+*\\"; |
||
976 | internal const string MULT = "*+?"; |
||
977 | |||
978 | internal const int WORST = 0; /* Worst case. */ |
||
979 | internal const int HASWIDTH = 1; /* Known never to match null string. */ |
||
980 | internal const int SIMPLE = 2; /* Simple enough to be STAR/PLUS operand. */ |
||
981 | internal const int SPSTART = 4; /* Starts with * or +. */ |
||
982 | |||
983 | /* |
||
984 | - reg - regular expression, i.e. main body or parenthesized thing |
||
985 | * |
||
986 | * Caller must absorb opening parenthesis. |
||
987 | * |
||
988 | * Combining parenthesis handling with the base level of regular expression |
||
989 | * is a trifle forced, but the need to tie the tails of the branches to what |
||
990 | * follows makes it hard to avoid. |
||
991 | */ |
||
992 | internal int reg( bool paren ) |
||
993 | { |
||
994 | int netFlags = HASWIDTH; |
||
995 | int parno = 0; |
||
996 | |||
997 | int ret = -1; |
||
998 | if ( paren ) |
||
999 | { |
||
1000 | parno = npar++; |
||
1001 | if ( npar >= sunlabs.brazil.util.regexp.Regexp.NSUBEXP ) |
||
1002 | { |
||
1003 | throw new System.ArgumentException( "too many ()" ); |
||
1004 | } |
||
1005 | ret = regnode( (char)( sunlabs.brazil.util.regexp.Regexp.OPEN + parno ) ); |
||
1006 | } |
||
1007 | |||
1008 | /* Pick up the branches, linking them together. */ |
||
1009 | int br = regbranch(); |
||
1010 | if ( ret >= 0 ) |
||
1011 | { |
||
1012 | regtail( ret, br ); |
||
1013 | } |
||
1014 | else |
||
1015 | { |
||
1016 | ret = br; |
||
1017 | } |
||
1018 | |||
1019 | if ( ( flagp & HASWIDTH ) == 0 ) |
||
1020 | { |
||
1021 | netFlags &= ~HASWIDTH; |
||
1022 | } |
||
1023 | netFlags |= ( flagp & SPSTART ); |
||
1024 | while ( ( off < parse.Length ) && ( parse[off] == '|' ) ) |
||
1025 | { |
||
1026 | off++; |
||
1027 | br = regbranch(); |
||
1028 | regtail( ret, br ); |
||
1029 | if ( ( flagp & HASWIDTH ) == 0 ) |
||
1030 | { |
||
1031 | netFlags &= ~HASWIDTH; |
||
1032 | } |
||
1033 | netFlags |= ( flagp & SPSTART ); |
||
1034 | } |
||
1035 | |||
1036 | /* Make a closing node, and hook it on the end. */ |
||
1037 | int ender = regnode( ( paren ) ? (char)( sunlabs.brazil.util.regexp.Regexp.CLOSE + parno ) : sunlabs.brazil.util.regexp.Regexp.END ); |
||
1038 | regtail( ret, ender ); |
||
1039 | |||
1040 | /* Hook the tails of the branches to the closing node. */ |
||
1041 | for ( br = ret; br >= 0; br = regnext( br ) ) |
||
1042 | { |
||
1043 | regoptail( br, ender ); |
||
1044 | } |
||
1045 | |||
1046 | /* Check for proper termination. */ |
||
1047 | if ( paren && ( ( off >= parse.Length ) || ( parse[off++] != ')' ) ) ) |
||
1048 | { |
||
1049 | throw new System.ArgumentException( "missing )" ); |
||
1050 | } |
||
1051 | else if ( ( paren == false ) && ( off < parse.Length ) ) |
||
1052 | { |
||
1053 | throw new System.ArgumentException( "unexpected )" ); |
||
1054 | } |
||
1055 | |||
1056 | flagp = netFlags; |
||
1057 | return ret; |
||
1058 | } |
||
1059 | |||
1060 | /* |
||
1061 | - regbranch - one alternative of an | operator |
||
1062 | * |
||
1063 | * Implements the concatenation operator. |
||
1064 | */ |
||
1065 | internal int regbranch() |
||
1066 | { |
||
1067 | int netFlags = WORST; /* Tentatively. */ |
||
1068 | |||
1069 | int ret = regnode( sunlabs.brazil.util.regexp.Regexp.BRANCH ); |
||
1070 | int chain = -1; |
||
1071 | while ( ( off < parse.Length ) && ( parse[off] != '|' ) && ( parse[off] != ')' ) ) |
||
1072 | { |
||
1073 | int latest = regpiece(); |
||
1074 | netFlags |= flagp & HASWIDTH; |
||
1075 | if ( chain < 0 ) |
||
1076 | { |
||
1077 | /* First piece. */ |
||
1078 | netFlags |= ( flagp & SPSTART ); |
||
1079 | } |
||
1080 | else |
||
1081 | { |
||
1082 | regtail( chain, latest ); |
||
1083 | } |
||
1084 | chain = latest; |
||
1085 | } |
||
1086 | if ( chain < 0 ) |
||
1087 | { |
||
1088 | /* Loop ran zero times. */ |
||
1089 | regnode( sunlabs.brazil.util.regexp.Regexp.NOTHING ); |
||
1090 | } |
||
1091 | |||
1092 | flagp = netFlags; |
||
1093 | return ret; |
||
1094 | } |
||
1095 | |||
1096 | /* |
||
1097 | - regpiece - something followed by possible [*+?] |
||
1098 | * |
||
1099 | * Note that the branching code sequences used for ? and the general cases |
||
1100 | * of * and + are somewhat optimized: they use the same NOTHING node as |
||
1101 | * both the endmarker for their branch list and the body of the last branch. |
||
1102 | * It might seem that this node could be dispensed with entirely, but the |
||
1103 | * endmarker role is not redundant. |
||
1104 | */ |
||
1105 | internal int regpiece() |
||
1106 | { |
||
1107 | int netFlags; |
||
1108 | |||
1109 | int ret = regatom(); |
||
1110 | |||
1111 | if ( ( off >= parse.Length ) || ( isMult( parse[off] ) == false ) ) |
||
1112 | { |
||
1113 | return ret; |
||
1114 | } |
||
1115 | char op = parse[off]; |
||
1116 | |||
1117 | if ( ( ( flagp & HASWIDTH ) == 0 ) && ( op != '?' ) ) |
||
1118 | { |
||
1119 | throw new System.ArgumentException( "*+ operand could be empty" ); |
||
1120 | } |
||
1121 | netFlags = ( op != '+' ) ? ( WORST | SPSTART ) : ( WORST | HASWIDTH ); |
||
1122 | |||
1123 | if ( ( op == '*' ) && ( ( flagp & SIMPLE ) != 0 ) ) |
||
1124 | { |
||
1125 | reginsert( sunlabs.brazil.util.regexp.Regexp.STAR, ret ); |
||
1126 | } |
||
1127 | else if ( op == '*' ) |
||
1128 | { |
||
1129 | /* Emit x* as (x&|), where & means "self". */ |
||
1130 | reginsert( sunlabs.brazil.util.regexp.Regexp.BRANCH, ret ); /* Either x */ |
||
1131 | regoptail( ret, regnode( sunlabs.brazil.util.regexp.Regexp.BACK ) ); /* and loop */ |
||
1132 | regoptail( ret, ret ); /* back */ |
||
1133 | regtail( ret, regnode( sunlabs.brazil.util.regexp.Regexp.BRANCH ) ); /* or */ |
||
1134 | regtail( ret, regnode( sunlabs.brazil.util.regexp.Regexp.NOTHING ) ); /* null. */ |
||
1135 | } |
||
1136 | else if ( ( op == '+' ) && ( ( flagp & SIMPLE ) != 0 ) ) |
||
1137 | { |
||
1138 | reginsert( sunlabs.brazil.util.regexp.Regexp.PLUS, ret ); |
||
1139 | } |
||
1140 | else if ( op == '+' ) |
||
1141 | { |
||
1142 | /* Emit x+ as x(&|), where & means "self". */ |
||
1143 | int next = regnode( sunlabs.brazil.util.regexp.Regexp.BRANCH ); /* Either */ |
||
1144 | regtail( ret, next ); |
||
1145 | regtail( regnode( sunlabs.brazil.util.regexp.Regexp.BACK ), ret ); /* loop back */ |
||
1146 | regtail( next, regnode( sunlabs.brazil.util.regexp.Regexp.BRANCH ) ); /* or */ |
||
1147 | regtail( ret, regnode( sunlabs.brazil.util.regexp.Regexp.NOTHING ) ); /* null. */ |
||
1148 | } |
||
1149 | else if ( op == '?' ) |
||
1150 | { |
||
1151 | /* Emit x? as (x|) */ |
||
1152 | reginsert( sunlabs.brazil.util.regexp.Regexp.BRANCH, ret ); /* Either x */ |
||
1153 | regtail( ret, regnode( sunlabs.brazil.util.regexp.Regexp.BRANCH ) ); /* or */ |
||
1154 | int next = regnode( sunlabs.brazil.util.regexp.Regexp.NOTHING ); /* null. */ |
||
1155 | regtail( ret, next ); |
||
1156 | regoptail( ret, next ); |
||
1157 | } |
||
1158 | off++; |
||
1159 | if ( ( off < parse.Length ) && isMult( parse[off] ) ) |
||
1160 | { |
||
1161 | throw new System.ArgumentException( "nested *?+" ); |
||
1162 | } |
||
1163 | |||
1164 | flagp = netFlags; |
||
1165 | return ret; |
||
1166 | } |
||
1167 | |||
1168 | /* |
||
1169 | - regatom - the lowest level |
||
1170 | * |
||
1171 | * Optimization: gobbles an entire sequence of ordinary characters so that |
||
1172 | * it can turn them into a single node, which is smaller to store and |
||
1173 | * faster to run. Backslashed characters are exceptions, each becoming a |
||
1174 | * separate node; the code is simpler that way and it's not worth fixing. |
||
1175 | */ |
||
1176 | internal int regatom() |
||
1177 | { |
||
1178 | int netFlags = WORST; /* Tentatively. */ |
||
1179 | int ret; |
||
1180 | |||
1181 | switch ( parse[off++] ) |
||
1182 | { |
||
1183 | |||
1184 | case '^': |
||
1185 | ret = regnode( sunlabs.brazil.util.regexp.Regexp.BOL ); |
||
1186 | break; |
||
1187 | |||
1188 | case '$': |
||
1189 | ret = regnode( sunlabs.brazil.util.regexp.Regexp.EOL ); |
||
1190 | break; |
||
1191 | |||
1192 | case '.': |
||
1193 | ret = regnode( sunlabs.brazil.util.regexp.Regexp.ANY ); |
||
1194 | netFlags |= ( HASWIDTH | SIMPLE ); |
||
1195 | break; |
||
1196 | |||
1197 | case '[': |
||
1198 | { |
||
1199 | try |
||
1200 | { |
||
1201 | if ( parse[off] == '^' ) |
||
1202 | { |
||
1203 | ret = regnode( sunlabs.brazil.util.regexp.Regexp.ANYBUT ); |
||
1204 | off++; |
||
1205 | } |
||
1206 | else |
||
1207 | { |
||
1208 | ret = regnode( sunlabs.brazil.util.regexp.Regexp.ANYOF ); |
||
1209 | } |
||
1210 | |||
1211 | int pos = reglen(); |
||
1212 | regc( '\x0000' ); |
||
1213 | |||
1214 | if ( ( parse[off] == ']' ) || ( parse[off] == '-' ) ) |
||
1215 | { |
||
1216 | regc( parse[off++] ); |
||
1217 | } |
||
1218 | while ( parse[off] != ']' ) |
||
1219 | { |
||
1220 | if ( parse[off] == '-' ) |
||
1221 | { |
||
1222 | off++; |
||
1223 | if ( parse[off] == ']' ) |
||
1224 | { |
||
1225 | regc( '-' ); |
||
1226 | } |
||
1227 | else |
||
1228 | { |
||
1229 | int start = parse[off - 2]; |
||
1230 | int end = parse[off++]; |
||
1231 | if ( start > end ) |
||
1232 | { |
||
1233 | throw new System.ArgumentException( "invalid [] range" ); |
||
1234 | } |
||
1235 | for ( int i = start + 1; i <= end; i++ ) |
||
1236 | { |
||
1237 | regc( (char)i ); |
||
1238 | } |
||
1239 | } |
||
1240 | } |
||
1241 | else |
||
1242 | { |
||
1243 | regc( parse[off++] ); |
||
1244 | } |
||
1245 | } |
||
1246 | regset( pos, (char)( reglen() - pos - 1 ) ); |
||
1247 | off++; |
||
1248 | netFlags |= HASWIDTH | SIMPLE; |
||
1249 | } |
||
1250 | catch ( System.IndexOutOfRangeException e ) |
||
1251 | { |
||
1252 | throw new System.ArgumentException( "missing ]" ); |
||
1253 | } |
||
1254 | break; |
||
1255 | } |
||
1256 | |||
1257 | case '(': |
||
1258 | ret = reg( true ); |
||
1259 | netFlags |= ( flagp & ( HASWIDTH | SPSTART ) ); |
||
1260 | break; |
||
1261 | |||
1262 | case '|': |
||
1263 | case ')': |
||
1264 | throw new System.ArgumentException( "internal urp" ); |
||
1265 | |||
1266 | case '?': |
||
1267 | case '+': |
||
1268 | case '*': |
||
1269 | throw new System.ArgumentException( "?+* follows nothing" ); |
||
1270 | |||
1271 | case '\\': |
||
1272 | if ( off >= parse.Length ) |
||
1273 | { |
||
1274 | throw new System.ArgumentException( "trailing \\" ); |
||
1275 | } |
||
1276 | ret = regnode( sunlabs.brazil.util.regexp.Regexp.EXACTLY ); |
||
1277 | regc( (char)1 ); |
||
1278 | regc( parse[off++] ); |
||
1279 | netFlags |= HASWIDTH | SIMPLE; |
||
1280 | break; |
||
1281 | |||
1282 | default: |
||
1283 | { |
||
1284 | off--; |
||
1285 | int end; |
||
1286 | for ( end = off; end < parse.Length; end++ ) |
||
1287 | { |
||
1288 | if ( META.IndexOf( (System.Char)parse[end] ) >= 0 ) |
||
1289 | { |
||
1290 | break; |
||
1291 | } |
||
1292 | } |
||
1293 | if ( ( end > off + 1 ) && ( end < parse.Length ) && isMult( parse[end] ) ) |
||
1294 | { |
||
1295 | end--; /* Back off clear of ?+* operand. */ |
||
1296 | } |
||
1297 | netFlags |= HASWIDTH; |
||
1298 | if ( end == off + 1 ) |
||
1299 | { |
||
1300 | netFlags |= SIMPLE; |
||
1301 | } |
||
1302 | ret = regnode( sunlabs.brazil.util.regexp.Regexp.EXACTLY ); |
||
1303 | regc( (char)( end - off ) ); |
||
1304 | for ( ; off < end; off++ ) |
||
1305 | { |
||
1306 | regc( parse[off] ); |
||
1307 | } |
||
1308 | } |
||
1309 | break; |
||
1310 | |||
1311 | } |
||
1312 | |||
1313 | flagp = netFlags; |
||
1314 | return ret; |
||
1315 | } |
||
1316 | |||
1317 | /* |
||
1318 | - regnode - emit a node |
||
1319 | */ |
||
1320 | internal int regnode( char op ) |
||
1321 | { |
||
1322 | int ret = code.Length; |
||
1323 | code.Append( op ); |
||
1324 | code.Append( '\x0000' ); |
||
1325 | |||
1326 | return ret; |
||
1327 | } |
||
1328 | |||
1329 | /* |
||
1330 | - regc - emit (if appropriate) a byte of code |
||
1331 | */ |
||
1332 | internal void regc( char b ) |
||
1333 | { |
||
1334 | code.Append( b ); |
||
1335 | } |
||
1336 | |||
1337 | internal int reglen() |
||
1338 | { |
||
1339 | return code.Length; |
||
1340 | } |
||
1341 | |||
1342 | internal void regset( int pos, char ch ) |
||
1343 | { |
||
1344 | code[pos] = ch; |
||
1345 | } |
||
1346 | |||
1347 | |||
1348 | /* |
||
1349 | - reginsert - insert an operator in front of already-emitted operand |
||
1350 | * |
||
1351 | * Means relocating the operand. |
||
1352 | */ |
||
1353 | internal void reginsert( char op, int pos ) |
||
1354 | { |
||
1355 | char[] tmp = new char[] { op, '\x0000' }; |
||
1356 | code.Insert( pos, tmp ); |
||
1357 | } |
||
1358 | |||
1359 | /* |
||
1360 | - regtail - set the next-pointer at the end of a node chain |
||
1361 | */ |
||
1362 | internal void regtail( int pos, int val ) |
||
1363 | { |
||
1364 | /* Find last node. */ |
||
1365 | |||
1366 | int scan = pos; |
||
1367 | while ( true ) |
||
1368 | { |
||
1369 | int tmp = regnext( scan ); |
||
1370 | if ( tmp < 0 ) |
||
1371 | { |
||
1372 | break; |
||
1373 | } |
||
1374 | scan = tmp; |
||
1375 | } |
||
1376 | |||
1377 | int offset = ( code[scan] == sunlabs.brazil.util.regexp.Regexp.BACK ) ? scan - val : val - scan; |
||
1378 | code[scan + 1] = (char)offset; |
||
1379 | } |
||
1380 | |||
1381 | /* |
||
1382 | - regoptail - regtail on operand of first argument; nop if operandless |
||
1383 | */ |
||
1384 | internal void regoptail( int pos, int val ) |
||
1385 | { |
||
1386 | if ( ( pos < 0 ) || ( code[pos] != sunlabs.brazil.util.regexp.Regexp.BRANCH ) ) |
||
1387 | { |
||
1388 | return; |
||
1389 | } |
||
1390 | regtail( pos + 2, val ); |
||
1391 | } |
||
1392 | |||
1393 | |||
1394 | /* |
||
1395 | - regnext - dig the "next" pointer out of a node |
||
1396 | */ |
||
1397 | internal int regnext( int pos ) |
||
1398 | { |
||
1399 | int offset = code[pos + 1]; |
||
1400 | if ( offset == 0 ) |
||
1401 | { |
||
1402 | return -1; |
||
1403 | } |
||
1404 | if ( code[pos] == sunlabs.brazil.util.regexp.Regexp.BACK ) |
||
1405 | { |
||
1406 | return pos - offset; |
||
1407 | } |
||
1408 | else |
||
1409 | { |
||
1410 | return pos + offset; |
||
1411 | } |
||
1412 | } |
||
1413 | |||
1414 | internal static bool isMult( char ch ) |
||
1415 | { |
||
1416 | return ( ch == '*' ) || ( ch == '+' ) || ( ch == '?' ); |
||
1417 | } |
||
1418 | } |
||
1419 | |||
1420 | internal class Match |
||
1421 | { |
||
1422 | internal char[] program; |
||
1423 | |||
1424 | internal string str; |
||
1425 | internal int bol; |
||
1426 | internal int input; |
||
1427 | internal int length; |
||
1428 | |||
1429 | internal int[] indices; |
||
1430 | |||
1431 | internal bool regtry( int off ) |
||
1432 | { |
||
1433 | this.input = off; |
||
1434 | |||
1435 | for ( int i = 0; i < indices.Length; i++ ) |
||
1436 | { |
||
1437 | indices[i] = -1; |
||
1438 | } |
||
1439 | |||
1440 | if ( regmatch( 0 ) ) |
||
1441 | { |
||
1442 | indices[0] = off; |
||
1443 | indices[1] = input; |
||
1444 | return true; |
||
1445 | } |
||
1446 | else |
||
1447 | { |
||
1448 | return false; |
||
1449 | } |
||
1450 | } |
||
1451 | |||
1452 | /* |
||
1453 | - regmatch - main matching routine |
||
1454 | * |
||
1455 | * Conceptually the strategy is simple: check to see whether the current |
||
1456 | * node matches, call self recursively to see whether the rest matches, |
||
1457 | * and then act accordingly. In practice we make some effort to avoid |
||
1458 | * recursion, in particular by going through "ordinary" nodes (that don't |
||
1459 | * need to know whether the rest of the match failed) by a loop instead of |
||
1460 | * by recursion. |
||
1461 | */ |
||
1462 | internal bool regmatch( int scan ) |
||
1463 | { |
||
1464 | while ( true ) |
||
1465 | { |
||
1466 | int next = regnext( scan ); |
||
1467 | int op = program[scan]; |
||
1468 | switch ( op ) |
||
1469 | { |
||
1470 | |||
1471 | case sunlabs.brazil.util.regexp.Regexp.BOL: |
||
1472 | if ( input != bol ) |
||
1473 | { |
||
1474 | return false; |
||
1475 | } |
||
1476 | break; |
||
1477 | |||
1478 | |||
1479 | case sunlabs.brazil.util.regexp.Regexp.EOL: |
||
1480 | if ( input != length ) |
||
1481 | { |
||
1482 | return false; |
||
1483 | } |
||
1484 | break; |
||
1485 | |||
1486 | |||
1487 | case sunlabs.brazil.util.regexp.Regexp.ANY: |
||
1488 | if ( input >= length ) |
||
1489 | { |
||
1490 | return false; |
||
1491 | } |
||
1492 | input++; |
||
1493 | break; |
||
1494 | |||
1495 | |||
1496 | case sunlabs.brazil.util.regexp.Regexp.EXACTLY: |
||
1497 | { |
||
1498 | if ( compare( scan ) == false ) |
||
1499 | { |
||
1500 | return false; |
||
1501 | } |
||
1502 | break; |
||
1503 | } |
||
1504 | |||
1505 | |||
1506 | case sunlabs.brazil.util.regexp.Regexp.ANYOF: |
||
1507 | if ( input >= length ) |
||
1508 | { |
||
1509 | return false; |
||
1510 | } |
||
1511 | if ( present( scan ) == false ) |
||
1512 | { |
||
1513 | return false; |
||
1514 | } |
||
1515 | input++; |
||
1516 | break; |
||
1517 | |||
1518 | |||
1519 | case sunlabs.brazil.util.regexp.Regexp.ANYBUT: |
||
1520 | if ( input >= length ) |
||
1521 | { |
||
1522 | return false; |
||
1523 | } |
||
1524 | if ( present( scan ) ) |
||
1525 | { |
||
1526 | return false; |
||
1527 | } |
||
1528 | input++; |
||
1529 | break; |
||
1530 | |||
1531 | |||
1532 | case sunlabs.brazil.util.regexp.Regexp.NOTHING: |
||
1533 | case sunlabs.brazil.util.regexp.Regexp.BACK: |
||
1534 | break; |
||
1535 | |||
1536 | |||
1537 | case sunlabs.brazil.util.regexp.Regexp.BRANCH: |
||
1538 | { |
||
1539 | if ( program[next] != sunlabs.brazil.util.regexp.Regexp.BRANCH ) |
||
1540 | { |
||
1541 | next = scan + 2; |
||
1542 | } |
||
1543 | else |
||
1544 | { |
||
1545 | do |
||
1546 | { |
||
1547 | int save = input; |
||
1548 | if ( regmatch( scan + 2 ) ) |
||
1549 | { |
||
1550 | return true; |
||
1551 | } |
||
1552 | input = save; |
||
1553 | scan = regnext( scan ); |
||
1554 | } |
||
1555 | while ( ( scan >= 0 ) && ( program[scan] == sunlabs.brazil.util.regexp.Regexp.BRANCH ) ); |
||
1556 | return false; |
||
1557 | } |
||
1558 | break; |
||
1559 | } |
||
1560 | |||
1561 | |||
1562 | case sunlabs.brazil.util.regexp.Regexp.STAR: |
||
1563 | case sunlabs.brazil.util.regexp.Regexp.PLUS: |
||
1564 | { |
||
1565 | /* |
||
1566 | * Lookahead to avoid useless match attempts |
||
1567 | * when we know what character comes next. |
||
1568 | */ |
||
1569 | |||
1570 | int ch = -1; |
||
1571 | if ( program[next] == sunlabs.brazil.util.regexp.Regexp.EXACTLY ) |
||
1572 | { |
||
1573 | ch = program[next + 3]; |
||
1574 | } |
||
1575 | |||
1576 | int min = ( op == sunlabs.brazil.util.regexp.Regexp.STAR ) ? 0 : 1; |
||
1577 | int save = input; |
||
1578 | int no = regrepeat( scan + 2 ); |
||
1579 | |||
1580 | while ( no >= min ) |
||
1581 | { |
||
1582 | /* If it could work, try it. */ |
||
1583 | if ( ( ch < 0 ) || ( ( input < length ) && ( str[input] == ch ) ) ) |
||
1584 | { |
||
1585 | if ( regmatch( next ) ) |
||
1586 | { |
||
1587 | return true; |
||
1588 | } |
||
1589 | } |
||
1590 | /* Couldn't or didn't -- back up. */ |
||
1591 | no--; |
||
1592 | input = save + no; |
||
1593 | } |
||
1594 | return false; |
||
1595 | } |
||
1596 | |||
1597 | |||
1598 | case sunlabs.brazil.util.regexp.Regexp.END: |
||
1599 | return true; |
||
1600 | |||
1601 | |||
1602 | default: |
||
1603 | if ( op >= sunlabs.brazil.util.regexp.Regexp.CLOSE ) |
||
1604 | { |
||
1605 | int no = op - sunlabs.brazil.util.regexp.Regexp.CLOSE; |
||
1606 | int save = input; |
||
1607 | |||
1608 | if ( regmatch( next ) ) |
||
1609 | { |
||
1610 | /* |
||
1611 | * Don't set endp if some later |
||
1612 | * invocation of the same parentheses |
||
1613 | * already has. |
||
1614 | */ |
||
1615 | if ( indices[no * 2 + 1] <= 0 ) |
||
1616 | { |
||
1617 | indices[no * 2 + 1] = save; |
||
1618 | } |
||
1619 | return true; |
||
1620 | } |
||
1621 | } |
||
1622 | else if ( op >= sunlabs.brazil.util.regexp.Regexp.OPEN ) |
||
1623 | { |
||
1624 | int no = op - sunlabs.brazil.util.regexp.Regexp.OPEN; |
||
1625 | int save = input; |
||
1626 | |||
1627 | if ( regmatch( next ) ) |
||
1628 | { |
||
1629 | /* |
||
1630 | * Don't set startp if some later invocation of the |
||
1631 | * same parentheses already has. |
||
1632 | */ |
||
1633 | if ( indices[no * 2] <= 0 ) |
||
1634 | { |
||
1635 | indices[no * 2] = save; |
||
1636 | } |
||
1637 | return true; |
||
1638 | } |
||
1639 | } |
||
1640 | return false; |
||
1641 | |||
1642 | } |
||
1643 | scan = next; |
||
1644 | } |
||
1645 | } |
||
1646 | |||
1647 | internal bool compare( int scan ) |
||
1648 | { |
||
1649 | int count = program[scan + 2]; |
||
1650 | if ( input + count > length ) |
||
1651 | { |
||
1652 | return false; |
||
1653 | } |
||
1654 | int start = scan + 3; |
||
1655 | int end = start + count; |
||
1656 | for ( int i = start; i < end; i++ ) |
||
1657 | { |
||
1658 | if ( str[input++] != program[i] ) |
||
1659 | { |
||
1660 | return false; |
||
1661 | } |
||
1662 | } |
||
1663 | return true; |
||
1664 | } |
||
1665 | |||
1666 | internal bool present( int scan ) |
||
1667 | { |
||
1668 | char ch = str[input]; |
||
1669 | |||
1670 | int count = program[scan + 2]; |
||
1671 | int start = scan + 3; |
||
1672 | int end = start + count; |
||
1673 | |||
1674 | for ( int i = start; i < end; i++ ) |
||
1675 | { |
||
1676 | if ( program[i] == ch ) |
||
1677 | { |
||
1678 | return true; |
||
1679 | } |
||
1680 | } |
||
1681 | return false; |
||
1682 | } |
||
1683 | |||
1684 | |||
1685 | /* |
||
1686 | - regrepeat - repeatedly match something simple, report how many |
||
1687 | */ |
||
1688 | internal int regrepeat( int scan ) |
||
1689 | { |
||
1690 | int op = program[scan]; |
||
1691 | int count = 0; |
||
1692 | |||
1693 | switch ( op ) |
||
1694 | { |
||
1695 | |||
1696 | case sunlabs.brazil.util.regexp.Regexp.ANY: |
||
1697 | |||
1698 | count = length - input; |
||
1699 | input = length; |
||
1700 | break; |
||
1701 | |||
1702 | |||
1703 | case sunlabs.brazil.util.regexp.Regexp.EXACTLY: |
||
1704 | { |
||
1705 | // 'g*' matches all the following 'g' characters. |
||
1706 | |||
1707 | char ch = program[scan + 3]; |
||
1708 | while ( ( input < length ) && ( str[input] == ch ) ) |
||
1709 | { |
||
1710 | input++; |
||
1711 | count++; |
||
1712 | } |
||
1713 | break; |
||
1714 | } |
||
1715 | |||
1716 | |||
1717 | case sunlabs.brazil.util.regexp.Regexp.ANYOF: |
||
1718 | |||
1719 | while ( ( input < length ) && present( scan ) ) |
||
1720 | { |
||
1721 | input++; |
||
1722 | count++; |
||
1723 | } |
||
1724 | break; |
||
1725 | |||
1726 | |||
1727 | |||
1728 | case sunlabs.brazil.util.regexp.Regexp.ANYBUT: |
||
1729 | while ( ( input < length ) && !present( scan ) ) |
||
1730 | { |
||
1731 | input++; |
||
1732 | count++; |
||
1733 | } |
||
1734 | break; |
||
1735 | } |
||
1736 | return count; |
||
1737 | } |
||
1738 | |||
1739 | /* |
||
1740 | - regnext - dig the "next" pointer out of a node |
||
1741 | */ |
||
1742 | internal int regnext( int scan ) |
||
1743 | { |
||
1744 | int offset = program[scan + 1]; |
||
1745 | if ( program[scan] == sunlabs.brazil.util.regexp.Regexp.BACK ) |
||
1746 | { |
||
1747 | return scan - offset; |
||
1748 | } |
||
1749 | else |
||
1750 | { |
||
1751 | return scan + offset; |
||
1752 | } |
||
1753 | } |
||
1754 | } |
||
1755 | } |
||
1756 | } |