1 | /* |
---|
2 | * regcomp and regexec -- regsub and regerror are elsewhere |
---|
3 | * |
---|
4 | * Copyright (c) 1986 by University of Toronto. |
---|
5 | * Written by Henry Spencer. Not derived from licensed software. |
---|
6 | * |
---|
7 | * Permission is granted to anyone to use this software for any |
---|
8 | * purpose on any computer system, and to redistribute it freely, |
---|
9 | * subject to the following restrictions: |
---|
10 | * |
---|
11 | * 1. The author is not responsible for the consequences of use of |
---|
12 | * this software, no matter how awful, even if they arise |
---|
13 | * from defects in it. |
---|
14 | * |
---|
15 | * 2. The origin of this software must not be misrepresented, either |
---|
16 | * by explicit claim or by omission. |
---|
17 | * |
---|
18 | * 3. Altered versions must be plainly marked as such, and must not |
---|
19 | * be misrepresented as being the original software. |
---|
20 | *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, |
---|
21 | *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to | |
---|
22 | *** to assist in implementing egrep. |
---|
23 | *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, |
---|
24 | *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching |
---|
25 | *** as in BSD grep and ex. |
---|
26 | *** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, |
---|
27 | *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \. |
---|
28 | *** THIS IS AN ALTERED VERSION. It was altered by James A. Woods, |
---|
29 | *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy. |
---|
30 | *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald |
---|
31 | *** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h |
---|
32 | *** was moved into regexp.h, and the include of regexp.h now uses "'s |
---|
33 | *** to avoid conflicting with the system regexp.h. Const, bless its |
---|
34 | *** soul, was removed so it can compile everywhere. The declaration |
---|
35 | *** of strchr() was in conflict on AIX, so it was removed (as it is |
---|
36 | *** happily defined in string.h). |
---|
37 | *** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald |
---|
38 | *** seiwald@perforce.com, on 20 January 2000, to use function prototypes. |
---|
39 | * |
---|
40 | * Beware that some of this code is subtly aware of the way operator |
---|
41 | * precedence is structured in regular expressions. Serious changes in |
---|
42 | * regular-expression syntax might require a total rethink. |
---|
43 | */ |
---|
44 | #include "regexp.h" |
---|
45 | #include "debug.h" |
---|
46 | #include <stdio.h> |
---|
47 | #include <ctype.h> |
---|
48 | #ifndef ultrix |
---|
49 | #include <stdlib.h> |
---|
50 | #endif |
---|
51 | #include <string.h> |
---|
52 | |
---|
53 | /* |
---|
54 | * The "internal use only" fields in regexp.h are present to pass info from |
---|
55 | * compile to execute that permits the execute phase to run lots faster on |
---|
56 | * simple cases. They are: |
---|
57 | * |
---|
58 | * regstart char that must begin a match; '\0' if none obvious |
---|
59 | * reganch is the match anchored (at beginning-of-line only)? |
---|
60 | * regmust string (pointer into program) that match must include, or NULL |
---|
61 | * regmlen length of regmust string |
---|
62 | * |
---|
63 | * Regstart and reganch permit very fast decisions on suitable starting points |
---|
64 | * for a match, cutting down the work a lot. Regmust permits fast rejection |
---|
65 | * of lines that cannot possibly match. The regmust tests are costly enough |
---|
66 | * that regcomp() supplies a regmust only if the r.e. contains something |
---|
67 | * potentially expensive (at present, the only such thing detected is * or + |
---|
68 | * at the start of the r.e., which can involve a lot of backup). Regmlen is |
---|
69 | * supplied because the test in regexec() needs it and regcomp() is computing |
---|
70 | * it anyway. |
---|
71 | */ |
---|
72 | |
---|
73 | /* |
---|
74 | * Structure for regexp "program". This is essentially a linear encoding |
---|
75 | * of a nondeterministic finite-state machine (aka syntax charts or |
---|
76 | * "railroad normal form" in parsing technology). Each node is an opcode |
---|
77 | * plus a "next" pointer, possibly plus an operand. "Next" pointers of |
---|
78 | * all nodes except BRANCH implement concatenation; a "next" pointer with |
---|
79 | * a BRANCH on both ends of it is connecting two alternatives. (Here we |
---|
80 | * have one of the subtle syntax dependencies: an individual BRANCH (as |
---|
81 | * opposed to a collection of them) is never concatenated with anything |
---|
82 | * because of operator precedence.) The operand of some types of node is |
---|
83 | * a literal string; for others, it is a node leading into a sub-FSM. In |
---|
84 | * particular, the operand of a BRANCH node is the first node of the branch. |
---|
85 | * (NB this is *not* a tree structure: the tail of the branch connects |
---|
86 | * to the thing following the set of BRANCHes.) The opcodes are: |
---|
87 | */ |
---|
88 | |
---|
89 | /* definition number opnd? meaning */ |
---|
90 | #define END 0 /* no End of program. */ |
---|
91 | #define BOL 1 /* no Match "" at beginning of line. */ |
---|
92 | #define EOL 2 /* no Match "" at end of line. */ |
---|
93 | #define ANY 3 /* no Match any one character. */ |
---|
94 | #define ANYOF 4 /* str Match any character in this string. */ |
---|
95 | #define ANYBUT 5 /* str Match any character not in this string. */ |
---|
96 | #define BRANCH 6 /* node Match this alternative, or the next... */ |
---|
97 | #define BACK 7 /* no Match "", "next" ptr points backward. */ |
---|
98 | #define EXACTLY 8 /* str Match this string. */ |
---|
99 | #define NOTHING 9 /* no Match empty string. */ |
---|
100 | #define STAR 10 /* node Match this (simple) thing 0 or more times. */ |
---|
101 | #define PLUS 11 /* node Match this (simple) thing 1 or more times. */ |
---|
102 | #define WORDA 12 /* no Match "" at wordchar, where prev is nonword */ |
---|
103 | #define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */ |
---|
104 | #define OPEN 20 /* no Mark this point in input as start of #n. */ |
---|
105 | /* OPEN+1 is number 1, etc. */ |
---|
106 | #define CLOSE 30 /* no Analogous to OPEN. */ |
---|
107 | |
---|
108 | /* |
---|
109 | * Opcode notes: |
---|
110 | * |
---|
111 | * BRANCH The set of branches constituting a single choice are hooked |
---|
112 | * together with their "next" pointers, since precedence prevents |
---|
113 | * anything being concatenated to any individual branch. The |
---|
114 | * "next" pointer of the last BRANCH in a choice points to the |
---|
115 | * thing following the whole choice. This is also where the |
---|
116 | * final "next" pointer of each individual branch points; each |
---|
117 | * branch starts with the operand node of a BRANCH node. |
---|
118 | * |
---|
119 | * BACK Normal "next" pointers all implicitly point forward; BACK |
---|
120 | * exists to make loop structures possible. |
---|
121 | * |
---|
122 | * STAR,PLUS '?', and complex '*' and '+', are implemented as circular |
---|
123 | * BRANCH structures using BACK. Simple cases (one character |
---|
124 | * per match) are implemented with STAR and PLUS for speed |
---|
125 | * and to minimize recursive plunges. |
---|
126 | * |
---|
127 | * OPEN,CLOSE ...are numbered at compile time. |
---|
128 | */ |
---|
129 | |
---|
130 | /* |
---|
131 | * A node is one char of opcode followed by two chars of "next" pointer. |
---|
132 | * "Next" pointers are stored as two 8-bit pieces, high order first. The |
---|
133 | * value is a positive offset from the opcode of the node containing it. |
---|
134 | * An operand, if any, simply follows the node. (Note that much of the |
---|
135 | * code generation knows about this implicit relationship.) |
---|
136 | * |
---|
137 | * Using two bytes for the "next" pointer is vast overkill for most things, |
---|
138 | * but allows patterns to get big without disasters. |
---|
139 | */ |
---|
140 | #define OP(p) (*(p)) |
---|
141 | #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) |
---|
142 | #define OPERAND(p) ((p) + 3) |
---|
143 | |
---|
144 | /* |
---|
145 | * See regmagic.h for one further detail of program structure. |
---|
146 | */ |
---|
147 | |
---|
148 | |
---|
149 | /* |
---|
150 | * Utility definitions. |
---|
151 | */ |
---|
152 | #ifndef CHARBITS |
---|
153 | #define UCHARAT(p) ((int)*(unsigned char *)(p)) |
---|
154 | #else |
---|
155 | #define UCHARAT(p) ((int)*(p)&CHARBITS) |
---|
156 | #endif |
---|
157 | |
---|
158 | #define FAIL(m) { regerror(m); return(NULL); } |
---|
159 | #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') |
---|
160 | |
---|
161 | /* |
---|
162 | * Flags to be passed up and down. |
---|
163 | */ |
---|
164 | #define HASWIDTH 01 /* Known never to match null string. */ |
---|
165 | #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ |
---|
166 | #define SPSTART 04 /* Starts with * or +. */ |
---|
167 | #define WORST 0 /* Worst case. */ |
---|
168 | |
---|
169 | /* |
---|
170 | * Global work variables for regcomp(). |
---|
171 | */ |
---|
172 | static char *regparse; /* Input-scan pointer. */ |
---|
173 | static int regnpar; /* () count. */ |
---|
174 | static char regdummy; |
---|
175 | static char *regcode; /* Code-emit pointer; ®dummy = don't. */ |
---|
176 | static long regsize; /* Code size. */ |
---|
177 | |
---|
178 | /* |
---|
179 | * Forward declarations for regcomp()'s friends. |
---|
180 | */ |
---|
181 | #ifndef STATIC |
---|
182 | #define STATIC static |
---|
183 | #endif |
---|
184 | STATIC char *reg( int paren, int *flagp ); |
---|
185 | STATIC char *regbranch( int *flagp ); |
---|
186 | STATIC char *regpiece( int *flagp ); |
---|
187 | STATIC char *regatom( int *flagp ); |
---|
188 | STATIC char *regnode( int op ); |
---|
189 | STATIC char *regnext( register char *p ); |
---|
190 | STATIC void regc( int b ); |
---|
191 | STATIC void reginsert( char op, char *opnd ); |
---|
192 | STATIC void regtail( char *p, char *val ); |
---|
193 | STATIC void regoptail( char *p, char *val ); |
---|
194 | #ifdef STRCSPN |
---|
195 | STATIC int strcspn(); |
---|
196 | #endif |
---|
197 | |
---|
198 | /* |
---|
199 | - regcomp - compile a regular expression into internal code |
---|
200 | * |
---|
201 | * We can't allocate space until we know how big the compiled form will be, |
---|
202 | * but we can't compile it (and thus know how big it is) until we've got a |
---|
203 | * place to put the code. So we cheat: we compile it twice, once with code |
---|
204 | * generation turned off and size counting turned on, and once "for real". |
---|
205 | * This also means that we don't allocate space until we are sure that the |
---|
206 | * thing really will compile successfully, and we never have to move the |
---|
207 | * code and thus invalidate pointers into it. (Note that it has to be in |
---|
208 | * one piece because free() must be able to free it all.) |
---|
209 | * |
---|
210 | * Beware that the optimization-preparation code in here knows about some |
---|
211 | * of the structure of the compiled regexp. |
---|
212 | */ |
---|
213 | regexp * |
---|
214 | regcomp( char *exp ) |
---|
215 | { |
---|
216 | register regexp *r; |
---|
217 | register char *scan; |
---|
218 | register char *longest; |
---|
219 | register unsigned len; |
---|
220 | int flags; |
---|
221 | |
---|
222 | if (exp == NULL) |
---|
223 | FAIL("NULL argument"); |
---|
224 | |
---|
225 | /* First pass: determine size, legality. */ |
---|
226 | #ifdef notdef |
---|
227 | if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */ |
---|
228 | #endif |
---|
229 | regparse = (char *)exp; |
---|
230 | regnpar = 1; |
---|
231 | regsize = 0L; |
---|
232 | regcode = ®dummy; |
---|
233 | regc(MAGIC); |
---|
234 | if (reg(0, &flags) == NULL) |
---|
235 | return(NULL); |
---|
236 | |
---|
237 | /* Small enough for pointer-storage convention? */ |
---|
238 | if (regsize >= 32767L) /* Probably could be 65535L. */ |
---|
239 | FAIL("regexp too big"); |
---|
240 | |
---|
241 | /* Allocate space. */ |
---|
242 | r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); |
---|
243 | if (r == NULL) |
---|
244 | FAIL("out of space"); |
---|
245 | if ( DEBUG_PROFILE ) |
---|
246 | profile_memory( sizeof(regexp) + (unsigned)regsize ); |
---|
247 | |
---|
248 | /* Second pass: emit code. */ |
---|
249 | regparse = (char *)exp; |
---|
250 | regnpar = 1; |
---|
251 | regcode = r->program; |
---|
252 | regc(MAGIC); |
---|
253 | if (reg(0, &flags) == NULL) |
---|
254 | return(NULL); |
---|
255 | |
---|
256 | /* Dig out information for optimizations. */ |
---|
257 | r->regstart = '\0'; /* Worst-case defaults. */ |
---|
258 | r->reganch = 0; |
---|
259 | r->regmust = NULL; |
---|
260 | r->regmlen = 0; |
---|
261 | scan = r->program+1; /* First BRANCH. */ |
---|
262 | if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ |
---|
263 | scan = OPERAND(scan); |
---|
264 | |
---|
265 | /* Starting-point info. */ |
---|
266 | if (OP(scan) == EXACTLY) |
---|
267 | r->regstart = *OPERAND(scan); |
---|
268 | else if (OP(scan) == BOL) |
---|
269 | r->reganch++; |
---|
270 | |
---|
271 | /* |
---|
272 | * If there's something expensive in the r.e., find the |
---|
273 | * longest literal string that must appear and make it the |
---|
274 | * regmust. Resolve ties in favor of later strings, since |
---|
275 | * the regstart check works with the beginning of the r.e. |
---|
276 | * and avoiding duplication strengthens checking. Not a |
---|
277 | * strong reason, but sufficient in the absence of others. |
---|
278 | */ |
---|
279 | if (flags&SPSTART) { |
---|
280 | longest = NULL; |
---|
281 | len = 0; |
---|
282 | for (; scan != NULL; scan = regnext(scan)) |
---|
283 | if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { |
---|
284 | longest = OPERAND(scan); |
---|
285 | len = strlen(OPERAND(scan)); |
---|
286 | } |
---|
287 | r->regmust = longest; |
---|
288 | r->regmlen = len; |
---|
289 | } |
---|
290 | } |
---|
291 | |
---|
292 | return(r); |
---|
293 | } |
---|
294 | |
---|
295 | /* |
---|
296 | - reg - regular expression, i.e. main body or parenthesized thing |
---|
297 | * |
---|
298 | * Caller must absorb opening parenthesis. |
---|
299 | * |
---|
300 | * Combining parenthesis handling with the base level of regular expression |
---|
301 | * is a trifle forced, but the need to tie the tails of the branches to what |
---|
302 | * follows makes it hard to avoid. |
---|
303 | */ |
---|
304 | static char * |
---|
305 | reg( |
---|
306 | int paren, /* Parenthesized? */ |
---|
307 | int *flagp ) |
---|
308 | { |
---|
309 | register char *ret; |
---|
310 | register char *br; |
---|
311 | register char *ender; |
---|
312 | register int parno; |
---|
313 | int flags; |
---|
314 | |
---|
315 | *flagp = HASWIDTH; /* Tentatively. */ |
---|
316 | |
---|
317 | /* Make an OPEN node, if parenthesized. */ |
---|
318 | if (paren) { |
---|
319 | if (regnpar >= NSUBEXP) |
---|
320 | FAIL("too many ()"); |
---|
321 | parno = regnpar; |
---|
322 | regnpar++; |
---|
323 | ret = regnode(OPEN+parno); |
---|
324 | } else |
---|
325 | ret = NULL; |
---|
326 | |
---|
327 | /* Pick up the branches, linking them together. */ |
---|
328 | br = regbranch(&flags); |
---|
329 | if (br == NULL) |
---|
330 | return(NULL); |
---|
331 | if (ret != NULL) |
---|
332 | regtail(ret, br); /* OPEN -> first. */ |
---|
333 | else |
---|
334 | ret = br; |
---|
335 | if (!(flags&HASWIDTH)) |
---|
336 | *flagp &= ~HASWIDTH; |
---|
337 | *flagp |= flags&SPSTART; |
---|
338 | while (*regparse == '|' || *regparse == '\n') { |
---|
339 | regparse++; |
---|
340 | br = regbranch(&flags); |
---|
341 | if (br == NULL) |
---|
342 | return(NULL); |
---|
343 | regtail(ret, br); /* BRANCH -> BRANCH. */ |
---|
344 | if (!(flags&HASWIDTH)) |
---|
345 | *flagp &= ~HASWIDTH; |
---|
346 | *flagp |= flags&SPSTART; |
---|
347 | } |
---|
348 | |
---|
349 | /* Make a closing node, and hook it on the end. */ |
---|
350 | ender = regnode((paren) ? CLOSE+parno : END); |
---|
351 | regtail(ret, ender); |
---|
352 | |
---|
353 | /* Hook the tails of the branches to the closing node. */ |
---|
354 | for (br = ret; br != NULL; br = regnext(br)) |
---|
355 | regoptail(br, ender); |
---|
356 | |
---|
357 | /* Check for proper termination. */ |
---|
358 | if (paren && *regparse++ != ')') { |
---|
359 | FAIL("unmatched ()"); |
---|
360 | } else if (!paren && *regparse != '\0') { |
---|
361 | if (*regparse == ')') { |
---|
362 | FAIL("unmatched ()"); |
---|
363 | } else |
---|
364 | FAIL("junk on end"); /* "Can't happen". */ |
---|
365 | /* NOTREACHED */ |
---|
366 | } |
---|
367 | |
---|
368 | return(ret); |
---|
369 | } |
---|
370 | |
---|
371 | /* |
---|
372 | - regbranch - one alternative of an | operator |
---|
373 | * |
---|
374 | * Implements the concatenation operator. |
---|
375 | */ |
---|
376 | static char * |
---|
377 | regbranch( int *flagp ) |
---|
378 | { |
---|
379 | register char *ret; |
---|
380 | register char *chain; |
---|
381 | register char *latest; |
---|
382 | int flags; |
---|
383 | |
---|
384 | *flagp = WORST; /* Tentatively. */ |
---|
385 | |
---|
386 | ret = regnode(BRANCH); |
---|
387 | chain = NULL; |
---|
388 | while (*regparse != '\0' && *regparse != ')' && |
---|
389 | *regparse != '\n' && *regparse != '|') { |
---|
390 | latest = regpiece(&flags); |
---|
391 | if (latest == NULL) |
---|
392 | return(NULL); |
---|
393 | *flagp |= flags&HASWIDTH; |
---|
394 | if (chain == NULL) /* First piece. */ |
---|
395 | *flagp |= flags&SPSTART; |
---|
396 | else |
---|
397 | regtail(chain, latest); |
---|
398 | chain = latest; |
---|
399 | } |
---|
400 | if (chain == NULL) /* Loop ran zero times. */ |
---|
401 | (void) regnode(NOTHING); |
---|
402 | |
---|
403 | return(ret); |
---|
404 | } |
---|
405 | |
---|
406 | /* |
---|
407 | - regpiece - something followed by possible [*+?] |
---|
408 | * |
---|
409 | * Note that the branching code sequences used for ? and the general cases |
---|
410 | * of * and + are somewhat optimized: they use the same NOTHING node as |
---|
411 | * both the endmarker for their branch list and the body of the last branch. |
---|
412 | * It might seem that this node could be dispensed with entirely, but the |
---|
413 | * endmarker role is not redundant. |
---|
414 | */ |
---|
415 | static char * |
---|
416 | regpiece( int *flagp ) |
---|
417 | { |
---|
418 | register char *ret; |
---|
419 | register char op; |
---|
420 | register char *next; |
---|
421 | int flags; |
---|
422 | |
---|
423 | ret = regatom(&flags); |
---|
424 | if (ret == NULL) |
---|
425 | return(NULL); |
---|
426 | |
---|
427 | op = *regparse; |
---|
428 | if (!ISMULT(op)) { |
---|
429 | *flagp = flags; |
---|
430 | return(ret); |
---|
431 | } |
---|
432 | |
---|
433 | if (!(flags&HASWIDTH) && op != '?') |
---|
434 | FAIL("*+ operand could be empty"); |
---|
435 | *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); |
---|
436 | |
---|
437 | if (op == '*' && (flags&SIMPLE)) |
---|
438 | reginsert(STAR, ret); |
---|
439 | else if (op == '*') { |
---|
440 | /* Emit x* as (x&|), where & means "self". */ |
---|
441 | reginsert(BRANCH, ret); /* Either x */ |
---|
442 | regoptail(ret, regnode(BACK)); /* and loop */ |
---|
443 | regoptail(ret, ret); /* back */ |
---|
444 | regtail(ret, regnode(BRANCH)); /* or */ |
---|
445 | regtail(ret, regnode(NOTHING)); /* null. */ |
---|
446 | } else if (op == '+' && (flags&SIMPLE)) |
---|
447 | reginsert(PLUS, ret); |
---|
448 | else if (op == '+') { |
---|
449 | /* Emit x+ as x(&|), where & means "self". */ |
---|
450 | next = regnode(BRANCH); /* Either */ |
---|
451 | regtail(ret, next); |
---|
452 | regtail(regnode(BACK), ret); /* loop back */ |
---|
453 | regtail(next, regnode(BRANCH)); /* or */ |
---|
454 | regtail(ret, regnode(NOTHING)); /* null. */ |
---|
455 | } else if (op == '?') { |
---|
456 | /* Emit x? as (x|) */ |
---|
457 | reginsert(BRANCH, ret); /* Either x */ |
---|
458 | regtail(ret, regnode(BRANCH)); /* or */ |
---|
459 | next = regnode(NOTHING); /* null. */ |
---|
460 | regtail(ret, next); |
---|
461 | regoptail(ret, next); |
---|
462 | } |
---|
463 | regparse++; |
---|
464 | if (ISMULT(*regparse)) |
---|
465 | FAIL("nested *?+"); |
---|
466 | |
---|
467 | return(ret); |
---|
468 | } |
---|
469 | |
---|
470 | /* |
---|
471 | - regatom - the lowest level |
---|
472 | * |
---|
473 | * Optimization: gobbles an entire sequence of ordinary characters so that |
---|
474 | * it can turn them into a single node, which is smaller to store and |
---|
475 | * faster to run. Backslashed characters are exceptions, each becoming a |
---|
476 | * separate node; the code is simpler that way and it's not worth fixing. |
---|
477 | */ |
---|
478 | static char * |
---|
479 | regatom( int *flagp ) |
---|
480 | { |
---|
481 | register char *ret; |
---|
482 | int flags; |
---|
483 | |
---|
484 | *flagp = WORST; /* Tentatively. */ |
---|
485 | |
---|
486 | switch (*regparse++) { |
---|
487 | /* FIXME: these chars only have meaning at beg/end of pat? */ |
---|
488 | case '^': |
---|
489 | ret = regnode(BOL); |
---|
490 | break; |
---|
491 | case '$': |
---|
492 | ret = regnode(EOL); |
---|
493 | break; |
---|
494 | case '.': |
---|
495 | ret = regnode(ANY); |
---|
496 | *flagp |= HASWIDTH|SIMPLE; |
---|
497 | break; |
---|
498 | case '[': { |
---|
499 | register int classr; |
---|
500 | register int classend; |
---|
501 | |
---|
502 | if (*regparse == '^') { /* Complement of range. */ |
---|
503 | ret = regnode(ANYBUT); |
---|
504 | regparse++; |
---|
505 | } else |
---|
506 | ret = regnode(ANYOF); |
---|
507 | if (*regparse == ']' || *regparse == '-') |
---|
508 | regc(*regparse++); |
---|
509 | while (*regparse != '\0' && *regparse != ']') { |
---|
510 | if (*regparse == '-') { |
---|
511 | regparse++; |
---|
512 | if (*regparse == ']' || *regparse == '\0') |
---|
513 | regc('-'); |
---|
514 | else { |
---|
515 | classr = UCHARAT(regparse-2)+1; |
---|
516 | classend = UCHARAT(regparse); |
---|
517 | if (classr > classend+1) |
---|
518 | FAIL("invalid [] range"); |
---|
519 | for (; classr <= classend; classr++) |
---|
520 | regc(classr); |
---|
521 | regparse++; |
---|
522 | } |
---|
523 | } else |
---|
524 | regc(*regparse++); |
---|
525 | } |
---|
526 | regc('\0'); |
---|
527 | if (*regparse != ']') |
---|
528 | FAIL("unmatched []"); |
---|
529 | regparse++; |
---|
530 | *flagp |= HASWIDTH|SIMPLE; |
---|
531 | } |
---|
532 | break; |
---|
533 | case '(': |
---|
534 | ret = reg(1, &flags); |
---|
535 | if (ret == NULL) |
---|
536 | return(NULL); |
---|
537 | *flagp |= flags&(HASWIDTH|SPSTART); |
---|
538 | break; |
---|
539 | case '\0': |
---|
540 | case '|': |
---|
541 | case '\n': |
---|
542 | case ')': |
---|
543 | FAIL("internal urp"); /* Supposed to be caught earlier. */ |
---|
544 | break; |
---|
545 | case '?': |
---|
546 | case '+': |
---|
547 | case '*': |
---|
548 | FAIL("?+* follows nothing"); |
---|
549 | break; |
---|
550 | case '\\': |
---|
551 | switch (*regparse++) { |
---|
552 | case '\0': |
---|
553 | FAIL("trailing \\"); |
---|
554 | break; |
---|
555 | case '<': |
---|
556 | ret = regnode(WORDA); |
---|
557 | break; |
---|
558 | case '>': |
---|
559 | ret = regnode(WORDZ); |
---|
560 | break; |
---|
561 | /* FIXME: Someday handle \1, \2, ... */ |
---|
562 | default: |
---|
563 | /* Handle general quoted chars in exact-match routine */ |
---|
564 | goto de_fault; |
---|
565 | } |
---|
566 | break; |
---|
567 | de_fault: |
---|
568 | default: |
---|
569 | /* |
---|
570 | * Encode a string of characters to be matched exactly. |
---|
571 | * |
---|
572 | * This is a bit tricky due to quoted chars and due to |
---|
573 | * '*', '+', and '?' taking the SINGLE char previous |
---|
574 | * as their operand. |
---|
575 | * |
---|
576 | * On entry, the char at regparse[-1] is going to go |
---|
577 | * into the string, no matter what it is. (It could be |
---|
578 | * following a \ if we are entered from the '\' case.) |
---|
579 | * |
---|
580 | * Basic idea is to pick up a good char in ch and |
---|
581 | * examine the next char. If it's *+? then we twiddle. |
---|
582 | * If it's \ then we frozzle. If it's other magic char |
---|
583 | * we push ch and terminate the string. If none of the |
---|
584 | * above, we push ch on the string and go around again. |
---|
585 | * |
---|
586 | * regprev is used to remember where "the current char" |
---|
587 | * starts in the string, if due to a *+? we need to back |
---|
588 | * up and put the current char in a separate, 1-char, string. |
---|
589 | * When regprev is NULL, ch is the only char in the |
---|
590 | * string; this is used in *+? handling, and in setting |
---|
591 | * flags |= SIMPLE at the end. |
---|
592 | */ |
---|
593 | { |
---|
594 | char *regprev; |
---|
595 | register char ch; |
---|
596 | |
---|
597 | regparse--; /* Look at cur char */ |
---|
598 | ret = regnode(EXACTLY); |
---|
599 | for ( regprev = 0 ; ; ) { |
---|
600 | ch = *regparse++; /* Get current char */ |
---|
601 | switch (*regparse) { /* look at next one */ |
---|
602 | |
---|
603 | default: |
---|
604 | regc(ch); /* Add cur to string */ |
---|
605 | break; |
---|
606 | |
---|
607 | case '.': case '[': case '(': |
---|
608 | case ')': case '|': case '\n': |
---|
609 | case '$': case '^': |
---|
610 | case '\0': |
---|
611 | /* FIXME, $ and ^ should not always be magic */ |
---|
612 | magic: |
---|
613 | regc(ch); /* dump cur char */ |
---|
614 | goto done; /* and we are done */ |
---|
615 | |
---|
616 | case '?': case '+': case '*': |
---|
617 | if (!regprev) /* If just ch in str, */ |
---|
618 | goto magic; /* use it */ |
---|
619 | /* End mult-char string one early */ |
---|
620 | regparse = regprev; /* Back up parse */ |
---|
621 | goto done; |
---|
622 | |
---|
623 | case '\\': |
---|
624 | regc(ch); /* Cur char OK */ |
---|
625 | switch (regparse[1]){ /* Look after \ */ |
---|
626 | case '\0': |
---|
627 | case '<': |
---|
628 | case '>': |
---|
629 | /* FIXME: Someday handle \1, \2, ... */ |
---|
630 | goto done; /* Not quoted */ |
---|
631 | default: |
---|
632 | /* Backup point is \, scan * point is after it. */ |
---|
633 | regprev = regparse; |
---|
634 | regparse++; |
---|
635 | continue; /* NOT break; */ |
---|
636 | } |
---|
637 | } |
---|
638 | regprev = regparse; /* Set backup point */ |
---|
639 | } |
---|
640 | done: |
---|
641 | regc('\0'); |
---|
642 | *flagp |= HASWIDTH; |
---|
643 | if (!regprev) /* One char? */ |
---|
644 | *flagp |= SIMPLE; |
---|
645 | } |
---|
646 | break; |
---|
647 | } |
---|
648 | |
---|
649 | return(ret); |
---|
650 | } |
---|
651 | |
---|
652 | /* |
---|
653 | - regnode - emit a node |
---|
654 | */ |
---|
655 | static char * /* Location. */ |
---|
656 | regnode( int op ) |
---|
657 | { |
---|
658 | register char *ret; |
---|
659 | register char *ptr; |
---|
660 | |
---|
661 | ret = regcode; |
---|
662 | if (ret == ®dummy) { |
---|
663 | regsize += 3; |
---|
664 | return(ret); |
---|
665 | } |
---|
666 | |
---|
667 | ptr = ret; |
---|
668 | *ptr++ = op; |
---|
669 | *ptr++ = '\0'; /* Null "next" pointer. */ |
---|
670 | *ptr++ = '\0'; |
---|
671 | regcode = ptr; |
---|
672 | |
---|
673 | return(ret); |
---|
674 | } |
---|
675 | |
---|
676 | /* |
---|
677 | - regc - emit (if appropriate) a byte of code |
---|
678 | */ |
---|
679 | static void |
---|
680 | regc( int b ) |
---|
681 | { |
---|
682 | if (regcode != ®dummy) |
---|
683 | *regcode++ = b; |
---|
684 | else |
---|
685 | regsize++; |
---|
686 | } |
---|
687 | |
---|
688 | /* |
---|
689 | - reginsert - insert an operator in front of already-emitted operand |
---|
690 | * |
---|
691 | * Means relocating the operand. |
---|
692 | */ |
---|
693 | static void |
---|
694 | reginsert( |
---|
695 | char op, |
---|
696 | char *opnd ) |
---|
697 | { |
---|
698 | register char *src; |
---|
699 | register char *dst; |
---|
700 | register char *place; |
---|
701 | |
---|
702 | if (regcode == ®dummy) { |
---|
703 | regsize += 3; |
---|
704 | return; |
---|
705 | } |
---|
706 | |
---|
707 | src = regcode; |
---|
708 | regcode += 3; |
---|
709 | dst = regcode; |
---|
710 | while (src > opnd) |
---|
711 | *--dst = *--src; |
---|
712 | |
---|
713 | place = opnd; /* Op node, where operand used to be. */ |
---|
714 | *place++ = op; |
---|
715 | *place++ = '\0'; |
---|
716 | *place++ = '\0'; |
---|
717 | } |
---|
718 | |
---|
719 | /* |
---|
720 | - regtail - set the next-pointer at the end of a node chain |
---|
721 | */ |
---|
722 | static void |
---|
723 | regtail( |
---|
724 | char *p, |
---|
725 | char *val ) |
---|
726 | { |
---|
727 | register char *scan; |
---|
728 | register char *temp; |
---|
729 | register int offset; |
---|
730 | |
---|
731 | if (p == ®dummy) |
---|
732 | return; |
---|
733 | |
---|
734 | /* Find last node. */ |
---|
735 | scan = p; |
---|
736 | for (;;) { |
---|
737 | temp = regnext(scan); |
---|
738 | if (temp == NULL) |
---|
739 | break; |
---|
740 | scan = temp; |
---|
741 | } |
---|
742 | |
---|
743 | if (OP(scan) == BACK) |
---|
744 | offset = scan - val; |
---|
745 | else |
---|
746 | offset = val - scan; |
---|
747 | *(scan+1) = (offset>>8)&0377; |
---|
748 | *(scan+2) = offset&0377; |
---|
749 | } |
---|
750 | |
---|
751 | /* |
---|
752 | - regoptail - regtail on operand of first argument; nop if operandless |
---|
753 | */ |
---|
754 | |
---|
755 | static void |
---|
756 | regoptail( |
---|
757 | char *p, |
---|
758 | char *val ) |
---|
759 | { |
---|
760 | /* "Operandless" and "op != BRANCH" are synonymous in practice. */ |
---|
761 | if (p == NULL || p == ®dummy || OP(p) != BRANCH) |
---|
762 | return; |
---|
763 | regtail(OPERAND(p), val); |
---|
764 | } |
---|
765 | |
---|
766 | /* |
---|
767 | * regexec and friends |
---|
768 | */ |
---|
769 | |
---|
770 | /* |
---|
771 | * Global work variables for regexec(). |
---|
772 | */ |
---|
773 | static char *reginput; /* String-input pointer. */ |
---|
774 | static char *regbol; /* Beginning of input, for ^ check. */ |
---|
775 | static char **regstartp; /* Pointer to startp array. */ |
---|
776 | static char **regendp; /* Ditto for endp. */ |
---|
777 | |
---|
778 | /* |
---|
779 | * Forwards. |
---|
780 | */ |
---|
781 | STATIC int regtry( regexp *prog, char *string ); |
---|
782 | STATIC int regmatch( char *prog ); |
---|
783 | STATIC int regrepeat( char *p ); |
---|
784 | |
---|
785 | #ifdef DEBUG |
---|
786 | int regnarrate = 0; |
---|
787 | void regdump(); |
---|
788 | STATIC char *regprop(); |
---|
789 | #endif |
---|
790 | |
---|
791 | /* |
---|
792 | - regexec - match a regexp against a string |
---|
793 | */ |
---|
794 | int |
---|
795 | regexec( |
---|
796 | register regexp *prog, |
---|
797 | register char *string ) |
---|
798 | { |
---|
799 | register char *s; |
---|
800 | |
---|
801 | /* Be paranoid... */ |
---|
802 | if (prog == NULL || string == NULL) { |
---|
803 | regerror("NULL parameter"); |
---|
804 | return(0); |
---|
805 | } |
---|
806 | |
---|
807 | /* Check validity of program. */ |
---|
808 | if (UCHARAT(prog->program) != MAGIC) { |
---|
809 | regerror("corrupted program"); |
---|
810 | return(0); |
---|
811 | } |
---|
812 | |
---|
813 | /* If there is a "must appear" string, look for it. */ |
---|
814 | if (prog->regmust != NULL) { |
---|
815 | s = (char *)string; |
---|
816 | while ((s = strchr(s, prog->regmust[0])) != NULL) { |
---|
817 | if (strncmp(s, prog->regmust, prog->regmlen) == 0) |
---|
818 | break; /* Found it. */ |
---|
819 | s++; |
---|
820 | } |
---|
821 | if (s == NULL) /* Not present. */ |
---|
822 | return(0); |
---|
823 | } |
---|
824 | |
---|
825 | /* Mark beginning of line for ^ . */ |
---|
826 | regbol = (char *)string; |
---|
827 | |
---|
828 | /* Simplest case: anchored match need be tried only once. */ |
---|
829 | if (prog->reganch) |
---|
830 | return(regtry(prog, string)); |
---|
831 | |
---|
832 | /* Messy cases: unanchored match. */ |
---|
833 | s = (char *)string; |
---|
834 | if (prog->regstart != '\0') |
---|
835 | /* We know what char it must start with. */ |
---|
836 | while ((s = strchr(s, prog->regstart)) != NULL) { |
---|
837 | if (regtry(prog, s)) |
---|
838 | return(1); |
---|
839 | s++; |
---|
840 | } |
---|
841 | else |
---|
842 | /* We don't -- general case. */ |
---|
843 | do { |
---|
844 | if (regtry(prog, s)) |
---|
845 | return(1); |
---|
846 | } while (*s++ != '\0'); |
---|
847 | |
---|
848 | /* Failure. */ |
---|
849 | return(0); |
---|
850 | } |
---|
851 | |
---|
852 | /* |
---|
853 | - regtry - try match at specific point |
---|
854 | */ |
---|
855 | static int /* 0 failure, 1 success */ |
---|
856 | regtry( |
---|
857 | regexp *prog, |
---|
858 | char *string ) |
---|
859 | { |
---|
860 | register int i; |
---|
861 | register char **sp; |
---|
862 | register char **ep; |
---|
863 | |
---|
864 | reginput = string; |
---|
865 | regstartp = prog->startp; |
---|
866 | regendp = prog->endp; |
---|
867 | |
---|
868 | sp = prog->startp; |
---|
869 | ep = prog->endp; |
---|
870 | for (i = NSUBEXP; i > 0; i--) { |
---|
871 | *sp++ = NULL; |
---|
872 | *ep++ = NULL; |
---|
873 | } |
---|
874 | if (regmatch(prog->program + 1)) { |
---|
875 | prog->startp[0] = string; |
---|
876 | prog->endp[0] = reginput; |
---|
877 | return(1); |
---|
878 | } else |
---|
879 | return(0); |
---|
880 | } |
---|
881 | |
---|
882 | /* |
---|
883 | - regmatch - main matching routine |
---|
884 | * |
---|
885 | * Conceptually the strategy is simple: check to see whether the current |
---|
886 | * node matches, call self recursively to see whether the rest matches, |
---|
887 | * and then act accordingly. In practice we make some effort to avoid |
---|
888 | * recursion, in particular by going through "ordinary" nodes (that don't |
---|
889 | * need to know whether the rest of the match failed) by a loop instead of |
---|
890 | * by recursion. |
---|
891 | */ |
---|
892 | static int /* 0 failure, 1 success */ |
---|
893 | regmatch( char *prog ) |
---|
894 | { |
---|
895 | register char *scan; /* Current node. */ |
---|
896 | char *next; /* Next node. */ |
---|
897 | |
---|
898 | scan = prog; |
---|
899 | #ifdef DEBUG |
---|
900 | if (scan != NULL && regnarrate) |
---|
901 | fprintf(stderr, "%s(\n", regprop(scan)); |
---|
902 | #endif |
---|
903 | while (scan != NULL) { |
---|
904 | #ifdef DEBUG |
---|
905 | if (regnarrate) |
---|
906 | fprintf(stderr, "%s...\n", regprop(scan)); |
---|
907 | #endif |
---|
908 | next = regnext(scan); |
---|
909 | |
---|
910 | switch (OP(scan)) { |
---|
911 | case BOL: |
---|
912 | if (reginput != regbol) |
---|
913 | return(0); |
---|
914 | break; |
---|
915 | case EOL: |
---|
916 | if (*reginput != '\0') |
---|
917 | return(0); |
---|
918 | break; |
---|
919 | case WORDA: |
---|
920 | /* Must be looking at a letter, digit, or _ */ |
---|
921 | if ((!isalnum(*reginput)) && *reginput != '_') |
---|
922 | return(0); |
---|
923 | /* Prev must be BOL or nonword */ |
---|
924 | if (reginput > regbol && |
---|
925 | (isalnum(reginput[-1]) || reginput[-1] == '_')) |
---|
926 | return(0); |
---|
927 | break; |
---|
928 | case WORDZ: |
---|
929 | /* Must be looking at non letter, digit, or _ */ |
---|
930 | if (isalnum(*reginput) || *reginput == '_') |
---|
931 | return(0); |
---|
932 | /* We don't care what the previous char was */ |
---|
933 | break; |
---|
934 | case ANY: |
---|
935 | if (*reginput == '\0') |
---|
936 | return(0); |
---|
937 | reginput++; |
---|
938 | break; |
---|
939 | case EXACTLY: { |
---|
940 | register int len; |
---|
941 | register char *opnd; |
---|
942 | |
---|
943 | opnd = OPERAND(scan); |
---|
944 | /* Inline the first character, for speed. */ |
---|
945 | if (*opnd != *reginput) |
---|
946 | return(0); |
---|
947 | len = strlen(opnd); |
---|
948 | if (len > 1 && strncmp(opnd, reginput, len) != 0) |
---|
949 | return(0); |
---|
950 | reginput += len; |
---|
951 | } |
---|
952 | break; |
---|
953 | case ANYOF: |
---|
954 | if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) |
---|
955 | return(0); |
---|
956 | reginput++; |
---|
957 | break; |
---|
958 | case ANYBUT: |
---|
959 | if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) |
---|
960 | return(0); |
---|
961 | reginput++; |
---|
962 | break; |
---|
963 | case NOTHING: |
---|
964 | break; |
---|
965 | case BACK: |
---|
966 | break; |
---|
967 | case OPEN+1: |
---|
968 | case OPEN+2: |
---|
969 | case OPEN+3: |
---|
970 | case OPEN+4: |
---|
971 | case OPEN+5: |
---|
972 | case OPEN+6: |
---|
973 | case OPEN+7: |
---|
974 | case OPEN+8: |
---|
975 | case OPEN+9: { |
---|
976 | register int no; |
---|
977 | register char *save; |
---|
978 | |
---|
979 | no = OP(scan) - OPEN; |
---|
980 | save = reginput; |
---|
981 | |
---|
982 | if (regmatch(next)) { |
---|
983 | /* |
---|
984 | * Don't set startp if some later |
---|
985 | * invocation of the same parentheses |
---|
986 | * already has. |
---|
987 | */ |
---|
988 | if (regstartp[no] == NULL) |
---|
989 | regstartp[no] = save; |
---|
990 | return(1); |
---|
991 | } else |
---|
992 | return(0); |
---|
993 | } |
---|
994 | break; |
---|
995 | case CLOSE+1: |
---|
996 | case CLOSE+2: |
---|
997 | case CLOSE+3: |
---|
998 | case CLOSE+4: |
---|
999 | case CLOSE+5: |
---|
1000 | case CLOSE+6: |
---|
1001 | case CLOSE+7: |
---|
1002 | case CLOSE+8: |
---|
1003 | case CLOSE+9: { |
---|
1004 | register int no; |
---|
1005 | register char *save; |
---|
1006 | |
---|
1007 | no = OP(scan) - CLOSE; |
---|
1008 | save = reginput; |
---|
1009 | |
---|
1010 | if (regmatch(next)) { |
---|
1011 | /* |
---|
1012 | * Don't set endp if some later |
---|
1013 | * invocation of the same parentheses |
---|
1014 | * already has. |
---|
1015 | */ |
---|
1016 | if (regendp[no] == NULL) |
---|
1017 | regendp[no] = save; |
---|
1018 | return(1); |
---|
1019 | } else |
---|
1020 | return(0); |
---|
1021 | } |
---|
1022 | break; |
---|
1023 | case BRANCH: { |
---|
1024 | register char *save; |
---|
1025 | |
---|
1026 | if (OP(next) != BRANCH) /* No choice. */ |
---|
1027 | next = OPERAND(scan); /* Avoid recursion. */ |
---|
1028 | else { |
---|
1029 | do { |
---|
1030 | save = reginput; |
---|
1031 | if (regmatch(OPERAND(scan))) |
---|
1032 | return(1); |
---|
1033 | reginput = save; |
---|
1034 | scan = regnext(scan); |
---|
1035 | } while (scan != NULL && OP(scan) == BRANCH); |
---|
1036 | return(0); |
---|
1037 | /* NOTREACHED */ |
---|
1038 | } |
---|
1039 | } |
---|
1040 | break; |
---|
1041 | case STAR: |
---|
1042 | case PLUS: { |
---|
1043 | register char nextch; |
---|
1044 | register int no; |
---|
1045 | register char *save; |
---|
1046 | register int min; |
---|
1047 | |
---|
1048 | /* |
---|
1049 | * Lookahead to avoid useless match attempts |
---|
1050 | * when we know what character comes next. |
---|
1051 | */ |
---|
1052 | nextch = '\0'; |
---|
1053 | if (OP(next) == EXACTLY) |
---|
1054 | nextch = *OPERAND(next); |
---|
1055 | min = (OP(scan) == STAR) ? 0 : 1; |
---|
1056 | save = reginput; |
---|
1057 | no = regrepeat(OPERAND(scan)); |
---|
1058 | while (no >= min) { |
---|
1059 | /* If it could work, try it. */ |
---|
1060 | if (nextch == '\0' || *reginput == nextch) |
---|
1061 | if (regmatch(next)) |
---|
1062 | return(1); |
---|
1063 | /* Couldn't or didn't -- back up. */ |
---|
1064 | no--; |
---|
1065 | reginput = save + no; |
---|
1066 | } |
---|
1067 | return(0); |
---|
1068 | } |
---|
1069 | break; |
---|
1070 | case END: |
---|
1071 | return(1); /* Success! */ |
---|
1072 | break; |
---|
1073 | default: |
---|
1074 | regerror("memory corruption"); |
---|
1075 | return(0); |
---|
1076 | break; |
---|
1077 | } |
---|
1078 | |
---|
1079 | scan = next; |
---|
1080 | } |
---|
1081 | |
---|
1082 | /* |
---|
1083 | * We get here only if there's trouble -- normally "case END" is |
---|
1084 | * the terminating point. |
---|
1085 | */ |
---|
1086 | regerror("corrupted pointers"); |
---|
1087 | return(0); |
---|
1088 | } |
---|
1089 | |
---|
1090 | /* |
---|
1091 | - regrepeat - repeatedly match something simple, report how many |
---|
1092 | */ |
---|
1093 | static int |
---|
1094 | regrepeat( char *p ) |
---|
1095 | { |
---|
1096 | register int count = 0; |
---|
1097 | register char *scan; |
---|
1098 | register char *opnd; |
---|
1099 | |
---|
1100 | scan = reginput; |
---|
1101 | opnd = OPERAND(p); |
---|
1102 | switch (OP(p)) { |
---|
1103 | case ANY: |
---|
1104 | count = strlen(scan); |
---|
1105 | scan += count; |
---|
1106 | break; |
---|
1107 | case EXACTLY: |
---|
1108 | while (*opnd == *scan) { |
---|
1109 | count++; |
---|
1110 | scan++; |
---|
1111 | } |
---|
1112 | break; |
---|
1113 | case ANYOF: |
---|
1114 | while (*scan != '\0' && strchr(opnd, *scan) != NULL) { |
---|
1115 | count++; |
---|
1116 | scan++; |
---|
1117 | } |
---|
1118 | break; |
---|
1119 | case ANYBUT: |
---|
1120 | while (*scan != '\0' && strchr(opnd, *scan) == NULL) { |
---|
1121 | count++; |
---|
1122 | scan++; |
---|
1123 | } |
---|
1124 | break; |
---|
1125 | default: /* Oh dear. Called inappropriately. */ |
---|
1126 | regerror("internal foulup"); |
---|
1127 | count = 0; /* Best compromise. */ |
---|
1128 | break; |
---|
1129 | } |
---|
1130 | reginput = scan; |
---|
1131 | |
---|
1132 | return(count); |
---|
1133 | } |
---|
1134 | |
---|
1135 | /* |
---|
1136 | - regnext - dig the "next" pointer out of a node |
---|
1137 | */ |
---|
1138 | static char * |
---|
1139 | regnext( register char *p ) |
---|
1140 | { |
---|
1141 | register int offset; |
---|
1142 | |
---|
1143 | if (p == ®dummy) |
---|
1144 | return(NULL); |
---|
1145 | |
---|
1146 | offset = NEXT(p); |
---|
1147 | if (offset == 0) |
---|
1148 | return(NULL); |
---|
1149 | |
---|
1150 | if (OP(p) == BACK) |
---|
1151 | return(p-offset); |
---|
1152 | else |
---|
1153 | return(p+offset); |
---|
1154 | } |
---|
1155 | |
---|
1156 | #ifdef DEBUG |
---|
1157 | |
---|
1158 | STATIC char *regprop(); |
---|
1159 | |
---|
1160 | /* |
---|
1161 | - regdump - dump a regexp onto stdout in vaguely comprehensible form |
---|
1162 | */ |
---|
1163 | void |
---|
1164 | regdump( regexp *r ) |
---|
1165 | { |
---|
1166 | register char *s; |
---|
1167 | register char op = EXACTLY; /* Arbitrary non-END op. */ |
---|
1168 | register char *next; |
---|
1169 | |
---|
1170 | |
---|
1171 | s = r->program + 1; |
---|
1172 | while (op != END) { /* While that wasn't END last time... */ |
---|
1173 | op = OP(s); |
---|
1174 | printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ |
---|
1175 | next = regnext(s); |
---|
1176 | if (next == NULL) /* Next ptr. */ |
---|
1177 | printf("(0)"); |
---|
1178 | else |
---|
1179 | printf("(%d)", (s-r->program)+(next-s)); |
---|
1180 | s += 3; |
---|
1181 | if (op == ANYOF || op == ANYBUT || op == EXACTLY) { |
---|
1182 | /* Literal string, where present. */ |
---|
1183 | while (*s != '\0') { |
---|
1184 | putchar(*s); |
---|
1185 | s++; |
---|
1186 | } |
---|
1187 | s++; |
---|
1188 | } |
---|
1189 | putchar('\n'); |
---|
1190 | } |
---|
1191 | |
---|
1192 | /* Header fields of interest. */ |
---|
1193 | if (r->regstart != '\0') |
---|
1194 | printf("start `%c' ", r->regstart); |
---|
1195 | if (r->reganch) |
---|
1196 | printf("anchored "); |
---|
1197 | if (r->regmust != NULL) |
---|
1198 | printf("must have \"%s\"", r->regmust); |
---|
1199 | printf("\n"); |
---|
1200 | } |
---|
1201 | |
---|
1202 | /* |
---|
1203 | - regprop - printable representation of opcode |
---|
1204 | */ |
---|
1205 | static char * |
---|
1206 | regprop( char *op ) |
---|
1207 | { |
---|
1208 | register char *p; |
---|
1209 | static char buf[50]; |
---|
1210 | |
---|
1211 | (void) strcpy(buf, ":"); |
---|
1212 | |
---|
1213 | switch (OP(op)) { |
---|
1214 | case BOL: |
---|
1215 | p = "BOL"; |
---|
1216 | break; |
---|
1217 | case EOL: |
---|
1218 | p = "EOL"; |
---|
1219 | break; |
---|
1220 | case ANY: |
---|
1221 | p = "ANY"; |
---|
1222 | break; |
---|
1223 | case ANYOF: |
---|
1224 | p = "ANYOF"; |
---|
1225 | break; |
---|
1226 | case ANYBUT: |
---|
1227 | p = "ANYBUT"; |
---|
1228 | break; |
---|
1229 | case BRANCH: |
---|
1230 | p = "BRANCH"; |
---|
1231 | break; |
---|
1232 | case EXACTLY: |
---|
1233 | p = "EXACTLY"; |
---|
1234 | break; |
---|
1235 | case NOTHING: |
---|
1236 | p = "NOTHING"; |
---|
1237 | break; |
---|
1238 | case BACK: |
---|
1239 | p = "BACK"; |
---|
1240 | break; |
---|
1241 | case END: |
---|
1242 | p = "END"; |
---|
1243 | break; |
---|
1244 | case OPEN+1: |
---|
1245 | case OPEN+2: |
---|
1246 | case OPEN+3: |
---|
1247 | case OPEN+4: |
---|
1248 | case OPEN+5: |
---|
1249 | case OPEN+6: |
---|
1250 | case OPEN+7: |
---|
1251 | case OPEN+8: |
---|
1252 | case OPEN+9: |
---|
1253 | sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); |
---|
1254 | p = NULL; |
---|
1255 | break; |
---|
1256 | case CLOSE+1: |
---|
1257 | case CLOSE+2: |
---|
1258 | case CLOSE+3: |
---|
1259 | case CLOSE+4: |
---|
1260 | case CLOSE+5: |
---|
1261 | case CLOSE+6: |
---|
1262 | case CLOSE+7: |
---|
1263 | case CLOSE+8: |
---|
1264 | case CLOSE+9: |
---|
1265 | sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); |
---|
1266 | p = NULL; |
---|
1267 | break; |
---|
1268 | case STAR: |
---|
1269 | p = "STAR"; |
---|
1270 | break; |
---|
1271 | case PLUS: |
---|
1272 | p = "PLUS"; |
---|
1273 | break; |
---|
1274 | case WORDA: |
---|
1275 | p = "WORDA"; |
---|
1276 | break; |
---|
1277 | case WORDZ: |
---|
1278 | p = "WORDZ"; |
---|
1279 | break; |
---|
1280 | default: |
---|
1281 | regerror("corrupted opcode"); |
---|
1282 | break; |
---|
1283 | } |
---|
1284 | if (p != NULL) |
---|
1285 | (void) strcat(buf, p); |
---|
1286 | return(buf); |
---|
1287 | } |
---|
1288 | #endif |
---|
1289 | |
---|
1290 | /* |
---|
1291 | * The following is provided for those people who do not have strcspn() in |
---|
1292 | * their C libraries. They should get off their butts and do something |
---|
1293 | * about it; at least one public-domain implementation of those (highly |
---|
1294 | * useful) string routines has been published on Usenet. |
---|
1295 | */ |
---|
1296 | #ifdef STRCSPN |
---|
1297 | /* |
---|
1298 | * strcspn - find length of initial segment of s1 consisting entirely |
---|
1299 | * of characters not from s2 |
---|
1300 | */ |
---|
1301 | |
---|
1302 | static int |
---|
1303 | strcspn( |
---|
1304 | char *s1, |
---|
1305 | char *s2 ) |
---|
1306 | { |
---|
1307 | register char *scan1; |
---|
1308 | register char *scan2; |
---|
1309 | register int count; |
---|
1310 | |
---|
1311 | count = 0; |
---|
1312 | for (scan1 = s1; *scan1 != '\0'; scan1++) { |
---|
1313 | for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ |
---|
1314 | if (*scan1 == *scan2++) |
---|
1315 | return(count); |
---|
1316 | count++; |
---|
1317 | } |
---|
1318 | return(count); |
---|
1319 | } |
---|
1320 | #endif |
---|