Planet
navi homePPSaboutscreenshotsdownloaddevelopmentforum

source: downloads/tcl8.5.2/generic/tclRegexp.c @ 42

Last change on this file since 42 was 25, checked in by landauf, 17 years ago

added tcl to libs

File size: 30.6 KB
Line 
1/*
2 * tclRegexp.c --
3 *
4 *      This file contains the public interfaces to the Tcl regular expression
5 *      mechanism.
6 *
7 * Copyright (c) 1998 by Sun Microsystems, Inc.
8 * Copyright (c) 1998-1999 by Scriptics Corporation.
9 *
10 * See the file "license.terms" for information on usage and redistribution of
11 * this file, and for a DISCLAIMER OF ALL WARRANTIES.
12 *
13 * RCS: @(#) $Id: tclRegexp.c,v 1.28 2007/12/13 15:23:20 dgp Exp $
14 */
15
16#include "tclInt.h"
17#include "tclRegexp.h"
18
19/*
20 *----------------------------------------------------------------------
21 * The routines in this file use Henry Spencer's regular expression package
22 * contained in the following additional source files:
23 *
24 *      regc_color.c    regc_cvec.c     regc_lex.c
25 *      regc_nfa.c      regcomp.c       regcustom.h
26 *      rege_dfa.c      regerror.c      regerrs.h
27 *      regex.h         regexec.c       regfree.c
28 *      regfronts.c     regguts.h
29 *
30 * Copyright (c) 1998 Henry Spencer.  All rights reserved.
31 *
32 * Development of this software was funded, in part, by Cray Research Inc.,
33 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
34 * Corporation, none of whom are responsible for the results. The author
35 * thanks all of them.
36 *
37 * Redistribution and use in source and binary forms -- with or without
38 * modification -- are permitted for any purpose, provided that
39 * redistributions in source form retain this entire copyright notice and
40 * indicate the origin and nature of any modifications.
41 *
42 * I'd appreciate being given credit for this package in the documentation of
43 * software which uses it, but that is not a requirement.
44 *
45 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
46 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
47 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
48 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
49 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
50 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
51 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
52 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
53 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
54 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
55 *
56 * *** NOTE: this code has been altered slightly for use in Tcl: ***
57 * *** 1. Names have been changed, e.g. from re_comp to          ***
58 * ***    TclRegComp, to avoid clashes with other                ***
59 * ***    regexp implementations used by applications.           ***
60 */
61
62/*
63 * Thread local storage used to maintain a per-thread cache of compiled
64 * regular expressions.
65 */
66
67#define NUM_REGEXPS 30
68
69typedef struct ThreadSpecificData {
70    int initialized;            /* Set to 1 when the module is initialized. */
71    char *patterns[NUM_REGEXPS];/* Strings corresponding to compiled regular
72                                 * expression patterns. NULL means that this
73                                 * slot isn't used. Malloc-ed. */
74    int patLengths[NUM_REGEXPS];/* Number of non-null characters in
75                                 * corresponding entry in patterns. -1 means
76                                 * entry isn't used. */
77    struct TclRegexp *regexps[NUM_REGEXPS];
78                                /* Compiled forms of above strings. Also
79                                 * malloc-ed, or NULL if not in use yet. */
80} ThreadSpecificData;
81
82static Tcl_ThreadDataKey dataKey;
83
84/*
85 * Declarations for functions used only in this file.
86 */
87
88static TclRegexp *      CompileRegexp(Tcl_Interp *interp, const char *pattern,
89                            int length, int flags);
90static void             DupRegexpInternalRep(Tcl_Obj *srcPtr,
91                            Tcl_Obj *copyPtr);
92static void             FinalizeRegexp(ClientData clientData);
93static void             FreeRegexp(TclRegexp *regexpPtr);
94static void             FreeRegexpInternalRep(Tcl_Obj *objPtr);
95static int              RegExpExecUniChar(Tcl_Interp *interp, Tcl_RegExp re,
96                            const Tcl_UniChar *uniString, int numChars,
97                            int nmatches, int flags);
98static int              SetRegexpFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr);
99
100/*
101 * The regular expression Tcl object type. This serves as a cache of the
102 * compiled form of the regular expression.
103 */
104
105Tcl_ObjType tclRegexpType = {
106    "regexp",                           /* name */
107    FreeRegexpInternalRep,              /* freeIntRepProc */
108    DupRegexpInternalRep,               /* dupIntRepProc */
109    NULL,                               /* updateStringProc */
110    SetRegexpFromAny                    /* setFromAnyProc */
111};
112
113/*
114 *----------------------------------------------------------------------
115 *
116 * Tcl_RegExpCompile --
117 *
118 *      Compile a regular expression into a form suitable for fast matching.
119 *      This function is DEPRECATED in favor of the object version of the
120 *      command.
121 *
122 * Results:
123 *      The return value is a pointer to the compiled form of string, suitable
124 *      for passing to Tcl_RegExpExec. This compiled form is only valid up
125 *      until the next call to this function, so don't keep these around for a
126 *      long time! If an error occurred while compiling the pattern, then NULL
127 *      is returned and an error message is left in the interp's result.
128 *
129 * Side effects:
130 *      Updates the cache of compiled regexps.
131 *
132 *----------------------------------------------------------------------
133 */
134
135Tcl_RegExp
136Tcl_RegExpCompile(
137    Tcl_Interp *interp,         /* For use in error reporting and to access
138                                 * the interp regexp cache. */
139    const char *pattern)        /* String for which to produce compiled
140                                 * regular expression. */
141{
142    return (Tcl_RegExp) CompileRegexp(interp, pattern, (int) strlen(pattern),
143            REG_ADVANCED);
144}
145
146/*
147 *----------------------------------------------------------------------
148 *
149 * Tcl_RegExpExec --
150 *
151 *      Execute the regular expression matcher using a compiled form of a
152 *      regular expression and save information about any match that is found.
153 *
154 * Results:
155 *      If an error occurs during the matching operation then -1 is returned
156 *      and the interp's result contains an error message. Otherwise the
157 *      return value is 1 if a matching range is found and 0 if there is no
158 *      matching range.
159 *
160 * Side effects:
161 *      None.
162 *
163 *----------------------------------------------------------------------
164 */
165
166int
167Tcl_RegExpExec(
168    Tcl_Interp *interp,         /* Interpreter to use for error reporting. */
169    Tcl_RegExp re,              /* Compiled regular expression; must have been
170                                 * returned by previous call to
171                                 * Tcl_GetRegExpFromObj. */
172    const char *text,           /* Text against which to match re. */
173    const char *start)          /* If text is part of a larger string, this
174                                 * identifies beginning of larger string, so
175                                 * that "^" won't match. */
176{
177    int flags, result, numChars;
178    TclRegexp *regexp = (TclRegexp *)re;
179    Tcl_DString ds;
180    const Tcl_UniChar *ustr;
181
182    /*
183     * If the starting point is offset from the beginning of the buffer, then
184     * we need to tell the regexp engine not to match "^".
185     */
186
187    if (text > start) {
188        flags = REG_NOTBOL;
189    } else {
190        flags = 0;
191    }
192
193    /*
194     * Remember the string for use by Tcl_RegExpRange().
195     */
196
197    regexp->string = text;
198    regexp->objPtr = NULL;
199
200    /*
201     * Convert the string to Unicode and perform the match.
202     */
203
204    Tcl_DStringInit(&ds);
205    ustr = Tcl_UtfToUniCharDString(text, -1, &ds);
206    numChars = Tcl_DStringLength(&ds) / sizeof(Tcl_UniChar);
207    result = RegExpExecUniChar(interp, re, ustr, numChars, -1 /* nmatches */,
208            flags);
209    Tcl_DStringFree(&ds);
210
211    return result;
212}
213
214/*
215 *---------------------------------------------------------------------------
216 *
217 * Tcl_RegExpRange --
218 *
219 *      Returns pointers describing the range of a regular expression match,
220 *      or one of the subranges within the match.
221 *
222 * Results:
223 *      The variables at *startPtr and *endPtr are modified to hold the
224 *      addresses of the endpoints of the range given by index. If the
225 *      specified range doesn't exist then NULLs are returned.
226 *
227 * Side effects:
228 *      None.
229 *
230 *---------------------------------------------------------------------------
231 */
232
233void
234Tcl_RegExpRange(
235    Tcl_RegExp re,              /* Compiled regular expression that has been
236                                 * passed to Tcl_RegExpExec. */
237    int index,                  /* 0 means give the range of the entire match,
238                                 * > 0 means give the range of a matching
239                                 * subrange. */
240    const char **startPtr,      /* Store address of first character in
241                                 * (sub-)range here. */
242    const char **endPtr)        /* Store address of character just after last
243                                 * in (sub-)range here. */
244{
245    TclRegexp *regexpPtr = (TclRegexp *) re;
246    const char *string;
247
248    if ((size_t) index > regexpPtr->re.re_nsub) {
249        *startPtr = *endPtr = NULL;
250    } else if (regexpPtr->matches[index].rm_so < 0) {
251        *startPtr = *endPtr = NULL;
252    } else {
253        if (regexpPtr->objPtr) {
254            string = TclGetString(regexpPtr->objPtr);
255        } else {
256            string = regexpPtr->string;
257        }
258        *startPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_so);
259        *endPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_eo);
260    }
261}
262
263/*
264 *---------------------------------------------------------------------------
265 *
266 * RegExpExecUniChar --
267 *
268 *      Execute the regular expression matcher using a compiled form of a
269 *      regular expression and save information about any match that is found.
270 *
271 * Results:
272 *      If an error occurs during the matching operation then -1 is returned
273 *      and an error message is left in interp's result. Otherwise the return
274 *      value is 1 if a matching range was found or 0 if there was no matching
275 *      range.
276 *
277 * Side effects:
278 *      None.
279 *
280 *----------------------------------------------------------------------
281 */
282
283static int
284RegExpExecUniChar(
285    Tcl_Interp *interp,         /* Interpreter to use for error reporting. */
286    Tcl_RegExp re,              /* Compiled regular expression; returned by a
287                                 * previous call to Tcl_GetRegExpFromObj */
288    const Tcl_UniChar *wString, /* String against which to match re. */
289    int numChars,               /* Length of Tcl_UniChar string (must be
290                                 * >=0). */
291    int nmatches,               /* How many subexpression matches (counting
292                                 * the whole match as subexpression 0) are of
293                                 * interest. -1 means "don't know". */
294    int flags)                  /* Regular expression flags. */
295{
296    int status;
297    TclRegexp *regexpPtr = (TclRegexp *) re;
298    size_t last = regexpPtr->re.re_nsub + 1;
299    size_t nm = last;
300
301    if (nmatches >= 0 && (size_t) nmatches < nm) {
302        nm = (size_t) nmatches;
303    }
304
305    status = TclReExec(&regexpPtr->re, wString, (size_t) numChars,
306            &regexpPtr->details, nm, regexpPtr->matches, flags);
307
308    /*
309     * Check for errors.
310     */
311
312    if (status != REG_OKAY) {
313        if (status == REG_NOMATCH) {
314            return 0;
315        }
316        if (interp != NULL) {
317            TclRegError(interp, "error while matching regular expression: ",
318                    status);
319        }
320        return -1;
321    }
322    return 1;
323}
324
325/*
326 *---------------------------------------------------------------------------
327 *
328 * TclRegExpRangeUniChar --
329 *
330 *      Returns pointers describing the range of a regular expression match,
331 *      or one of the subranges within the match, or the hypothetical range
332 *      represented by the rm_extend field of the rm_detail_t.
333 *
334 * Results:
335 *      The variables at *startPtr and *endPtr are modified to hold the
336 *      offsets of the endpoints of the range given by index. If the specified
337 *      range doesn't exist then -1s are supplied.
338 *
339 * Side effects:
340 *      None.
341 *
342 *---------------------------------------------------------------------------
343 */
344
345void
346TclRegExpRangeUniChar(
347    Tcl_RegExp re,              /* Compiled regular expression that has been
348                                 * passed to Tcl_RegExpExec. */
349    int index,                  /* 0 means give the range of the entire match,
350                                 * > 0 means give the range of a matching
351                                 * subrange, -1 means the range of the
352                                 * rm_extend field. */
353    int *startPtr,              /* Store address of first character in
354                                 * (sub-)range here. */
355    int *endPtr)                /* Store address of character just after last
356                                 * in (sub-)range here. */
357{
358    TclRegexp *regexpPtr = (TclRegexp *) re;
359
360    if ((regexpPtr->flags&REG_EXPECT) && index == -1) {
361        *startPtr = regexpPtr->details.rm_extend.rm_so;
362        *endPtr = regexpPtr->details.rm_extend.rm_eo;
363    } else if ((size_t) index > regexpPtr->re.re_nsub) {
364        *startPtr = -1;
365        *endPtr = -1;
366    } else {
367        *startPtr = regexpPtr->matches[index].rm_so;
368        *endPtr = regexpPtr->matches[index].rm_eo;
369    }
370}
371
372/*
373 *----------------------------------------------------------------------
374 *
375 * Tcl_RegExpMatch --
376 *
377 *      See if a string matches a regular expression.
378 *
379 * Results:
380 *      If an error occurs during the matching operation then -1 is returned
381 *      and the interp's result contains an error message. Otherwise the
382 *      return value is 1 if "text" matches "pattern" and 0 otherwise.
383 *
384 * Side effects:
385 *      None.
386 *
387 *----------------------------------------------------------------------
388 */
389
390int
391Tcl_RegExpMatch(
392    Tcl_Interp *interp,         /* Used for error reporting. May be NULL. */
393    const char *text,           /* Text to search for pattern matches. */
394    const char *pattern)        /* Regular expression to match against text. */
395{
396    Tcl_RegExp re;
397
398    re = Tcl_RegExpCompile(interp, pattern);
399    if (re == NULL) {
400        return -1;
401    }
402    return Tcl_RegExpExec(interp, re, text, text);
403}
404
405/*
406 *----------------------------------------------------------------------
407 *
408 * Tcl_RegExpExecObj --
409 *
410 *      Execute a precompiled regexp against the given object.
411 *
412 * Results:
413 *      If an error occurs during the matching operation then -1 is returned
414 *      and the interp's result contains an error message. Otherwise the
415 *      return value is 1 if "string" matches "pattern" and 0 otherwise.
416 *
417 * Side effects:
418 *      Converts the object to a Unicode object.
419 *
420 *----------------------------------------------------------------------
421 */
422
423int
424Tcl_RegExpExecObj(
425    Tcl_Interp *interp,         /* Interpreter to use for error reporting. */
426    Tcl_RegExp re,              /* Compiled regular expression; must have been
427                                 * returned by previous call to
428                                 * Tcl_GetRegExpFromObj. */
429    Tcl_Obj *textObj,           /* Text against which to match re. */
430    int offset,                 /* Character index that marks where matching
431                                 * should begin. */
432    int nmatches,               /* How many subexpression matches (counting
433                                 * the whole match as subexpression 0) are of
434                                 * interest. -1 means all of them. */
435    int flags)                  /* Regular expression execution flags. */
436{
437    TclRegexp *regexpPtr = (TclRegexp *) re;
438    Tcl_UniChar *udata;
439    int length;
440    int reflags = regexpPtr->flags;
441#define TCL_REG_GLOBOK_FLAGS (TCL_REG_ADVANCED | TCL_REG_NOSUB | TCL_REG_NOCASE)
442
443    /*
444     * Take advantage of the equivalent glob pattern, if one exists.
445     * This is possible based only on the right mix of incoming flags (0)
446     * and regexp compile flags.
447     */
448    if ((offset == 0) && (nmatches == 0) && (flags == 0)
449            && !(reflags & ~TCL_REG_GLOBOK_FLAGS)
450            && (regexpPtr->globObjPtr != NULL)) {
451        int nocase = (reflags & TCL_REG_NOCASE) ? TCL_MATCH_NOCASE : 0;
452
453        /*
454         * Pass to TclStringMatchObj for obj-specific handling.
455         * XXX: Currently doesn't take advantage of exact-ness that
456         * XXX: TclReToGlob tells us about
457         */
458
459        return TclStringMatchObj(textObj, regexpPtr->globObjPtr, nocase);
460    }
461
462    /*
463     * Save the target object so we can extract strings from it later.
464     */
465
466    regexpPtr->string = NULL;
467    regexpPtr->objPtr = textObj;
468
469    udata = Tcl_GetUnicodeFromObj(textObj, &length);
470
471    if (offset > length) {
472        offset = length;
473    }
474    udata += offset;
475    length -= offset;
476
477    return RegExpExecUniChar(interp, re, udata, length, nmatches, flags);
478}
479
480/*
481 *----------------------------------------------------------------------
482 *
483 * Tcl_RegExpMatchObj --
484 *
485 *      See if an object matches a regular expression.
486 *
487 * Results:
488 *      If an error occurs during the matching operation then -1 is returned
489 *      and the interp's result contains an error message. Otherwise the
490 *      return value is 1 if "text" matches "pattern" and 0 otherwise.
491 *
492 * Side effects:
493 *      Changes the internal rep of the pattern and string objects.
494 *
495 *----------------------------------------------------------------------
496 */
497
498int
499Tcl_RegExpMatchObj(
500    Tcl_Interp *interp,         /* Used for error reporting. May be NULL. */
501    Tcl_Obj *textObj,           /* Object containing the String to search. */
502    Tcl_Obj *patternObj)        /* Regular expression to match against
503                                 * string. */
504{
505    Tcl_RegExp re;
506
507    re = Tcl_GetRegExpFromObj(interp, patternObj,
508            TCL_REG_ADVANCED | TCL_REG_NOSUB);
509    if (re == NULL) {
510        return -1;
511    }
512    return Tcl_RegExpExecObj(interp, re, textObj, 0 /* offset */,
513            0 /* nmatches */, 0 /* flags */);
514}
515
516/*
517 *----------------------------------------------------------------------
518 *
519 * Tcl_RegExpGetInfo --
520 *
521 *      Retrieve information about the current match.
522 *
523 * Results:
524 *      None.
525 *
526 * Side effects:
527 *      None.
528 *
529 *----------------------------------------------------------------------
530 */
531
532void
533Tcl_RegExpGetInfo(
534    Tcl_RegExp regexp,          /* Pattern from which to get subexpressions. */
535    Tcl_RegExpInfo *infoPtr)    /* Match information is stored here. */
536{
537    TclRegexp *regexpPtr = (TclRegexp *) regexp;
538
539    infoPtr->nsubs = regexpPtr->re.re_nsub;
540    infoPtr->matches = (Tcl_RegExpIndices *) regexpPtr->matches;
541    infoPtr->extendStart = regexpPtr->details.rm_extend.rm_so;
542}
543
544/*
545 *----------------------------------------------------------------------
546 *
547 * Tcl_GetRegExpFromObj --
548 *
549 *      Compile a regular expression into a form suitable for fast matching.
550 *      This function caches the result in a Tcl_Obj.
551 *
552 * Results:
553 *      The return value is a pointer to the compiled form of string, suitable
554 *      for passing to Tcl_RegExpExec. If an error occurred while compiling
555 *      the pattern, then NULL is returned and an error message is left in the
556 *      interp's result.
557 *
558 * Side effects:
559 *      Updates the native rep of the Tcl_Obj.
560 *
561 *----------------------------------------------------------------------
562 */
563
564Tcl_RegExp
565Tcl_GetRegExpFromObj(
566    Tcl_Interp *interp,         /* For use in error reporting, and to access
567                                 * the interp regexp cache. */
568    Tcl_Obj *objPtr,            /* Object whose string rep contains regular
569                                 * expression pattern. Internal rep will be
570                                 * changed to compiled form of this regular
571                                 * expression. */
572    int flags)                  /* Regular expression compilation flags. */
573{
574    int length;
575    TclRegexp *regexpPtr;
576    char *pattern;
577
578    /*
579     * This is OK because we only actually interpret this value properly as a
580     * TclRegexp* when the type is tclRegexpType.
581     */
582
583    regexpPtr = (TclRegexp *) objPtr->internalRep.otherValuePtr;
584
585    if ((objPtr->typePtr != &tclRegexpType) || (regexpPtr->flags != flags)) {
586        pattern = TclGetStringFromObj(objPtr, &length);
587
588        regexpPtr = CompileRegexp(interp, pattern, length, flags);
589        if (regexpPtr == NULL) {
590            return NULL;
591        }
592
593        /*
594         * Add a reference to the regexp so it will persist even if it is
595         * pushed out of the current thread's regexp cache. This reference
596         * will be removed when the object's internal rep is freed.
597         */
598
599        regexpPtr->refCount++;
600
601        /*
602         * Free the old representation and set our type.
603         */
604
605        TclFreeIntRep(objPtr);
606        objPtr->internalRep.otherValuePtr = (void *) regexpPtr;
607        objPtr->typePtr = &tclRegexpType;
608    }
609    return (Tcl_RegExp) regexpPtr;
610}
611
612/*
613 *----------------------------------------------------------------------
614 *
615 * TclRegAbout --
616 *
617 *      Return information about a compiled regular expression.
618 *
619 * Results:
620 *      The return value is -1 for failure, 0 for success, although at the
621 *      moment there's nothing that could fail. On success, a list is left in
622 *      the interp's result: first element is the subexpression count, second
623 *      is a list of re_info bit names.
624 *
625 * Side effects:
626 *      None.
627 *
628 *----------------------------------------------------------------------
629 */
630
631int
632TclRegAbout(
633    Tcl_Interp *interp,         /* For use in variable assignment. */
634    Tcl_RegExp re)              /* The compiled regular expression. */
635{
636    TclRegexp *regexpPtr = (TclRegexp *) re;
637    struct infoname {
638        int bit;
639        const char *text;
640    };
641    static const struct infoname infonames[] = {
642        {REG_UBACKREF,          "REG_UBACKREF"},
643        {REG_ULOOKAHEAD,        "REG_ULOOKAHEAD"},
644        {REG_UBOUNDS,           "REG_UBOUNDS"},
645        {REG_UBRACES,           "REG_UBRACES"},
646        {REG_UBSALNUM,          "REG_UBSALNUM"},
647        {REG_UPBOTCH,           "REG_UPBOTCH"},
648        {REG_UBBS,              "REG_UBBS"},
649        {REG_UNONPOSIX,         "REG_UNONPOSIX"},
650        {REG_UUNSPEC,           "REG_UUNSPEC"},
651        {REG_UUNPORT,           "REG_UUNPORT"},
652        {REG_ULOCALE,           "REG_ULOCALE"},
653        {REG_UEMPTYMATCH,       "REG_UEMPTYMATCH"},
654        {REG_UIMPOSSIBLE,       "REG_UIMPOSSIBLE"},
655        {REG_USHORTEST,         "REG_USHORTEST"},
656        {0,                     NULL}
657    };
658    const struct infoname *inf;
659    Tcl_Obj *infoObj;
660
661    /*
662     * The reset here guarantees that the interpreter result is empty and
663     * unshared. This means that we can use Tcl_ListObjAppendElement on the
664     * result object quite safely.
665     */
666
667    Tcl_ResetResult(interp);
668
669    /*
670     * Assume that there will never be more than INT_MAX subexpressions. This
671     * is a pretty reasonable assumption; the RE engine doesn't scale _that_
672     * well and Tcl has other limits that constrain things as well...
673     */
674
675    Tcl_ListObjAppendElement(NULL, Tcl_GetObjResult(interp),
676            Tcl_NewIntObj((int) regexpPtr->re.re_nsub));
677
678    /*
679     * Now append a list of all the bit-flags set for the RE.
680     */
681
682    TclNewObj(infoObj);
683    for (inf=infonames ; inf->bit != 0 ; inf++) {
684        if (regexpPtr->re.re_info & inf->bit) {
685            Tcl_ListObjAppendElement(NULL, infoObj,
686                    Tcl_NewStringObj(inf->text, -1));
687        }
688    }
689    Tcl_ListObjAppendElement(NULL, Tcl_GetObjResult(interp), infoObj);
690
691    return 0;
692}
693
694/*
695 *----------------------------------------------------------------------
696 *
697 * TclRegError --
698 *
699 *      Generate an error message based on the regexp status code.
700 *
701 * Results:
702 *      Places an error in the interpreter.
703 *
704 * Side effects:
705 *      Sets errorCode as well.
706 *
707 *----------------------------------------------------------------------
708 */
709
710void
711TclRegError(
712    Tcl_Interp *interp,         /* Interpreter for error reporting. */
713    const char *msg,            /* Message to prepend to error. */
714    int status)                 /* Status code to report. */
715{
716    char buf[100];              /* ample in practice */
717    char cbuf[100];             /* lots in practice */
718    size_t n;
719    const char *p;
720
721    Tcl_ResetResult(interp);
722    n = TclReError(status, NULL, buf, sizeof(buf));
723    p = (n > sizeof(buf)) ? "..." : "";
724    Tcl_AppendResult(interp, msg, buf, p, NULL);
725
726    sprintf(cbuf, "%d", status);
727    (void) TclReError(REG_ITOA, NULL, cbuf, sizeof(cbuf));
728    Tcl_SetErrorCode(interp, "REGEXP", cbuf, buf, NULL);
729}
730
731/*
732 *----------------------------------------------------------------------
733 *
734 * FreeRegexpInternalRep --
735 *
736 *      Deallocate the storage associated with a regexp object's internal
737 *      representation.
738 *
739 * Results:
740 *      None.
741 *
742 * Side effects:
743 *      Frees the compiled regular expression.
744 *
745 *----------------------------------------------------------------------
746 */
747
748static void
749FreeRegexpInternalRep(
750    Tcl_Obj *objPtr)            /* Regexp object with internal rep to free. */
751{
752    TclRegexp *regexpRepPtr = (TclRegexp *) objPtr->internalRep.otherValuePtr;
753
754    /*
755     * If this is the last reference to the regexp, free it.
756     */
757
758    if (--(regexpRepPtr->refCount) <= 0) {
759        FreeRegexp(regexpRepPtr);
760    }
761}
762
763/*
764 *----------------------------------------------------------------------
765 *
766 * DupRegexpInternalRep --
767 *
768 *      We copy the reference to the compiled regexp and bump its reference
769 *      count.
770 *
771 * Results:
772 *      None.
773 *
774 * Side effects:
775 *      Increments the reference count of the regexp.
776 *
777 *----------------------------------------------------------------------
778 */
779
780static void
781DupRegexpInternalRep(
782    Tcl_Obj *srcPtr,            /* Object with internal rep to copy. */
783    Tcl_Obj *copyPtr)           /* Object with internal rep to set. */
784{
785    TclRegexp *regexpPtr = (TclRegexp *) srcPtr->internalRep.otherValuePtr;
786
787    regexpPtr->refCount++;
788    copyPtr->internalRep.otherValuePtr = srcPtr->internalRep.otherValuePtr;
789    copyPtr->typePtr = &tclRegexpType;
790}
791
792/*
793 *----------------------------------------------------------------------
794 *
795 * SetRegexpFromAny --
796 *
797 *      Attempt to generate a compiled regular expression for the Tcl object
798 *      "objPtr".
799 *
800 * Results:
801 *      The return value is TCL_OK or TCL_ERROR. If an error occurs during
802 *      conversion, an error message is left in the interpreter's result
803 *      unless "interp" is NULL.
804 *
805 * Side effects:
806 *      If no error occurs, a regular expression is stored as "objPtr"s
807 *      internal representation.
808 *
809 *----------------------------------------------------------------------
810 */
811
812static int
813SetRegexpFromAny(
814    Tcl_Interp *interp,         /* Used for error reporting if not NULL. */
815    Tcl_Obj *objPtr)            /* The object to convert. */
816{
817    if (Tcl_GetRegExpFromObj(interp, objPtr, REG_ADVANCED) == NULL) {
818        return TCL_ERROR;
819    }
820    return TCL_OK;
821}
822
823/*
824 *---------------------------------------------------------------------------
825 *
826 * CompileRegexp --
827 *
828 *      Attempt to compile the given regexp pattern. If the compiled regular
829 *      expression can be found in the per-thread cache, it will be used
830 *      instead of compiling a new copy.
831 *
832 * Results:
833 *      The return value is a pointer to a newly allocated TclRegexp that
834 *      represents the compiled pattern, or NULL if the pattern could not be
835 *      compiled. If NULL is returned, an error message is left in the
836 *      interp's result.
837 *
838 * Side effects:
839 *      The thread-local regexp cache is updated and a new TclRegexp may be
840 *      allocated.
841 *
842 *----------------------------------------------------------------------
843 */
844
845static TclRegexp *
846CompileRegexp(
847    Tcl_Interp *interp,         /* Used for error reporting if not NULL. */
848    const char *string,         /* The regexp to compile (UTF-8). */
849    int length,                 /* The length of the string in bytes. */
850    int flags)                  /* Compilation flags. */
851{
852    TclRegexp *regexpPtr;
853    const Tcl_UniChar *uniString;
854    int numChars, status, i, exact;
855    Tcl_DString stringBuf;
856    ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey);
857
858    if (!tsdPtr->initialized) {
859        tsdPtr->initialized = 1;
860        Tcl_CreateThreadExitHandler(FinalizeRegexp, NULL);
861    }
862
863    /*
864     * This routine maintains a second-level regular expression cache in
865     * addition to the per-object regexp cache. The per-thread cache is needed
866     * to handle the case where for various reasons the object is lost between
867     * invocations of the regexp command, but the literal pattern is the same.
868     */
869
870    /*
871     * Check the per-thread compiled regexp cache. We can only reuse a regexp
872     * if it has the same pattern and the same flags.
873     */
874
875    for (i = 0; (i < NUM_REGEXPS) && (tsdPtr->patterns[i] != NULL); i++) {
876        if ((length == tsdPtr->patLengths[i])
877                && (tsdPtr->regexps[i]->flags == flags)
878                && (strcmp(string, tsdPtr->patterns[i]) == 0)) {
879            /*
880             * Move the matched pattern to the first slot in the cache and
881             * shift the other patterns down one position.
882             */
883
884            if (i != 0) {
885                int j;
886                char *cachedString;
887
888                cachedString = tsdPtr->patterns[i];
889                regexpPtr = tsdPtr->regexps[i];
890                for (j = i-1; j >= 0; j--) {
891                    tsdPtr->patterns[j+1] = tsdPtr->patterns[j];
892                    tsdPtr->patLengths[j+1] = tsdPtr->patLengths[j];
893                    tsdPtr->regexps[j+1] = tsdPtr->regexps[j];
894                }
895                tsdPtr->patterns[0] = cachedString;
896                tsdPtr->patLengths[0] = length;
897                tsdPtr->regexps[0] = regexpPtr;
898            }
899            return tsdPtr->regexps[0];
900        }
901    }
902
903    /*
904     * This is a new expression, so compile it and add it to the cache.
905     */
906
907    regexpPtr = (TclRegexp *) ckalloc(sizeof(TclRegexp));
908    regexpPtr->objPtr = NULL;
909    regexpPtr->string = NULL;
910    regexpPtr->details.rm_extend.rm_so = -1;
911    regexpPtr->details.rm_extend.rm_eo = -1;
912
913    /*
914     * Get the up-to-date string representation and map to unicode.
915     */
916
917    Tcl_DStringInit(&stringBuf);
918    uniString = Tcl_UtfToUniCharDString(string, length, &stringBuf);
919    numChars = Tcl_DStringLength(&stringBuf) / sizeof(Tcl_UniChar);
920
921    /*
922     * Compile the string and check for errors.
923     */
924
925    regexpPtr->flags = flags;
926    status = TclReComp(&regexpPtr->re, uniString, (size_t) numChars, flags);
927    Tcl_DStringFree(&stringBuf);
928
929    if (status != REG_OKAY) {
930        /*
931         * Clean up and report errors in the interpreter, if possible.
932         */
933
934        ckfree((char *)regexpPtr);
935        if (interp) {
936            TclRegError(interp,
937                    "couldn't compile regular expression pattern: ", status);
938        }
939        return NULL;
940    }
941
942    /*
943     * Convert RE to a glob pattern equivalent, if any, and cache it.  If this
944     * is not possible, then globObjPtr will be NULL.  This is used by
945     * Tcl_RegExpExecObj to optionally do a fast match (avoids RE engine).
946     */
947
948    if (TclReToGlob(NULL, string, length, &stringBuf, &exact) == TCL_OK) {
949        regexpPtr->globObjPtr = Tcl_NewStringObj(Tcl_DStringValue(&stringBuf),
950                Tcl_DStringLength(&stringBuf));
951        Tcl_IncrRefCount(regexpPtr->globObjPtr);
952        Tcl_DStringFree(&stringBuf);
953    } else {
954        regexpPtr->globObjPtr = NULL;
955    }
956
957    /*
958     * Allocate enough space for all of the subexpressions, plus one extra for
959     * the entire pattern.
960     */
961
962    regexpPtr->matches = (regmatch_t *) ckalloc(
963            sizeof(regmatch_t) * (regexpPtr->re.re_nsub + 1));
964
965    /*
966     * Initialize the refcount to one initially, since it is in the cache.
967     */
968
969    regexpPtr->refCount = 1;
970
971    /*
972     * Free the last regexp, if necessary, and make room at the head of the
973     * list for the new regexp.
974     */
975
976    if (tsdPtr->patterns[NUM_REGEXPS-1] != NULL) {
977        TclRegexp *oldRegexpPtr = tsdPtr->regexps[NUM_REGEXPS-1];
978        if (--(oldRegexpPtr->refCount) <= 0) {
979            FreeRegexp(oldRegexpPtr);
980        }
981        ckfree(tsdPtr->patterns[NUM_REGEXPS-1]);
982    }
983    for (i = NUM_REGEXPS - 2; i >= 0; i--) {
984        tsdPtr->patterns[i+1] = tsdPtr->patterns[i];
985        tsdPtr->patLengths[i+1] = tsdPtr->patLengths[i];
986        tsdPtr->regexps[i+1] = tsdPtr->regexps[i];
987    }
988    tsdPtr->patterns[0] = (char *) ckalloc((unsigned) (length+1));
989    strcpy(tsdPtr->patterns[0], string);
990    tsdPtr->patLengths[0] = length;
991    tsdPtr->regexps[0] = regexpPtr;
992
993    return regexpPtr;
994}
995
996/*
997 *----------------------------------------------------------------------
998 *
999 * FreeRegexp --
1000 *
1001 *      Release the storage associated with a TclRegexp.
1002 *
1003 * Results:
1004 *      None.
1005 *
1006 * Side effects:
1007 *      None.
1008 *
1009 *----------------------------------------------------------------------
1010 */
1011
1012static void
1013FreeRegexp(
1014    TclRegexp *regexpPtr)       /* Compiled regular expression to free. */
1015{
1016    TclReFree(&regexpPtr->re);
1017    if (regexpPtr->globObjPtr) {
1018        TclDecrRefCount(regexpPtr->globObjPtr);
1019    }
1020    if (regexpPtr->matches) {
1021        ckfree((char *) regexpPtr->matches);
1022    }
1023    ckfree((char *) regexpPtr);
1024}
1025
1026/*
1027 *----------------------------------------------------------------------
1028 *
1029 * FinalizeRegexp --
1030 *
1031 *      Release the storage associated with the per-thread regexp cache.
1032 *
1033 * Results:
1034 *      None.
1035 *
1036 * Side effects:
1037 *      None.
1038 *
1039 *----------------------------------------------------------------------
1040 */
1041
1042static void
1043FinalizeRegexp(
1044    ClientData clientData)      /* Not used. */
1045{
1046    int i;
1047    TclRegexp *regexpPtr;
1048    ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey);
1049
1050    for (i = 0; (i < NUM_REGEXPS) && (tsdPtr->patterns[i] != NULL); i++) {
1051        regexpPtr = tsdPtr->regexps[i];
1052        if (--(regexpPtr->refCount) <= 0) {
1053            FreeRegexp(regexpPtr);
1054        }
1055        ckfree(tsdPtr->patterns[i]);
1056        tsdPtr->patterns[i] = NULL;
1057    }
1058    /*
1059     * We may find ourselves reinitialized if another finalization routine
1060     * invokes regexps.
1061     */
1062    tsdPtr->initialized = 0;
1063}
1064
1065/*
1066 * Local Variables:
1067 * mode: c
1068 * c-basic-offset: 4
1069 * fill-column: 78
1070 * End:
1071 */
Note: See TracBrowser for help on using the repository browser.