Planet
navi homePPSaboutscreenshotsdownloaddevelopmentforum

source: downloads/tcl8.5.2/generic/tclUtf.c @ 35

Last change on this file since 35 was 25, checked in by landauf, 17 years ago

added tcl to libs

File size: 45.7 KB
Line 
1/*
2 * tclUtf.c --
3 *
4 *      Routines for manipulating UTF-8 strings.
5 *
6 * Copyright (c) 1997-1998 Sun Microsystems, Inc.
7 *
8 * See the file "license.terms" for information on usage and redistribution of
9 * this file, and for a DISCLAIMER OF ALL WARRANTIES.
10 *
11 * RCS: @(#) $Id: tclUtf.c,v 1.37 2005/10/31 15:59:41 dkf Exp $
12 */
13
14#include "tclInt.h"
15
16/*
17 * Include the static character classification tables and macros.
18 */
19
20#include "tclUniData.c"
21
/*
 * Bit masks used for fast character category tests. Every x_BITS mask has one
 * bit set for each Unicode general category that belongs to the class; the
 * mask is shifted right by a character's category value to find out whether
 * that category is a member of the set.
 */

#define ALPHA_BITS ( \
        (1 << UPPERCASE_LETTER) | \
        (1 << LOWERCASE_LETTER) | \
        (1 << TITLECASE_LETTER) | \
        (1 << MODIFIER_LETTER) | \
        (1 << OTHER_LETTER))

#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)

#define SPACE_BITS ( \
        (1 << SPACE_SEPARATOR) | \
        (1 << LINE_SEPARATOR) | \
        (1 << PARAGRAPH_SEPARATOR))

#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)

/* Everything printable: letters, digits, spaces, marks, numbers,
 * punctuation and symbols. */
#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \
        (1 << NON_SPACING_MARK) | \
        (1 << ENCLOSING_MARK) | \
        (1 << COMBINING_SPACING_MARK) | \
        (1 << LETTER_NUMBER) | \
        (1 << OTHER_NUMBER) | \
        (1 << CONNECTOR_PUNCTUATION) | \
        (1 << DASH_PUNCTUATION) | \
        (1 << OPEN_PUNCTUATION) | \
        (1 << CLOSE_PUNCTUATION) | \
        (1 << INITIAL_QUOTE_PUNCTUATION) | \
        (1 << FINAL_QUOTE_PUNCTUATION) | \
        (1 << OTHER_PUNCTUATION) | \
        (1 << MATH_SYMBOL) | \
        (1 << CURRENCY_SYMBOL) | \
        (1 << MODIFIER_SYMBOL) | \
        (1 << OTHER_SYMBOL))

#define PUNCT_BITS ( \
        (1 << CONNECTOR_PUNCTUATION) | \
        (1 << DASH_PUNCTUATION) | \
        (1 << OPEN_PUNCTUATION) | \
        (1 << CLOSE_PUNCTUATION) | \
        (1 << INITIAL_QUOTE_PUNCTUATION) | \
        (1 << FINAL_QUOTE_PUNCTUATION) | \
        (1 << OTHER_PUNCTUATION))
52
/*
 * Code points below this value are stored in UTF-8 strings as a single byte
 * holding the code point itself.
 */

#define UNICODE_SELF 0x80
59
60/*
61 * The following structures are used when mapping between Unicode (UCS-2) and
62 * UTF-8.
63 */
64
65static CONST unsigned char totalBytes[256] = {
66    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
67    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
68    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
69    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
70    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
71    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
72    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
73    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
74#if TCL_UTF_MAX > 3
75    4,4,4,4,4,4,4,4,
76#else
77    1,1,1,1,1,1,1,1,
78#endif
79#if TCL_UTF_MAX > 4
80    5,5,5,5,
81#else
82    1,1,1,1,
83#endif
84#if TCL_UTF_MAX > 5
85    6,6,6,6
86#else
87    1,1,1,1
88#endif
89};
90
/*
 * Forward declarations for functions that are private to this file.
 */

static int UtfCount(int ch);
96
97/*
98 *---------------------------------------------------------------------------
99 *
100 * UtfCount --
101 *
102 *      Find the number of bytes in the Utf character "ch".
103 *
104 * Results:
105 *      The return values is the number of bytes in the Utf character "ch".
106 *
107 * Side effects:
108 *      None.
109 *
110 *---------------------------------------------------------------------------
111 */
112
113INLINE static int
114UtfCount(
115    int ch)                     /* The Tcl_UniChar whose size is returned. */
116{
117    if ((ch > 0) && (ch < UNICODE_SELF)) {
118        return 1;
119    }
120    if (ch <= 0x7FF) {
121        return 2;
122    }
123    if (ch <= 0xFFFF) {
124        return 3;
125    }
126#if TCL_UTF_MAX > 3
127    if (ch <= 0x1FFFFF) {
128        return 4;
129    }
130    if (ch <= 0x3FFFFFF) {
131        return 5;
132    }
133    if (ch <= 0x7FFFFFFF) {
134        return 6;
135    }
136#endif
137    return 3;
138}
139
140/*
141 *---------------------------------------------------------------------------
142 *
143 * Tcl_UniCharToUtf --
144 *
145 *      Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
146 *      provided buffer. Equivalent to Plan 9 runetochar().
147 *
148 * Results:
149 *      The return values is the number of bytes in the buffer that were
150 *      consumed.
151 *
152 * Side effects:
153 *      None.
154 *
155 *---------------------------------------------------------------------------
156 */
157
158INLINE int
159Tcl_UniCharToUtf(
160    int ch,                     /* The Tcl_UniChar to be stored in the
161                                 * buffer. */
162    char *buf)                  /* Buffer in which the UTF-8 representation of
163                                 * the Tcl_UniChar is stored. Buffer must be
164                                 * large enough to hold the UTF-8 character
165                                 * (at most TCL_UTF_MAX bytes). */
166{
167    if ((ch > 0) && (ch < UNICODE_SELF)) {
168        buf[0] = (char) ch;
169        return 1;
170    }
171    if (ch >= 0) {
172        if (ch <= 0x7FF) {
173            buf[1] = (char) ((ch | 0x80) & 0xBF);
174            buf[0] = (char) ((ch >> 6) | 0xC0);
175            return 2;
176        }
177        if (ch <= 0xFFFF) {
178        three:
179            buf[2] = (char) ((ch | 0x80) & 0xBF);
180            buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
181            buf[0] = (char) ((ch >> 12) | 0xE0);
182            return 3;
183        }
184
185#if TCL_UTF_MAX > 3
186        if (ch <= 0x1FFFFF) {
187            buf[3] = (char) ((ch | 0x80) & 0xBF);
188            buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
189            buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
190            buf[0] = (char) ((ch >> 18) | 0xF0);
191            return 4;
192        }
193        if (ch <= 0x3FFFFFF) {
194            buf[4] = (char) ((ch | 0x80) & 0xBF);
195            buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
196            buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
197            buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
198            buf[0] = (char) ((ch >> 24) | 0xF8);
199            return 5;
200        }
201        if (ch <= 0x7FFFFFFF) {
202            buf[5] = (char) ((ch | 0x80) & 0xBF);
203            buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
204            buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
205            buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
206            buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
207            buf[0] = (char) ((ch >> 30) | 0xFC);
208            return 6;
209        }
210#endif
211    }
212
213    ch = 0xFFFD;
214    goto three;
215}
216
217/*
218 *---------------------------------------------------------------------------
219 *
220 * Tcl_UniCharToUtfDString --
221 *
222 *      Convert the given Unicode string to UTF-8.
223 *
224 * Results:
225 *      The return value is a pointer to the UTF-8 representation of the
226 *      Unicode string. Storage for the return value is appended to the end of
227 *      dsPtr.
228 *
229 * Side effects:
230 *      None.
231 *
232 *---------------------------------------------------------------------------
233 */
234
235char *
236Tcl_UniCharToUtfDString(
237    CONST Tcl_UniChar *uniStr,  /* Unicode string to convert to UTF-8. */
238    int uniLength,              /* Length of Unicode string in Tcl_UniChars
239                                 * (must be >= 0). */
240    Tcl_DString *dsPtr)         /* UTF-8 representation of string is appended
241                                 * to this previously initialized DString. */
242{
243    CONST Tcl_UniChar *w, *wEnd;
244    char *p, *string;
245    int oldLength;
246
247    /*
248     * UTF-8 string length in bytes will be <= Unicode string length *
249     * TCL_UTF_MAX.
250     */
251
252    oldLength = Tcl_DStringLength(dsPtr);
253    Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * TCL_UTF_MAX);
254    string = Tcl_DStringValue(dsPtr) + oldLength;
255
256    p = string;
257    wEnd = uniStr + uniLength;
258    for (w = uniStr; w < wEnd; ) {
259        p += Tcl_UniCharToUtf(*w, p);
260        w++;
261    }
262    Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
263
264    return string;
265}
266
267/*
268 *---------------------------------------------------------------------------
269 *
270 * Tcl_UtfToUniChar --
271 *
272 *      Extract the Tcl_UniChar represented by the UTF-8 string. Bad UTF-8
273 *      sequences are converted to valid Tcl_UniChars and processing
274 *      continues. Equivalent to Plan 9 chartorune().
275 *
276 *      The caller must ensure that the source buffer is long enough that this
277 *      routine does not run off the end and dereference non-existent memory
278 *      looking for trail bytes. If the source buffer is known to be '\0'
279 *      terminated, this cannot happen. Otherwise, the caller should call
280 *      Tcl_UtfCharComplete() before calling this routine to ensure that
281 *      enough bytes remain in the string.
282 *
283 * Results:
284 *      *chPtr is filled with the Tcl_UniChar, and the return value is the
285 *      number of bytes from the UTF-8 string that were consumed.
286 *
287 * Side effects:
288 *      None.
289 *
290 *---------------------------------------------------------------------------
291 */
292
293int
294Tcl_UtfToUniChar(
295    register CONST char *src,   /* The UTF-8 string. */
296    register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
297                                 * the UTF-8 string. */
298{
299    register int byte;
300
301    /*
302     * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
303     */
304
305    byte = *((unsigned char *) src);
306    if (byte < 0xC0) {
307        /*
308         * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
309         * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
310         * characters representing themselves.
311         */
312
313        *chPtr = (Tcl_UniChar) byte;
314        return 1;
315    } else if (byte < 0xE0) {
316        if ((src[1] & 0xC0) == 0x80) {
317            /*
318             * Two-byte-character lead-byte followed by a trail-byte.
319             */
320
321            *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F));
322            return 2;
323        }
324
325        /*
326         * A two-byte-character lead-byte not followed by trail-byte
327         * represents itself.
328         */
329
330        *chPtr = (Tcl_UniChar) byte;
331        return 1;
332    } else if (byte < 0xF0) {
333        if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
334            /*
335             * Three-byte-character lead byte followed by two trail bytes.
336             */
337
338            *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
339                    | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
340            return 3;
341        }
342
343        /*
344         * A three-byte-character lead-byte not followed by two trail-bytes
345         * represents itself.
346         */
347
348        *chPtr = (Tcl_UniChar) byte;
349        return 1;
350    }
351#if TCL_UTF_MAX > 3
352    {
353        int ch, total, trail;
354
355        total = totalBytes[byte];
356        trail = total - 1;
357        if (trail > 0) {
358            ch = byte & (0x3F >> trail);
359            do {
360                src++;
361                if ((*src & 0xC0) != 0x80) {
362                    *chPtr = byte;
363                    return 1;
364                }
365                ch <<= 6;
366                ch |= (*src & 0x3F);
367                trail--;
368            } while (trail > 0);
369            *chPtr = ch;
370            return total;
371        }
372    }
373#endif
374
375    *chPtr = (Tcl_UniChar) byte;
376    return 1;
377}
378
379/*
380 *---------------------------------------------------------------------------
381 *
382 * Tcl_UtfToUniCharDString --
383 *
384 *      Convert the UTF-8 string to Unicode.
385 *
386 * Results:
387 *      The return value is a pointer to the Unicode representation of the
388 *      UTF-8 string. Storage for the return value is appended to the end of
389 *      dsPtr. The Unicode string is terminated with a Unicode NULL character.
390 *
391 * Side effects:
392 *      None.
393 *
394 *---------------------------------------------------------------------------
395 */
396
397Tcl_UniChar *
398Tcl_UtfToUniCharDString(
399    CONST char *src,            /* UTF-8 string to convert to Unicode. */
400    int length,                 /* Length of UTF-8 string in bytes, or -1 for
401                                 * strlen(). */
402    Tcl_DString *dsPtr)         /* Unicode representation of string is
403                                 * appended to this previously initialized
404                                 * DString. */
405{
406    Tcl_UniChar *w, *wString;
407    CONST char *p, *end;
408    int oldLength;
409
410    if (length < 0) {
411        length = strlen(src);
412    }
413
414    /*
415     * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in
416     * bytes.
417     */
418
419    oldLength = Tcl_DStringLength(dsPtr);
420    Tcl_DStringSetLength(dsPtr,
421            (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
422    wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
423
424    w = wString;
425    end = src + length;
426    for (p = src; p < end; ) {
427        p += TclUtfToUniChar(p, w);
428        w++;
429    }
430    *w = '\0';
431    Tcl_DStringSetLength(dsPtr,
432            (oldLength + ((char *) w - (char *) wString)));
433
434    return wString;
435}
436
437/*
438 *---------------------------------------------------------------------------
439 *
440 * Tcl_UtfCharComplete --
441 *
442 *      Determine if the UTF-8 string of the given length is long enough to be
443 *      decoded by Tcl_UtfToUniChar(). This does not ensure that the UTF-8
444 *      string is properly formed. Equivalent to Plan 9 fullrune().
445 *
446 * Results:
447 *      The return value is 0 if the string is not long enough, non-zero
448 *      otherwise.
449 *
450 * Side effects:
451 *      None.
452 *
453 *---------------------------------------------------------------------------
454 */
455
456int
457Tcl_UtfCharComplete(
458    CONST char *src,            /* String to check if first few bytes contain
459                                 * a complete UTF-8 character. */
460    int length)                 /* Length of above string in bytes. */
461{
462    int ch;
463
464    ch = *((unsigned char *) src);
465    return length >= totalBytes[ch];
466}
467
468/*
469 *---------------------------------------------------------------------------
470 *
471 * Tcl_NumUtfChars --
472 *
473 *      Returns the number of characters (not bytes) in the UTF-8 string, not
474 *      including the terminating NULL byte. This is equivalent to Plan 9
475 *      utflen() and utfnlen().
476 *
477 * Results:
478 *      As above.
479 *
480 * Side effects:
481 *      None.
482 *
483 *---------------------------------------------------------------------------
484 */
485
486int
487Tcl_NumUtfChars(
488    register CONST char *src,   /* The UTF-8 string to measure. */
489    int length)                 /* The length of the string in bytes, or -1
490                                 * for strlen(string). */
491{
492    Tcl_UniChar ch;
493    register Tcl_UniChar *chPtr = &ch;
494    register int i;
495
496    /*
497     * The separate implementations are faster.
498     *
499     * Since this is a time-sensitive function, we also do the check for the
500     * single-byte char case specially.
501     */
502
503    i = 0;
504    if (length < 0) {
505        while (*src != '\0') {
506            src += TclUtfToUniChar(src, chPtr);
507            i++;
508        }
509    } else {
510        register int n;
511
512        while (length > 0) {
513            if (UCHAR(*src) < 0xC0) {
514                length--;
515                src++;
516            } else {
517                n = Tcl_UtfToUniChar(src, chPtr);
518                length -= n;
519                src += n;
520            }
521            i++;
522        }
523    }
524    return i;
525}
526
527/*
528 *---------------------------------------------------------------------------
529 *
530 * Tcl_UtfFindFirst --
531 *
532 *      Returns a pointer to the first occurance of the given Tcl_UniChar in
533 *      the NULL-terminated UTF-8 string. The NULL terminator is considered
534 *      part of the UTF-8 string. Equivalent to Plan 9 utfrune().
535 *
536 * Results:
537 *      As above. If the Tcl_UniChar does not exist in the given string, the
538 *      return value is NULL.
539 *
540 * Side effects:
541 *      None.
542 *
543 *---------------------------------------------------------------------------
544 */
545
546CONST char *
547Tcl_UtfFindFirst(
548    CONST char *src,            /* The UTF-8 string to be searched. */
549    int ch)                     /* The Tcl_UniChar to search for. */
550{
551    int len;
552    Tcl_UniChar find;
553
554    while (1) {
555        len = TclUtfToUniChar(src, &find);
556        if (find == ch) {
557            return src;
558        }
559        if (*src == '\0') {
560            return NULL;
561        }
562        src += len;
563    }
564}
565
566/*
567 *---------------------------------------------------------------------------
568 *
569 * Tcl_UtfFindLast --
570 *
571 *      Returns a pointer to the last occurance of the given Tcl_UniChar in
572 *      the NULL-terminated UTF-8 string. The NULL terminator is considered
573 *      part of the UTF-8 string. Equivalent to Plan 9 utfrrune().
574 *
575 * Results:
576 *      As above. If the Tcl_UniChar does not exist in the given string, the
577 *      return value is NULL.
578 *
579 * Side effects:
580 *      None.
581 *
582 *---------------------------------------------------------------------------
583 */
584
585CONST char *
586Tcl_UtfFindLast(
587    CONST char *src,            /* The UTF-8 string to be searched. */
588    int ch)                     /* The Tcl_UniChar to search for. */
589{
590    int len;
591    Tcl_UniChar find;
592    CONST char *last;
593
594    last = NULL;
595    while (1) {
596        len = TclUtfToUniChar(src, &find);
597        if (find == ch) {
598            last = src;
599        }
600        if (*src == '\0') {
601            break;
602        }
603        src += len;
604    }
605    return last;
606}
607
608/*
609 *---------------------------------------------------------------------------
610 *
611 * Tcl_UtfNext --
612 *
613 *      Given a pointer to some current location in a UTF-8 string, move
614 *      forward one character. The caller must ensure that they are not asking
615 *      for the next character after the last character in the string.
616 *
617 * Results:
618 *      The return value is the pointer to the next character in the UTF-8
619 *      string.
620 *
621 * Side effects:
622 *      None.
623 *
624 *---------------------------------------------------------------------------
625 */
626
627CONST char *
628Tcl_UtfNext(
629    CONST char *src)            /* The current location in the string. */
630{
631    Tcl_UniChar ch;
632
633    return src + TclUtfToUniChar(src, &ch);
634}
635
636/*
637 *---------------------------------------------------------------------------
638 *
639 * Tcl_UtfPrev --
640 *
641 *      Given a pointer to some current location in a UTF-8 string, move
642 *      backwards one character. This works correctly when the pointer is in
643 *      the middle of a UTF-8 character.
644 *
645 * Results:
646 *      The return value is a pointer to the previous character in the UTF-8
647 *      string. If the current location was already at the beginning of the
648 *      string, the return value will also be a pointer to the beginning of
649 *      the string.
650 *
651 * Side effects:
652 *      None.
653 *
654 *---------------------------------------------------------------------------
655 */
656
657CONST char *
658Tcl_UtfPrev(
659    CONST char *src,            /* The current location in the string. */
660    CONST char *start)          /* Pointer to the beginning of the string, to
661                                 * avoid going backwards too far. */
662{
663    CONST char *look;
664    int i, byte;
665
666    src--;
667    look = src;
668    for (i = 0; i < TCL_UTF_MAX; i++) {
669        if (look < start) {
670            if (src < start) {
671                src = start;
672            }
673            break;
674        }
675        byte = *((unsigned char *) look);
676        if (byte < 0x80) {
677            break;
678        }
679        if (byte >= 0xC0) {
680            return look;
681        }
682        look--;
683    }
684    return src;
685}
686
687/*
688 *---------------------------------------------------------------------------
689 *
690 * Tcl_UniCharAtIndex --
691 *
692 *      Returns the Unicode character represented at the specified character
693 *      (not byte) position in the UTF-8 string.
694 *
695 * Results:
696 *      As above.
697 *
698 * Side effects:
699 *      None.
700 *
701 *---------------------------------------------------------------------------
702 */
703
704Tcl_UniChar
705Tcl_UniCharAtIndex(
706    register CONST char *src,   /* The UTF-8 string to dereference. */
707    register int index)         /* The position of the desired character. */
708{
709    Tcl_UniChar ch;
710
711    while (index >= 0) {
712        index--;
713        src += TclUtfToUniChar(src, &ch);
714    }
715    return ch;
716}
717
718/*
719 *---------------------------------------------------------------------------
720 *
721 * Tcl_UtfAtIndex --
722 *
723 *      Returns a pointer to the specified character (not byte) position in
724 *      the UTF-8 string.
725 *
726 * Results:
727 *      As above.
728 *
729 * Side effects:
730 *      None.
731 *
732 *---------------------------------------------------------------------------
733 */
734
735CONST char *
736Tcl_UtfAtIndex(
737    register CONST char *src,   /* The UTF-8 string. */
738    register int index)         /* The position of the desired character. */
739{
740    Tcl_UniChar ch;
741
742    while (index > 0) {
743        index--;
744        src += TclUtfToUniChar(src, &ch);
745    }
746    return src;
747}
748
749/*
750 *---------------------------------------------------------------------------
751 *
752 * Tcl_UtfBackslash --
753 *
754 *      Figure out how to handle a backslash sequence.
755 *
756 * Results:
757 *      Stores the bytes represented by the backslash sequence in dst and
758 *      returns the number of bytes written to dst. At most TCL_UTF_MAX bytes
759 *      are written to dst; dst must have been large enough to accept those
760 *      bytes. If readPtr isn't NULL then it is filled in with a count of the
761 *      number of bytes in the backslash sequence.
762 *
763 * Side effects:
764 *      The maximum number of bytes it takes to represent a Unicode character
765 *      in UTF-8 is guaranteed to be less than the number of bytes used to
766 *      express the backslash sequence that represents that Unicode character.
767 *      If the target buffer into which the caller is going to store the bytes
768 *      that represent the Unicode character is at least as large as the
769 *      source buffer from which the backslashed sequence was extracted, no
770 *      buffer overruns should occur.
771 *
772 *---------------------------------------------------------------------------
773 */
774
775int
776Tcl_UtfBackslash(
777    CONST char *src,            /* Points to the backslash character of a
778                                 * backslash sequence. */
779    int *readPtr,               /* Fill in with number of characters read from
780                                 * src, unless NULL. */
781    char *dst)                  /* Filled with the bytes represented by the
782                                 * backslash sequence. */
783{
784#define LINE_LENGTH 128
785    int numRead;
786    int result;
787
788    result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst);
789    if (numRead == LINE_LENGTH) {
790        /*
791         * We ate a whole line. Pay the price of a strlen()
792         */
793
794        result = TclParseBackslash(src, (int)strlen(src), &numRead, dst);
795    }
796    if (readPtr != NULL) {
797        *readPtr = numRead;
798    }
799    return result;
800}
801
802/*
803 *----------------------------------------------------------------------
804 *
805 * Tcl_UtfToUpper --
806 *
807 *      Convert lowercase characters to uppercase characters in a UTF string
808 *      in place. The conversion may shrink the UTF string.
809 *
810 * Results:
811 *      Returns the number of bytes in the resulting string excluding the
812 *      trailing null.
813 *
814 * Side effects:
815 *      Writes a terminating null after the last converted character.
816 *
817 *----------------------------------------------------------------------
818 */
819
820int
821Tcl_UtfToUpper(
822    char *str)                  /* String to convert in place. */
823{
824    Tcl_UniChar ch, upChar;
825    char *src, *dst;
826    int bytes;
827
828    /*
829     * Iterate over the string until we hit the terminating null.
830     */
831
832    src = dst = str;
833    while (*src) {
834        bytes = TclUtfToUniChar(src, &ch);
835        upChar = Tcl_UniCharToUpper(ch);
836
837        /*
838         * To keep badly formed Utf strings from getting inflated by the
839         * conversion (thereby causing a segfault), only copy the upper case
840         * char to dst if its size is <= the original char.
841         */
842
843        if (bytes < UtfCount(upChar)) {
844            memcpy(dst, src, (size_t) bytes);
845            dst += bytes;
846        } else {
847            dst += Tcl_UniCharToUtf(upChar, dst);
848        }
849        src += bytes;
850    }
851    *dst = '\0';
852    return (dst - str);
853}
854
855/*
856 *----------------------------------------------------------------------
857 *
858 * Tcl_UtfToLower --
859 *
860 *      Convert uppercase characters to lowercase characters in a UTF string
861 *      in place. The conversion may shrink the UTF string.
862 *
863 * Results:
864 *      Returns the number of bytes in the resulting string excluding the
865 *      trailing null.
866 *
867 * Side effects:
868 *      Writes a terminating null after the last converted character.
869 *
870 *----------------------------------------------------------------------
871 */
872
873int
874Tcl_UtfToLower(
875    char *str)                  /* String to convert in place. */
876{
877    Tcl_UniChar ch, lowChar;
878    char *src, *dst;
879    int bytes;
880
881    /*
882     * Iterate over the string until we hit the terminating null.
883     */
884
885    src = dst = str;
886    while (*src) {
887        bytes = TclUtfToUniChar(src, &ch);
888        lowChar = Tcl_UniCharToLower(ch);
889
890        /*
891         * To keep badly formed Utf strings from getting inflated by the
892         * conversion (thereby causing a segfault), only copy the lower case
893         * char to dst if its size is <= the original char.
894         */
895
896        if (bytes < UtfCount(lowChar)) {
897            memcpy(dst, src, (size_t) bytes);
898            dst += bytes;
899        } else {
900            dst += Tcl_UniCharToUtf(lowChar, dst);
901        }
902        src += bytes;
903    }
904    *dst = '\0';
905    return (dst - str);
906}
907
908/*
909 *----------------------------------------------------------------------
910 *
911 * Tcl_UtfToTitle --
912 *
913 *      Changes the first character of a UTF string to title case or uppercase
914 *      and the rest of the string to lowercase. The conversion happens in
915 *      place and may shrink the UTF string.
916 *
917 * Results:
918 *      Returns the number of bytes in the resulting string excluding the
919 *      trailing null.
920 *
921 * Side effects:
922 *      Writes a terminating null after the last converted character.
923 *
924 *----------------------------------------------------------------------
925 */
926
927int
928Tcl_UtfToTitle(
929    char *str)                  /* String to convert in place. */
930{
931    Tcl_UniChar ch, titleChar, lowChar;
932    char *src, *dst;
933    int bytes;
934
935    /*
936     * Capitalize the first character and then lowercase the rest of the
937     * characters until we get to a null.
938     */
939
940    src = dst = str;
941
942    if (*src) {
943        bytes = TclUtfToUniChar(src, &ch);
944        titleChar = Tcl_UniCharToTitle(ch);
945
946        if (bytes < UtfCount(titleChar)) {
947            memcpy(dst, src, (size_t) bytes);
948            dst += bytes;
949        } else {
950            dst += Tcl_UniCharToUtf(titleChar, dst);
951        }
952        src += bytes;
953    }
954    while (*src) {
955        bytes = TclUtfToUniChar(src, &ch);
956        lowChar = Tcl_UniCharToLower(ch);
957
958        if (bytes < UtfCount(lowChar)) {
959            memcpy(dst, src, (size_t) bytes);
960            dst += bytes;
961        } else {
962            dst += Tcl_UniCharToUtf(lowChar, dst);
963        }
964        src += bytes;
965    }
966    *dst = '\0';
967    return (dst - str);
968}
969
970/*
971 *----------------------------------------------------------------------
972 *
973 * TclpUtfNcmp2 --
974 *
975 *      Compare at most numBytes bytes of utf-8 strings cs and ct. Both cs and
976 *      ct are assumed to be at least numBytes bytes long.
977 *
978 * Results:
979 *      Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
980 *
981 * Side effects:
982 *      None.
983 *
984 *----------------------------------------------------------------------
985 */
986
987int
988TclpUtfNcmp2(
989    CONST char *cs,             /* UTF string to compare to ct. */
990    CONST char *ct,             /* UTF string cs is compared to. */
991    unsigned long numBytes)     /* Number of *bytes* to compare. */
992{
993    /*
994     * We can't simply call 'memcmp(cs, ct, numBytes);' because we need to
995     * check for Tcl's \xC0\x80 non-utf-8 null encoding. Otherwise utf-8 lexes
996     * fine in the strcmp manner.
997     */
998
999    register int result = 0;
1000
1001    for ( ; numBytes != 0; numBytes--, cs++, ct++) {
1002        if (*cs != *ct) {
1003            result = UCHAR(*cs) - UCHAR(*ct);
1004            break;
1005        }
1006    }
1007    if (numBytes && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) {
1008        unsigned char c1, c2;
1009
1010        c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs);
1011        c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(*ct);
1012        result = (c1 - c2);
1013    }
1014    return result;
1015}
1016
1017/*
1018 *----------------------------------------------------------------------
1019 *
1020 * Tcl_UtfNcmp --
1021 *
1022 *      Compare at most numChars UTF chars of string cs to string ct. Both cs
1023 *      and ct are assumed to be at least numChars UTF chars long.
1024 *
1025 * Results:
1026 *      Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1027 *
1028 * Side effects:
1029 *      None.
1030 *
1031 *----------------------------------------------------------------------
1032 */
1033
1034int
1035Tcl_UtfNcmp(
1036    CONST char *cs,             /* UTF string to compare to ct. */
1037    CONST char *ct,             /* UTF string cs is compared to. */
1038    unsigned long numChars)     /* Number of UTF chars to compare. */
1039{
1040    Tcl_UniChar ch1, ch2;
1041
1042    /*
1043     * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the
1044     * pair of bytes 0xc0,0x80) is larger than byte representation of \u0001
1045     * (the byte 0x01.)
1046     */
1047
1048    while (numChars-- > 0) {
1049        /*
1050         * n must be interpreted as chars, not bytes. This should be called
1051         * only when both strings are of at least n chars long (no need for \0
1052         * check)
1053         */
1054
1055        cs += TclUtfToUniChar(cs, &ch1);
1056        ct += TclUtfToUniChar(ct, &ch2);
1057        if (ch1 != ch2) {
1058            return (ch1 - ch2);
1059        }
1060    }
1061    return 0;
1062}
1063
1064/*
1065 *----------------------------------------------------------------------
1066 *
1067 * Tcl_UtfNcasecmp --
1068 *
1069 *      Compare at most numChars UTF chars of string cs to string ct case
1070 *      insensitive. Both cs and ct are assumed to be at least numChars UTF
1071 *      chars long.
1072 *
1073 * Results:
1074 *      Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
1075 *
1076 * Side effects:
1077 *      None.
1078 *
1079 *----------------------------------------------------------------------
1080 */
1081
1082int
1083Tcl_UtfNcasecmp(
1084    CONST char *cs,             /* UTF string to compare to ct. */
1085    CONST char *ct,             /* UTF string cs is compared to. */
1086    unsigned long numChars)     /* Number of UTF chars to compare. */
1087{
1088    Tcl_UniChar ch1, ch2;
1089    while (numChars-- > 0) {
1090        /*
1091         * n must be interpreted as chars, not bytes.
1092         * This should be called only when both strings are of
1093         * at least n chars long (no need for \0 check)
1094         */
1095        cs += TclUtfToUniChar(cs, &ch1);
1096        ct += TclUtfToUniChar(ct, &ch2);
1097        if (ch1 != ch2) {
1098            ch1 = Tcl_UniCharToLower(ch1);
1099            ch2 = Tcl_UniCharToLower(ch2);
1100            if (ch1 != ch2) {
1101                return (ch1 - ch2);
1102            }
1103        }
1104    }
1105    return 0;
1106}
1107
1108/*
1109 *----------------------------------------------------------------------
1110 *
1111 * Tcl_UniCharToUpper --
1112 *
1113 *      Compute the uppercase equivalent of the given Unicode character.
1114 *
1115 * Results:
1116 *      Returns the uppercase Unicode character.
1117 *
1118 * Side effects:
1119 *      None.
1120 *
1121 *----------------------------------------------------------------------
1122 */
1123
1124Tcl_UniChar
1125Tcl_UniCharToUpper(
1126    int ch)                     /* Unicode character to convert. */
1127{
1128    int info = GetUniCharInfo(ch);
1129
1130    if (GetCaseType(info) & 0x04) {
1131        return (Tcl_UniChar) (ch - GetDelta(info));
1132    } else {
1133        return ch;
1134    }
1135}
1136
1137/*
1138 *----------------------------------------------------------------------
1139 *
1140 * Tcl_UniCharToLower --
1141 *
1142 *      Compute the lowercase equivalent of the given Unicode character.
1143 *
1144 * Results:
1145 *      Returns the lowercase Unicode character.
1146 *
1147 * Side effects:
1148 *      None.
1149 *
1150 *----------------------------------------------------------------------
1151 */
1152
1153Tcl_UniChar
1154Tcl_UniCharToLower(
1155    int ch)                     /* Unicode character to convert. */
1156{
1157    int info = GetUniCharInfo(ch);
1158
1159    if (GetCaseType(info) & 0x02) {
1160        return (Tcl_UniChar) (ch + GetDelta(info));
1161    } else {
1162        return ch;
1163    }
1164}
1165
1166/*
1167 *----------------------------------------------------------------------
1168 *
1169 * Tcl_UniCharToTitle --
1170 *
1171 *      Compute the titlecase equivalent of the given Unicode character.
1172 *
1173 * Results:
1174 *      Returns the titlecase Unicode character.
1175 *
1176 * Side effects:
1177 *      None.
1178 *
1179 *----------------------------------------------------------------------
1180 */
1181
1182Tcl_UniChar
1183Tcl_UniCharToTitle(
1184    int ch)                     /* Unicode character to convert. */
1185{
1186    int info = GetUniCharInfo(ch);
1187    int mode = GetCaseType(info);
1188
1189    if (mode & 0x1) {
1190        /*
1191         * Subtract or add one depending on the original case.
1192         */
1193
1194        return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
1195    } else if (mode == 0x4) {
1196        return (Tcl_UniChar) (ch - GetDelta(info));
1197    } else {
1198        return ch;
1199    }
1200}
1201
1202/*
1203 *----------------------------------------------------------------------
1204 *
1205 * Tcl_UniCharLen --
1206 *
1207 *      Find the length of a UniChar string. The str input must be null
1208 *      terminated.
1209 *
1210 * Results:
1211 *      Returns the length of str in UniChars (not bytes).
1212 *
1213 * Side effects:
1214 *      None.
1215 *
1216 *----------------------------------------------------------------------
1217 */
1218
1219int
1220Tcl_UniCharLen(
1221    CONST Tcl_UniChar *uniStr)  /* Unicode string to find length of. */
1222{
1223    int len = 0;
1224
1225    while (*uniStr != '\0') {
1226        len++;
1227        uniStr++;
1228    }
1229    return len;
1230}
1231
1232/*
1233 *----------------------------------------------------------------------
1234 *
1235 * Tcl_UniCharNcmp --
1236 *
1237 *      Compare at most numChars unichars of string ucs to string uct.
1238 *      Both ucs and uct are assumed to be at least numChars unichars long.
1239 *
1240 * Results:
1241 *      Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct.
1242 *
1243 * Side effects:
1244 *      None.
1245 *
1246 *----------------------------------------------------------------------
1247 */
1248
1249int
1250Tcl_UniCharNcmp(
1251    CONST Tcl_UniChar *ucs,     /* Unicode string to compare to uct. */
1252    CONST Tcl_UniChar *uct,     /* Unicode string ucs is compared to. */
1253    unsigned long numChars)     /* Number of unichars to compare. */
1254{
1255#ifdef WORDS_BIGENDIAN
1256    /*
1257     * We are definitely on a big-endian machine; memcmp() is safe
1258     */
1259
1260    return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar));
1261
1262#else /* !WORDS_BIGENDIAN */
1263    /*
1264     * We can't simply call memcmp() because that is not lexically correct.
1265     */
1266
1267    for ( ; numChars != 0; ucs++, uct++, numChars--) {
1268        if (*ucs != *uct) {
1269            return (*ucs - *uct);
1270        }
1271    }
1272    return 0;
1273#endif /* WORDS_BIGENDIAN */
1274}
1275
1276/*
1277 *----------------------------------------------------------------------
1278 *
1279 * Tcl_UniCharNcasecmp --
1280 *
1281 *      Compare at most numChars unichars of string ucs to string uct case
1282 *      insensitive. Both ucs and uct are assumed to be at least numChars
1283 *      unichars long.
1284 *
1285 * Results:
1286 *      Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct.
1287 *
1288 * Side effects:
1289 *      None.
1290 *
1291 *----------------------------------------------------------------------
1292 */
1293
1294int
1295Tcl_UniCharNcasecmp(
1296    CONST Tcl_UniChar *ucs,     /* Unicode string to compare to uct. */
1297    CONST Tcl_UniChar *uct,     /* Unicode string ucs is compared to. */
1298    unsigned long numChars)     /* Number of unichars to compare. */
1299{
1300    for ( ; numChars != 0; numChars--, ucs++, uct++) {
1301        if (*ucs != *uct) {
1302            Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs);
1303            Tcl_UniChar lct = Tcl_UniCharToLower(*uct);
1304
1305            if (lcs != lct) {
1306                return (lcs - lct);
1307            }
1308        }
1309    }
1310    return 0;
1311}
1312
1313/*
1314 *----------------------------------------------------------------------
1315 *
1316 * Tcl_UniCharIsAlnum --
1317 *
1318 *      Test if a character is an alphanumeric Unicode character.
1319 *
1320 * Results:
1321 *      Returns 1 if character is alphanumeric.
1322 *
1323 * Side effects:
1324 *      None.
1325 *
1326 *----------------------------------------------------------------------
1327 */
1328
1329int
1330Tcl_UniCharIsAlnum(
1331    int ch)                     /* Unicode character to test. */
1332{
1333    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1334
1335    return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
1336}
1337
1338/*
1339 *----------------------------------------------------------------------
1340 *
1341 * Tcl_UniCharIsAlpha --
1342 *
1343 *      Test if a character is an alphabetic Unicode character.
1344 *
1345 * Results:
1346 *      Returns 1 if character is alphabetic.
1347 *
1348 * Side effects:
1349 *      None.
1350 *
1351 *----------------------------------------------------------------------
1352 */
1353
1354int
1355Tcl_UniCharIsAlpha(
1356    int ch)                     /* Unicode character to test. */
1357{
1358    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1359    return ((ALPHA_BITS >> category) & 1);
1360}
1361
1362/*
1363 *----------------------------------------------------------------------
1364 *
1365 * Tcl_UniCharIsControl --
1366 *
1367 *      Test if a character is a Unicode control character.
1368 *
1369 * Results:
1370 *      Returns non-zero if character is a control.
1371 *
1372 * Side effects:
1373 *      None.
1374 *
1375 *----------------------------------------------------------------------
1376 */
1377
1378int
1379Tcl_UniCharIsControl(
1380    int ch)                     /* Unicode character to test. */
1381{
1382    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
1383}
1384
1385/*
1386 *----------------------------------------------------------------------
1387 *
1388 * Tcl_UniCharIsDigit --
1389 *
1390 *      Test if a character is a numeric Unicode character.
1391 *
1392 * Results:
1393 *      Returns non-zero if character is a digit.
1394 *
1395 * Side effects:
1396 *      None.
1397 *
1398 *----------------------------------------------------------------------
1399 */
1400
1401int
1402Tcl_UniCharIsDigit(
1403    int ch)                     /* Unicode character to test. */
1404{
1405    return (GetUniCharInfo(ch)&UNICODE_CATEGORY_MASK) == DECIMAL_DIGIT_NUMBER;
1406}
1407
1408/*
1409 *----------------------------------------------------------------------
1410 *
1411 * Tcl_UniCharIsGraph --
1412 *
1413 *      Test if a character is any Unicode print character except space.
1414 *
1415 * Results:
1416 *      Returns non-zero if character is printable, but not space.
1417 *
1418 * Side effects:
1419 *      None.
1420 *
1421 *----------------------------------------------------------------------
1422 */
1423
1424int
1425Tcl_UniCharIsGraph(
1426    int ch)                     /* Unicode character to test. */
1427{
1428    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1429    return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));
1430}
1431
1432/*
1433 *----------------------------------------------------------------------
1434 *
1435 * Tcl_UniCharIsLower --
1436 *
1437 *      Test if a character is a lowercase Unicode character.
1438 *
1439 * Results:
1440 *      Returns non-zero if character is lowercase.
1441 *
1442 * Side effects:
1443 *      None.
1444 *
1445 *----------------------------------------------------------------------
1446 */
1447
1448int
1449Tcl_UniCharIsLower(
1450    int ch)                     /* Unicode character to test. */
1451{
1452    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
1453}
1454
1455/*
1456 *----------------------------------------------------------------------
1457 *
1458 * Tcl_UniCharIsPrint --
1459 *
1460 *      Test if a character is a Unicode print character.
1461 *
1462 * Results:
1463 *      Returns non-zero if character is printable.
1464 *
1465 * Side effects:
1466 *      None.
1467 *
1468 *----------------------------------------------------------------------
1469 */
1470
1471int
1472Tcl_UniCharIsPrint(
1473    int ch)                     /* Unicode character to test. */
1474{
1475    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1476    return ((PRINT_BITS >> category) & 1);
1477}
1478
1479/*
1480 *----------------------------------------------------------------------
1481 *
1482 * Tcl_UniCharIsPunct --
1483 *
1484 *      Test if a character is a Unicode punctuation character.
1485 *
1486 * Results:
1487 *      Returns non-zero if character is punct.
1488 *
1489 * Side effects:
1490 *      None.
1491 *
1492 *----------------------------------------------------------------------
1493 */
1494
1495int
1496Tcl_UniCharIsPunct(
1497    int ch)                     /* Unicode character to test. */
1498{
1499    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1500    return ((PUNCT_BITS >> category) & 1);
1501}
1502
1503/*
1504 *----------------------------------------------------------------------
1505 *
1506 * Tcl_UniCharIsSpace --
1507 *
1508 *      Test if a character is a whitespace Unicode character.
1509 *
1510 * Results:
1511 *      Returns non-zero if character is a space.
1512 *
1513 * Side effects:
1514 *      None.
1515 *
1516 *----------------------------------------------------------------------
1517 */
1518
1519int
1520Tcl_UniCharIsSpace(
1521    int ch)                     /* Unicode character to test. */
1522{
1523    register int category;
1524
1525    /*
1526     * If the character is within the first 127 characters, just use the
1527     * standard C function, otherwise consult the Unicode table.
1528     */
1529
1530    if (ch < 0x80) {
1531        return isspace(UCHAR(ch)); /* INTL: ISO space */
1532    } else {
1533        category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1534        return ((SPACE_BITS >> category) & 1);
1535    }
1536}
1537
1538/*
1539 *----------------------------------------------------------------------
1540 *
1541 * Tcl_UniCharIsUpper --
1542 *
1543 *      Test if a character is a uppercase Unicode character.
1544 *
1545 * Results:
1546 *      Returns non-zero if character is uppercase.
1547 *
1548 * Side effects:
1549 *      None.
1550 *
1551 *----------------------------------------------------------------------
1552 */
1553
1554int
1555Tcl_UniCharIsUpper(
1556    int ch)                     /* Unicode character to test. */
1557{
1558    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
1559}
1560
1561/*
1562 *----------------------------------------------------------------------
1563 *
1564 * Tcl_UniCharIsWordChar --
1565 *
1566 *      Test if a character is alphanumeric or a connector punctuation mark.
1567 *
1568 * Results:
1569 *      Returns 1 if character is a word character.
1570 *
1571 * Side effects:
1572 *      None.
1573 *
1574 *----------------------------------------------------------------------
1575 */
1576
1577int
1578Tcl_UniCharIsWordChar(
1579    int ch)                     /* Unicode character to test. */
1580{
1581    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
1582
1583    return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
1584}
1585
1586/*
1587 *----------------------------------------------------------------------
1588 *
1589 * Tcl_UniCharCaseMatch --
1590 *
1591 *      See if a particular Unicode string matches a particular pattern.
1592 *      Allows case insensitivity. This is the Unicode equivalent of the char*
1593 *      Tcl_StringCaseMatch. The UniChar strings must be NULL-terminated.
1594 *      This has no provision for counted UniChar strings, thus should not be
1595 *      used where NULLs are expected in the UniChar string. Use
1596 *      TclUniCharMatch where possible.
1597 *
1598 * Results:
1599 *      The return value is 1 if string matches pattern, and 0 otherwise. The
1600 *      matching operation permits the following special characters in the
1601 *      pattern: *?\[] (see the manual entry for details on what these mean).
1602 *
1603 * Side effects:
1604 *      None.
1605 *
1606 *----------------------------------------------------------------------
1607 */
1608
1609int
1610Tcl_UniCharCaseMatch(
1611    CONST Tcl_UniChar *uniStr,  /* Unicode String. */
1612    CONST Tcl_UniChar *uniPattern,
1613                                /* Pattern, which may contain special
1614                                 * characters. */
1615    int nocase)                 /* 0 for case sensitive, 1 for insensitive */
1616{
1617    Tcl_UniChar ch1, p;
1618
1619    while (1) {
1620        p = *uniPattern;
1621
1622        /*
1623         * See if we're at the end of both the pattern and the string. If so,
1624         * we succeeded. If we're at the end of the pattern but not at the end
1625         * of the string, we failed.
1626         */
1627
1628        if (p == 0) {
1629            return (*uniStr == 0);
1630        }
1631        if ((*uniStr == 0) && (p != '*')) {
1632            return 0;
1633        }
1634
1635        /*
1636         * Check for a "*" as the next pattern character. It matches any
1637         * substring. We handle this by skipping all the characters up to the
1638         * next matching one in the pattern, and then calling ourselves
1639         * recursively for each postfix of string, until either we match or we
1640         * reach the end of the string.
1641         */
1642
1643        if (p == '*') {
1644            /*
1645             * Skip all successive *'s in the pattern
1646             */
1647
1648            while (*(++uniPattern) == '*') {
1649                /* empty body */
1650            }
1651            p = *uniPattern;
1652            if (p == 0) {
1653                return 1;
1654            }
1655            if (nocase) {
1656                p = Tcl_UniCharToLower(p);
1657            }
1658            while (1) {
1659                /*
1660                 * Optimization for matching - cruise through the string
1661                 * quickly if the next char in the pattern isn't a special
1662                 * character
1663                 */
1664
1665                if ((p != '[') && (p != '?') && (p != '\\')) {
1666                    if (nocase) {
1667                        while (*uniStr && (p != *uniStr)
1668                                && (p != Tcl_UniCharToLower(*uniStr))) {
1669                            uniStr++;
1670                        }
1671                    } else {
1672                        while (*uniStr && (p != *uniStr)) {
1673                            uniStr++;
1674                        }
1675                    }
1676                }
1677                if (Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)) {
1678                    return 1;
1679                }
1680                if (*uniStr == 0) {
1681                    return 0;
1682                }
1683                uniStr++;
1684            }
1685        }
1686
1687        /*
1688         * Check for a "?" as the next pattern character. It matches any
1689         * single character.
1690         */
1691
1692        if (p == '?') {
1693            uniPattern++;
1694            uniStr++;
1695            continue;
1696        }
1697
1698        /*
1699         * Check for a "[" as the next pattern character. It is followed by a
1700         * list of characters that are acceptable, or by a range (two
1701         * characters separated by "-").
1702         */
1703
1704        if (p == '[') {
1705            Tcl_UniChar startChar, endChar;
1706
1707            uniPattern++;
1708            ch1 = (nocase ? Tcl_UniCharToLower(*uniStr) : *uniStr);
1709            uniStr++;
1710            while (1) {
1711                if ((*uniPattern == ']') || (*uniPattern == 0)) {
1712                    return 0;
1713                }
1714                startChar = (nocase ? Tcl_UniCharToLower(*uniPattern)
1715                        : *uniPattern);
1716                uniPattern++;
1717                if (*uniPattern == '-') {
1718                    uniPattern++;
1719                    if (*uniPattern == 0) {
1720                        return 0;
1721                    }
1722                    endChar = (nocase ? Tcl_UniCharToLower(*uniPattern)
1723                            : *uniPattern);
1724                    uniPattern++;
1725                    if (((startChar <= ch1) && (ch1 <= endChar))
1726                            || ((endChar <= ch1) && (ch1 <= startChar))) {
1727                        /*
1728                         * Matches ranges of form [a-z] or [z-a].
1729                         */
1730                        break;
1731                    }
1732                } else if (startChar == ch1) {
1733                    break;
1734                }
1735            }
1736            while (*uniPattern != ']') {
1737                if (*uniPattern == 0) {
1738                    uniPattern--;
1739                    break;
1740                }
1741                uniPattern++;
1742            }
1743            uniPattern++;
1744            continue;
1745        }
1746
1747        /*
1748         * If the next pattern character is '\', just strip off the '\' so we
1749         * do exact matching on the character that follows.
1750         */
1751
1752        if (p == '\\') {
1753            if (*(++uniPattern) == '\0') {
1754                return 0;
1755            }
1756        }
1757
1758        /*
1759         * There's no special character. Just make sure that the next bytes of
1760         * each string match.
1761         */
1762
1763        if (nocase) {
1764            if (Tcl_UniCharToLower(*uniStr) !=
1765                    Tcl_UniCharToLower(*uniPattern)) {
1766                return 0;
1767            }
1768        } else if (*uniStr != *uniPattern) {
1769            return 0;
1770        }
1771        uniStr++;
1772        uniPattern++;
1773    }
1774}
1775
1776/*
1777 *----------------------------------------------------------------------
1778 *
1779 * TclUniCharMatch --
1780 *
1781 *      See if a particular Unicode string matches a particular pattern.
1782 *      Allows case insensitivity. This is the Unicode equivalent of the char*
1783 *      Tcl_StringCaseMatch. This variant of Tcl_UniCharCaseMatch uses counted
1784 *      Strings, so embedded NULLs are allowed.
1785 *
1786 * Results:
1787 *      The return value is 1 if string matches pattern, and 0 otherwise. The
1788 *      matching operation permits the following special characters in the
1789 *      pattern: *?\[] (see the manual entry for details on what these mean).
1790 *
1791 * Side effects:
1792 *      None.
1793 *
1794 *----------------------------------------------------------------------
1795 */
1796
1797int
1798TclUniCharMatch(
1799    CONST Tcl_UniChar *string,  /* Unicode String. */
1800    int strLen,                 /* Length of String */
1801    CONST Tcl_UniChar *pattern, /* Pattern, which may contain special
1802                                 * characters. */
1803    int ptnLen,                 /* Length of Pattern */
1804    int nocase)                 /* 0 for case sensitive, 1 for insensitive */
1805{
1806    CONST Tcl_UniChar *stringEnd, *patternEnd;
1807    Tcl_UniChar p;
1808
1809    stringEnd = string + strLen;
1810    patternEnd = pattern + ptnLen;
1811
1812    while (1) {
1813        /*
1814         * See if we're at the end of both the pattern and the string. If so,
1815         * we succeeded. If we're at the end of the pattern but not at the end
1816         * of the string, we failed.
1817         */
1818
1819        if (pattern == patternEnd) {
1820            return (string == stringEnd);
1821        }
1822        p = *pattern;
1823        if ((string == stringEnd) && (p != '*')) {
1824            return 0;
1825        }
1826
1827        /*
1828         * Check for a "*" as the next pattern character. It matches any
1829         * substring. We handle this by skipping all the characters up to the
1830         * next matching one in the pattern, and then calling ourselves
1831         * recursively for each postfix of string, until either we match or we
1832         * reach the end of the string.
1833         */
1834
1835        if (p == '*') {
1836            /*
1837             * Skip all successive *'s in the pattern.
1838             */
1839
1840            while (*(++pattern) == '*') {
1841                /* empty body */
1842            }
1843            if (pattern == patternEnd) {
1844                return 1;
1845            }
1846            p = *pattern;
1847            if (nocase) {
1848                p = Tcl_UniCharToLower(p);
1849            }
1850            while (1) {
1851                /*
1852                 * Optimization for matching - cruise through the string
1853                 * quickly if the next char in the pattern isn't a special
1854                 * character.
1855                 */
1856
1857                if ((p != '[') && (p != '?') && (p != '\\')) {
1858                    if (nocase) {
1859                        while ((string < stringEnd) && (p != *string)
1860                                && (p != Tcl_UniCharToLower(*string))) {
1861                            string++;
1862                        }
1863                    } else {
1864                        while ((string < stringEnd) && (p != *string)) {
1865                            string++;
1866                        }
1867                    }
1868                }
1869                if (TclUniCharMatch(string, stringEnd - string,
1870                        pattern, patternEnd - pattern, nocase)) {
1871                    return 1;
1872                }
1873                if (string == stringEnd) {
1874                    return 0;
1875                }
1876                string++;
1877            }
1878        }
1879
1880        /*
1881         * Check for a "?" as the next pattern character. It matches any
1882         * single character.
1883         */
1884
1885        if (p == '?') {
1886            pattern++;
1887            string++;
1888            continue;
1889        }
1890
1891        /*
1892         * Check for a "[" as the next pattern character. It is followed by a
1893         * list of characters that are acceptable, or by a range (two
1894         * characters separated by "-").
1895         */
1896
1897        if (p == '[') {
1898            Tcl_UniChar ch1, startChar, endChar;
1899
1900            pattern++;
1901            ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);
1902            string++;
1903            while (1) {
1904                if ((*pattern == ']') || (pattern == patternEnd)) {
1905                    return 0;
1906                }
1907                startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);
1908                pattern++;
1909                if (*pattern == '-') {
1910                    pattern++;
1911                    if (pattern == patternEnd) {
1912                        return 0;
1913                    }
1914                    endChar = (nocase ? Tcl_UniCharToLower(*pattern)
1915                            : *pattern);
1916                    pattern++;
1917                    if (((startChar <= ch1) && (ch1 <= endChar))
1918                            || ((endChar <= ch1) && (ch1 <= startChar))) {
1919                        /*
1920                         * Matches ranges of form [a-z] or [z-a].
1921                         */
1922                        break;
1923                    }
1924                } else if (startChar == ch1) {
1925                    break;
1926                }
1927            }
1928            while (*pattern != ']') {
1929                if (pattern == patternEnd) {
1930                    pattern--;
1931                    break;
1932                }
1933                pattern++;
1934            }
1935            pattern++;
1936            continue;
1937        }
1938
1939        /*
1940         * If the next pattern character is '\', just strip off the '\' so we
1941         * do exact matching on the character that follows.
1942         */
1943
1944        if (p == '\\') {
1945            if (++pattern == patternEnd) {
1946                return 0;
1947            }
1948        }
1949
1950        /*
1951         * There's no special character. Just make sure that the next bytes of
1952         * each string match.
1953         */
1954
1955        if (nocase) {
1956            if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) {
1957                return 0;
1958            }
1959        } else if (*string != *pattern) {
1960            return 0;
1961        }
1962        string++;
1963        pattern++;
1964    }
1965}
1966
1967/*
1968 * Local Variables:
1969 * mode: c
1970 * c-basic-offset: 4
1971 * fill-column: 78
1972 * End:
1973 */
Note: See TracBrowser for help on using the repository browser.