root/cherokee/trunk/cherokee/pcre/pcre_exec.c

Revision 905, 145.6 kB (checked in by alo, 1 year ago)

--

Line 
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8                        Written by Philip Hazel
9            Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15     * Redistributions of source code must retain the above copyright notice,
16       this list of conditions and the following disclaimer.
17
18     * Redistributions in binary form must reproduce the above copyright
19       notice, this list of conditions and the following disclaimer in the
20       documentation and/or other materials provided with the distribution.
21
22     * Neither the name of the University of Cambridge nor the names of its
23       contributors may be used to endorse or promote products derived from
24       this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #include "local_config.h"
46
47 #define NLBLOCK md             /* Block containing newline information */
48 #define PSSTART start_subject  /* Field containing processed string start */
49 #define PSEND   end_subject    /* Field containing processed string end */
50
51 #include "pcre_internal.h"
52
53 /* Undefine some potentially clashing cpp symbols */
54
55 #undef min
56 #undef max
57
58 /* Flag bits for the match() function */
59
60 #define match_condassert     0x01  /* Called to check a condition assertion */
61 #define match_cbegroup       0x02  /* Could-be-empty unlimited repeat group */
62
63 /* Non-error returns from the match() function. Error returns are externally
64 defined PCRE_ERROR_xxx codes, which are all negative. */
65
66 #define MATCH_MATCH        1
67 #define MATCH_NOMATCH      0
68
69 /* Special internal returns from the match() function. Make them sufficiently
70 negative to avoid the external error codes. */
71
72 #define MATCH_COMMIT       (-999)
73 #define MATCH_PRUNE        (-998)
74 #define MATCH_SKIP         (-997)
75 #define MATCH_THEN         (-996)
76
77 /* Maximum number of ints of offset to save on the stack for recursive calls.
78 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
79 because the offset vector is always a multiple of 3 long. */
80
81 #define REC_STACK_SAVE_MAX 30
82
83 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
84
85 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
86 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
87
88
89
90 #ifdef DEBUG
91 /*************************************************
92 *        Debugging function to print chars       *
93 *************************************************/
94
95 /* Print a sequence of chars in printable format, stopping at the end of the
96 subject if the requested.
97
98 Arguments:
99   p           points to characters
100   length      number to print
101   is_subject  TRUE if printing from within md->start_subject
102   md          pointer to matching data block, if is_subject is TRUE
103
104 Returns:     nothing
105 */
106
107 static void
108 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
109 {
110 unsigned int c;
111 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
112 while (length-- > 0)
113   if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
114 }
115 #endif
116
117
118
119 /*************************************************
120 *          Match a back-reference                *
121 *************************************************/
122
123 /* If a back reference hasn't been set, the length that is passed is greater
124 than the number of characters left in the string, so the match fails.
125
126 Arguments:
127   offset      index into the offset vector
128   eptr        points into the subject
129   length      length to be matched
130   md          points to match data block
131   ims         the ims flags
132
133 Returns:      TRUE if matched
134 */
135
136 static BOOL
137 match_ref(int offset, register USPTR eptr, int length, match_data *md,
138   unsigned long int ims)
139 {
140 USPTR p = md->start_subject + md->offset_vector[offset];
141
142 #ifdef DEBUG
143 if (eptr >= md->end_subject)
144   printf("matching subject <null>");
145 else
146   {
147   printf("matching subject ");
148   pchars(eptr, length, TRUE, md);
149   }
150 printf(" against backref ");
151 pchars(p, length, FALSE, md);
152 printf("\n");
153 #endif
154
155 /* Always fail if not enough characters left */
156
157 if (length > md->end_subject - eptr) return FALSE;
158
159 /* Separate the caselesss case for speed */
160
161 if ((ims & PCRE_CASELESS) != 0)
162   {
163   while (length-- > 0)
164     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
165   }
166 else
167   { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
168
169 return TRUE;
170 }
171
172
173
174 /***************************************************************************
175 ****************************************************************************
176                    RECURSION IN THE match() FUNCTION
177
178 The match() function is highly recursive, though not every recursive call
179 increases the recursive depth. Nevertheless, some regular expressions can cause
180 it to recurse to a great depth. I was writing for Unix, so I just let it call
181 itself recursively. This uses the stack for saving everything that has to be
182 saved for a recursive call. On Unix, the stack can be large, and this works
183 fine.
184
185 It turns out that on some non-Unix-like systems there are problems with
186 programs that use a lot of stack. (This despite the fact that every last chip
187 has oodles of memory these days, and techniques for extending the stack have
188 been known for decades.) So....
189
190 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
191 calls by keeping local variables that need to be preserved in blocks of memory
192 obtained from malloc() instead instead of on the stack. Macros are used to
193 achieve this so that the actual code doesn't look very different to what it
194 always used to.
195
196 The original heap-recursive code used longjmp(). However, it seems that this
197 can be very slow on some operating systems. Following a suggestion from Stan
198 Switzer, the use of longjmp() has been abolished, at the cost of having to
199 provide a unique number for each call to RMATCH. There is no way of generating
200 a sequence of numbers at compile time in C. I have given them names, to make
201 them stand out more clearly.
202
203 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
204 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
205 tests. Furthermore, not using longjmp() means that local dynamic variables
206 don't have indeterminate values; this has meant that the frame size can be
207 reduced because the result can be "passed back" by straight setting of the
208 variable instead of being passed in the frame.
209 ****************************************************************************
210 ***************************************************************************/
211
212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
213 below must be updated in sync.  */
214
215 enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
216        RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
217        RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
218        RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
219        RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
220        RM51,  RM52, RM53, RM54 };
221
222 /* These versions of the macros use the stack, as normal. There are debugging
223 versions and production versions. Note that the "rw" argument of RMATCH isn't
224 actuall used in this definition. */
225
226 #ifndef NO_RECURSE
227 #define REGISTER register
228
229 #ifdef DEBUG
230 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
231   { \
232   printf("match() called in line %d\n", __LINE__); \
233   rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
234   printf("to line %d\n", __LINE__); \
235   }
236 #define RRETURN(ra) \
237   { \
238   printf("match() returned %d from line %d ", ra, __LINE__); \
239   return ra; \
240   }
241 #else
242 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
243   rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
244 #define RRETURN(ra) return ra
245 #endif
246
247 #else
248
249
250 /* These versions of the macros manage a private stack on the heap. Note that
251 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
252 argument of match(), which never changes. */
253
254 #define REGISTER
255
256 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
257   {\
258   heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
259   frame->Xwhere = rw; \
260   newframe->Xeptr = ra;\
261   newframe->Xecode = rb;\
262   newframe->Xmstart = mstart;\
263   newframe->Xoffset_top = rc;\
264   newframe->Xims = re;\
265   newframe->Xeptrb = rf;\
266   newframe->Xflags = rg;\
267   newframe->Xrdepth = frame->Xrdepth + 1;\
268   newframe->Xprevframe = frame;\
269   frame = newframe;\
270   DPRINTF(("restarting from line %d\n", __LINE__));\
271   goto HEAP_RECURSE;\
272   L_##rw:\
273   DPRINTF(("jumped back to line %d\n", __LINE__));\
274   }
275
276 #define RRETURN(ra)\
277   {\
278   heapframe *newframe = frame;\
279   frame = newframe->Xprevframe;\
280   (pcre_stack_free)(newframe);\
281   if (frame != NULL)\
282     {\
283     rrc = ra;\
284     goto HEAP_RETURN;\
285     }\
286   return ra;\
287   }
288
289
290 /* Structure for remembering the local variables in a private frame */
291
292 typedef struct heapframe {
293   struct heapframe *Xprevframe;
294
295   /* Function arguments that may change */
296
297   const uschar *Xeptr;
298   const uschar *Xecode;
299   const uschar *Xmstart;
300   int Xoffset_top;
301   long int Xims;
302   eptrblock *Xeptrb;
303   int Xflags;
304   unsigned int Xrdepth;
305
306   /* Function local variables */
307
308   const uschar *Xcallpat;
309   const uschar *Xcharptr;
310   const uschar *Xdata;
311   const uschar *Xnext;
312   const uschar *Xpp;
313   const uschar *Xprev;
314   const uschar *Xsaved_eptr;
315
316   recursion_info Xnew_recursive;
317
318   BOOL Xcur_is_word;
319   BOOL Xcondition;
320   BOOL Xprev_is_word;
321
322   unsigned long int Xoriginal_ims;
323
324 #ifdef SUPPORT_UCP
325   int Xprop_type;
326   int Xprop_value;
327   int Xprop_fail_result;
328   int Xprop_category;
329   int Xprop_chartype;
330   int Xprop_script;
331   int Xoclength;
332   uschar Xocchars[8];
333 #endif
334
335   int Xctype;
336   unsigned int Xfc;
337   int Xfi;
338   int Xlength;
339   int Xmax;
340   int Xmin;
341   int Xnumber;
342   int Xoffset;
343   int Xop;
344   int Xsave_capture_last;
345   int Xsave_offset1, Xsave_offset2, Xsave_offset3;
346   int Xstacksave[REC_STACK_SAVE_MAX];
347
348   eptrblock Xnewptrb;
349
350   /* Where to jump back to */
351
352   int Xwhere;
353
354 } heapframe;
355
356 #endif
357
358
359 /***************************************************************************
360 ***************************************************************************/
361
362
363
364 /*************************************************
365 *         Match from current position            *
366 *************************************************/
367
368 /* This function is called recursively in many circumstances. Whenever it
369 returns a negative (error) response, the outer incarnation must also return the
370 same response.
371
372 Performance note: It might be tempting to extract commonly used fields from the
373 md structure (e.g. utf8, end_subject) into individual variables to improve
374 performance. Tests using gcc on a SPARC disproved this; in the first case, it
375 made performance worse.
376
377 Arguments:
378    eptr        pointer to current character in subject
379    ecode       pointer to current position in compiled code
380    mstart      pointer to the current match start position (can be modified
381                  by encountering \K)
382    offset_top  current top pointer
383    md          pointer to "static" info for the match
384    ims         current /i, /m, and /s options
385    eptrb       pointer to chain of blocks containing eptr at start of
386                  brackets - for testing for empty matches
387    flags       can contain
388                  match_condassert - this is an assertion condition
389                  match_cbegroup - this is the start of an unlimited repeat
390                    group that can match an empty string
391    rdepth      the recursion depth
392
393 Returns:       MATCH_MATCH if matched            )  these values are >= 0
394                MATCH_NOMATCH if failed to match  )
395                a negative PCRE_ERROR_xxx value if aborted by an error condition
396                  (e.g. stopped by repeated call or recursion limit)
397 */
398
399 static int
400 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
401   int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
402   int flags, unsigned int rdepth)
403 {
404 /* These variables do not need to be preserved over recursion in this function,
405 so they can be ordinary variables in all cases. Mark some of them with
406 "register" because they are used a lot in loops. */
407
408 register int  rrc;         /* Returns from recursive calls */
409 register int  i;           /* Used for loops not involving calls to RMATCH() */
410 register unsigned int c;   /* Character values not kept over RMATCH() calls */
411 register BOOL utf8;        /* Local copy of UTF-8 flag for speed */
412
413 BOOL minimize, possessive; /* Quantifier options */
414
415 /* When recursion is not being used, all "local" variables that have to be
416 preserved over calls to RMATCH() are part of a "frame" which is obtained from
417 heap storage. Set up the top-level frame here; others are obtained from the
418 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
419
420 #ifdef NO_RECURSE
421 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
422 frame->Xprevframe = NULL;            /* Marks the top level */
423
424 /* Copy in the original argument variables */
425
426 frame->Xeptr = eptr;
427 frame->Xecode = ecode;
428 frame->Xmstart = mstart;
429 frame->Xoffset_top = offset_top;
430 frame->Xims = ims;
431 frame->Xeptrb = eptrb;
432 frame->Xflags = flags;
433 frame->Xrdepth = rdepth;
434
435 /* This is where control jumps back to to effect "recursion" */
436
437 HEAP_RECURSE:
438
439 /* Macros make the argument variables come from the current frame */
440
441 #define eptr               frame->Xeptr
442 #define ecode              frame->Xecode
443 #define mstart             frame->Xmstart
444 #define offset_top         frame->Xoffset_top
445 #define ims                frame->Xims
446 #define eptrb              frame->Xeptrb
447 #define flags              frame->Xflags
448 #define rdepth             frame->Xrdepth
449
450 /* Ditto for the local variables */
451
452 #ifdef SUPPORT_UTF8
453 #define charptr            frame->Xcharptr
454 #endif
455 #define callpat            frame->Xcallpat
456 #define data               frame->Xdata
457 #define next               frame->Xnext
458 #define pp                 frame->Xpp
459 #define prev               frame->Xprev
460 #define saved_eptr         frame->Xsaved_eptr
461
462 #define new_recursive      frame->Xnew_recursive
463
464 #define cur_is_word        frame->Xcur_is_word
465 #define condition          frame->Xcondition
466 #define prev_is_word       frame->Xprev_is_word
467
468 #define original_ims       frame->Xoriginal_ims
469
470 #ifdef SUPPORT_UCP
471 #define prop_type          frame->Xprop_type
472 #define prop_value         frame->Xprop_value
473 #define prop_fail_result   frame->Xprop_fail_result
474 #define prop_category      frame->Xprop_category
475 #define prop_chartype      frame->Xprop_chartype
476 #define prop_script        frame->Xprop_script
477 #define oclength           frame->Xoclength
478 #define occhars            frame->Xocchars
479 #endif
480
481 #define ctype              frame->Xctype
482 #define fc                 frame->Xfc
483 #define fi                 frame->Xfi
484 #define length             frame->Xlength
485 #define max                frame->Xmax
486 #define min                frame->Xmin
487 #define number             frame->Xnumber
488 #define offset             frame->Xoffset
489 #define op                 frame->Xop
490 #define save_capture_last  frame->Xsave_capture_last
491 #define save_offset1       frame->Xsave_offset1
492 #define save_offset2       frame->Xsave_offset2
493 #define save_offset3       frame->Xsave_offset3
494 #define stacksave          frame->Xstacksave
495
496 #define newptrb            frame->Xnewptrb
497
498 /* When recursion is being used, local variables are allocated on the stack and
499 get preserved during recursion in the normal way. In this environment, fi and
500 i, and fc and c, can be the same variables. */
501
502 #else         /* NO_RECURSE not defined */
503 #define fi i
504 #define fc c
505
506
507 #ifdef SUPPORT_UTF8                /* Many of these variables are used only  */
508 const uschar *charptr;             /* in small blocks of the code. My normal */
509 #endif                             /* style of coding would have declared    */
510 const uschar *callpat;             /* them within each of those blocks.      */
511 const uschar *data;                /* However, in order to accommodate the   */
512 const uschar *next;                /* version of this code that uses an      */
513 USPTR         pp;                  /* external "stack" implemented on the    */
514 const uschar *prev;                /* heap, it is easier to declare them all */
515 USPTR         saved_eptr;          /* here, so the declarations can be cut   */
516                                    /* out in a block. The only declarations  */
517 recursion_info new_recursive;      /* within blocks below are for variables  */
518                                    /* that do not have to be preserved over  */
519 BOOL cur_is_word;                  /* a recursive call to RMATCH().          */
520 BOOL condition;
521 BOOL prev_is_word;
522
523 unsigned long int original_ims;
524
525 #ifdef SUPPORT_UCP
526 int prop_type;
527 int prop_value;
528 int prop_fail_result;
529 int prop_category;
530 int prop_chartype;
531 int prop_script;
532 int oclength;
533 uschar occhars[8];
534 #endif
535
536 int ctype;
537 int length;
538 int max;
539 int min;
540 int number;
541 int offset;
542 int op;
543 int save_capture_last;
544 int save_offset1, save_offset2, save_offset3;
545 int stacksave[REC_STACK_SAVE_MAX];
546
547 eptrblock newptrb;
548 #endif     /* NO_RECURSE */
549
550 /* These statements are here to stop the compiler complaining about unitialized
551 variables. */
552
553 #ifdef SUPPORT_UCP
554 prop_value = 0;
555 prop_fail_result = 0;
556 #endif
557
558
559 /* This label is used for tail recursion, which is used in a few cases even
560 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
561 used. Thanks to Ian Taylor for noticing this possibility and sending the
562 original patch. */
563
564 TAIL_RECURSE:
565
566 /* OK, now we can get on with the real code of the function. Recursive calls
567 are specified by the macro RMATCH and RRETURN is used to return. When
568 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
569 and a "return", respectively (possibly with some debugging if DEBUG is
570 defined). However, RMATCH isn't like a function call because it's quite a
571 complicated macro. It has to be used in one particular way. This shouldn't,
572 however, impact performance when true recursion is being used. */
573
574 #ifdef SUPPORT_UTF8
575 utf8 = md->utf8;       /* Local copy of the flag */
576 #else
577 utf8 = FALSE;
578 #endif
579
580 /* First check that we haven't called match() too many times, or that we
581 haven't exceeded the recursive call limit. */
582
583 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
584 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
585
586 original_ims = ims;    /* Save for resetting on ')' */
587
588 /* At the start of a group with an unlimited repeat that may match an empty
589 string, the match_cbegroup flag is set. When this is the case, add the current
590 subject pointer to the chain of such remembered pointers, to be checked when we
591 hit the closing ket, in order to break infinite loops that match no characters.
592 When match() is called in other circumstances, don't add to the chain. The
593 match_cbegroup flag must NOT be used with tail recursion, because the memory
594 block that is used is on the stack, so a new one may be required for each
595 match(). */
596
597 if ((flags & match_cbegroup) != 0)
598   {
599   newptrb.epb_saved_eptr = eptr;
600   newptrb.epb_prev = eptrb;
601   eptrb = &newptrb;
602   }
603
604 /* Now start processing the opcodes. */
605
606 for (;;)
607   {
608   minimize = possessive = FALSE;
609   op = *ecode;
610
611   /* For partial matching, remember if we ever hit the end of the subject after
612   matching at least one subject character. */
613
614   if (md->partial &&
615       eptr >= md->end_subject &&
616       eptr > mstart)
617     md->hitend = TRUE;
618
619   switch(op)
620     {
621     case OP_FAIL:
622     RRETURN(MATCH_NOMATCH);
623
624     case OP_PRUNE:
625     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
626       ims, eptrb, flags, RM51);
627     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
628     RRETURN(MATCH_PRUNE);
629
630     case OP_COMMIT:
631     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
632       ims, eptrb, flags, RM52);
633     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
634     RRETURN(MATCH_COMMIT);
635
636     case OP_SKIP:
637     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
638       ims, eptrb, flags, RM53);
639     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
640     md->start_match_ptr = eptr;   /* Pass back current position */
641     RRETURN(MATCH_SKIP);
642
643     case OP_THEN:
644     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
645       ims, eptrb, flags, RM54);
646     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
647     RRETURN(MATCH_THEN);
648
649     /* Handle a capturing bracket. If there is space in the offset vector, save
650     the current subject position in the working slot at the top of the vector.
651     We mustn't change the current values of the data slot, because they may be
652     set from a previous iteration of this group, and be referred to by a
653     reference inside the group.
654
655     If the bracket fails to match, we need to restore this value and also the
656     values of the final offsets, in case they were set by a previous iteration
657     of the same bracket.
658
659     If there isn't enough space in the offset vector, treat this as if it were
660     a non-capturing bracket. Don't worry about setting the flag for the error
661     case here; that is handled in the code for KET. */
662
663     case OP_CBRA:
664     case OP_SCBRA:
665     number = GET2(ecode, 1+LINK_SIZE);
666     offset = number << 1;
667
668 #ifdef DEBUG
669     printf("start bracket %d\n", number);
670     printf("subject=");
671     pchars(eptr, 16, TRUE, md);
672     printf("\n");
673 #endif
674
675     if (offset < md->offset_max)
676       {
677       save_offset1 = md->offset_vector[offset];
678       save_offset2 = md->offset_vector[offset+1];
679       save_offset3 = md->offset_vector[md->offset_end - number];
680       save_capture_last = md->capture_last;
681
682       DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
683       md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
684
685       flags = (op == OP_SCBRA)? match_cbegroup : 0;
686       do
687         {
688         RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
689           ims, eptrb, flags, RM1);
690         if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
691         md->capture_last = save_capture_last;
692         ecode += GET(ecode, 1);
693         }
694       while (*ecode == OP_ALT);
695
696       DPRINTF(("bracket %d failed\n", number));
697
698       md->offset_vector[offset] = save_offset1;
699       md->offset_vector[offset+1] = save_offset2;
700       md->offset_vector[md->offset_end - number] = save_offset3;
701
702       RRETURN(MATCH_NOMATCH);
703       }
704
705     /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
706     as a non-capturing bracket. */
707
708     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
709     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
710
711     DPRINTF(("insufficient capture room: treat as non-capturing\n"));
712
713     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
714     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
715
716     /* Non-capturing bracket. Loop for all the alternatives. When we get to the
717     final alternative within the brackets, we would return the result of a
718     recursive call to match() whatever happened. We can reduce stack usage by
719     turning this into a tail recursion, except in the case when match_cbegroup
720     is set.*/
721
722     case OP_BRA:
723     case OP_SBRA:
724     DPRINTF(("start non-capturing bracket\n"));
725     flags = (op >= OP_SBRA)? match_cbegroup : 0;
726     for (;;)
727       {
728       if (ecode[GET(ecode, 1)] != OP_ALT)   /* Final alternative */
729         {
730         if (flags == 0)    /* Not a possibly empty group */
731           {
732           ecode += _pcre_OP_lengths[*ecode];
733           DPRINTF(("bracket 0 tail recursion\n"));
734           goto TAIL_RECURSE;
735           }
736
737         /* Possibly empty group; can't use tail recursion. */
738
739         RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
740           eptrb, flags, RM48);
741         RRETURN(rrc);
742         }
743
744       /* For non-final alternatives, continue the loop for a NOMATCH result;
745       otherwise return. */
746
747       RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
748         eptrb, flags, RM2);
749       if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
750       ecode += GET(ecode, 1);
751       }
752     /* Control never reaches here. */
753
754     /* Conditional group: compilation checked that there are no more than
755     two branches. If the condition is false, skipping the first branch takes us
756     past the end if there is only one branch, but that's OK because that is
757     exactly what going to the ket would do. As there is only one branch to be
758     obeyed, we can use tail recursion to avoid using another stack frame. */
759
760     case OP_COND:
761     case OP_SCOND:
762     if (ecode[LINK_SIZE+1] == OP_RREF)         /* Recursion test */
763       {
764       offset = GET2(ecode, LINK_SIZE + 2);     /* Recursion group number*/
765       condition = md->recursive != NULL &&
766         (offset == RREF_ANY || offset == md->recursive->group_num);
767       ecode += condition? 3 : GET(ecode, 1);
768       }
769
770     else if (ecode[LINK_SIZE+1] == OP_CREF)    /* Group used test */
771       {
772       offset = GET2(ecode, LINK_SIZE+2) << 1;  /* Doubled ref number */
773       condition = offset < offset_top && md->offset_vector[offset] >= 0;
774       ecode += condition? 3 : GET(ecode, 1);
775       }
776
777     else if (ecode[LINK_SIZE+1] == OP_DEF)     /* DEFINE - always false */
778       {
779       condition = FALSE;
780       ecode += GET(ecode, 1);
781       }
782
783     /* The condition is an assertion. Call match() to evaluate it - setting
784     the final argument match_condassert causes it to stop at the end of an
785     assertion. */
786
787     else
788       {
789       RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
790           match_condassert, RM3);
791       if (rrc == MATCH_MATCH)
792         {
793         condition = TRUE;
794         ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
795         while (*ecode == OP_ALT) ecode += GET(ecode, 1);
796         }
797       else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
798         {
799         RRETURN(rrc);         /* Need braces because of following else */
800         }
801       else
802         {
803         condition = FALSE;
804         ecode += GET(ecode, 1);
805         }
806       }
807
808     /* We are now at the branch that is to be obeyed. As there is only one,
809     we can use tail recursion to avoid using another stack frame, except when
810     match_cbegroup is required for an unlimited repeat of a possibly empty
811     group. If the second alternative doesn't exist, we can just plough on. */
812
813     if (condition || *ecode == OP_ALT)
814       {
815       ecode += 1 + LINK_SIZE;
816       if (op == OP_SCOND)        /* Possibly empty group */
817         {
818         RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
819         RRETURN(rrc);
820         }
821       else                       /* Group must match something */
822         {
823         flags = 0;
824         goto TAIL_RECURSE;
825         }
826       }
827     else                         /* Condition false & no 2nd alternative */
828       {
829       ecode += 1 + LINK_SIZE;
830       }
831     break;
832
833
834     /* End of the pattern, either real or forced. If we are in a top-level
835     recursion, we should restore the offsets appropriately and continue from
836     after the call. */
837
838     case OP_ACCEPT:
839     case OP_END:
840     if (md->recursive != NULL && md->recursive->group_num == 0)
841       {
842       recursion_info *rec = md->recursive;
843       DPRINTF(("End of pattern in a (?0) recursion\n"));
844       md->recursive = rec->prevrec;
845       memmove(md->offset_vector, rec->offset_save,
846         rec->saved_max * sizeof(int));
847       mstart = rec->save_start;
848       ims = original_ims;
849       ecode = rec->after_call;
850       break;
851       }
852
853     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
854     string - backtracking will then try other alternatives, if any. */
855
856     if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
857     md->end_match_ptr = eptr;           /* Record where we ended */
858     md->end_offset_top = offset_top;    /* and how many extracts were taken */
859     md->start_match_ptr = mstart;       /* and the start (\K can modify) */
860     RRETURN(MATCH_MATCH);
861
862     /* Change option settings */
863
864     case OP_OPT:
865     ims = ecode[1];
866     ecode += 2;
867     DPRINTF(("ims set to %02lx\n", ims));
868     break;
869
870     /* Assertion brackets. Check the alternative branches in turn - the
871     matching won't pass the KET for an assertion. If any one branch matches,
872     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
873     start of each branch to move the current point backwards, so the code at
874     this level is identical to the lookahead case. */
875
876     case OP_ASSERT:
877     case OP_ASSERTBACK:
878     do
879       {
880       RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
881         RM4);
882       if (rrc == MATCH_MATCH) break;
883       if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
884       ecode += GET(ecode, 1);
885       }
886     while (*ecode == OP_ALT);
887     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
888
889     /* If checking an assertion for a condition, return MATCH_MATCH. */
890
891     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
892
893     /* Continue from after the assertion, updating the offsets high water
894     mark, since extracts may have been taken during the assertion. */
895
896     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
897     ecode += 1 + LINK_SIZE;
898     offset_top = md->end_offset_top;
899     continue;
900
901     /* Negative assertion: all branches must fail to match */
902
903     case OP_ASSERT_NOT:
904     case OP_ASSERTBACK_NOT:
905     do
906       {
907       RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
908         RM5);
909       if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
910       if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
911       ecode += GET(ecode,1);
912       }
913     while (*ecode == OP_ALT);
914
915     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
916
917     ecode += 1 + LINK_SIZE;
918     continue;
919
920     /* Move the subject pointer back. This occurs only at the start of
921     each branch of a lookbehind assertion. If we are too close to the start to
922     move back, this match function fails. When working with UTF-8 we move
923     back a number of characters, not bytes. */
924
925     case OP_REVERSE:
926 #ifdef SUPPORT_UTF8
927     if (utf8)
928       {
929       i = GET(ecode, 1);
930       while (i-- > 0)
931         {
932         eptr--;
933         if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
934         BACKCHAR(eptr);
935         }
936       }
937     else
938 #endif
939
940     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
941
942       {
943       eptr -= GET(ecode, 1);
944       if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
945       }
946
947     /* Skip to next op code */
948
949     ecode += 1 + LINK_SIZE;
950     break;
951
952     /* The callout item calls an external function, if one is provided, passing
953     details of the match so far. This is mainly for debugging, though the
954     function is able to force a failure. */
955
956     case OP_CALLOUT:
957     if (pcre_callout != NULL)
958       {
959       pcre_callout_block cb;
960       cb.version          = 1;   /* Version 1 of the callout block */
961       cb.callout_number   = ecode[1];
962       cb.offset_vector    = md->offset_vector;
963       cb.subject          = (PCRE_SPTR)md->start_subject;
964       cb.subject_length   = md->end_subject - md->start_subject;
965       cb.start_match      = mstart - md->start_subject;
966       cb.current_position = eptr - md->start_subject;
967       cb.pattern_position = GET(ecode, 2);
968       cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
969       cb.capture_top      = offset_top/2;
970       cb.capture_last     = md->capture_last;
971       cb.callout_data     = md->callout_data;
972       if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
973       if (rrc < 0) RRETURN(rrc);
974       }
975     ecode += 2 + 2*LINK_SIZE;
976     break;
977
978     /* Recursion either matches the current regex, or some subexpression. The
979     offset data is the offset to the starting bracket from the start of the
980     whole pattern. (This is so that it works from duplicated subpatterns.)
981
982     If there are any capturing brackets started but not finished, we have to
983     save their starting points and reinstate them after the recursion. However,
984     we don't know how many such there are (offset_top records the completed
985     total) so we just have to save all the potential data. There may be up to
986     65535 such values, which is too large to put on the stack, but using malloc
987     for small numbers seems expensive. As a compromise, the stack is used when
988     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
989     is used. A problem is what to do if the malloc fails ... there is no way of
990     returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
991     values on the stack, and accept that the rest may be wrong.
992
993     There are also other values that have to be saved. We use a chained
994     sequence of blocks that actually live on the stack. Thanks to Robin Houston
995     for the original version of this logic. */
996
997     case OP_RECURSE:
998       {
999       callpat = md->start_code + GET(ecode, 1);
1000       new_recursive.group_num = (callpat == md->start_code)? 0 :
1001         GET2(callpat, 1 + LINK_SIZE);
1002
1003       /* Add to "recursing stack" */
1004
1005       new_recursive.prevrec = md->recursive;
1006       md->recursive = &new_recursive;
1007
1008       /* Find where to continue from afterwards */
1009
1010       ecode += 1 + LINK_SIZE;
1011       new_recursive.after_call = ecode;
1012
1013       /* Now save the offset data. */
1014
1015       new_recursive.saved_max = md->offset_end;
1016       if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1017         new_recursive.offset_save = stacksave;
1018       else
1019         {
1020         new_recursive.offset_save =
1021           (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1022         if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1023         }
1024
1025       memcpy(new_recursive.offset_save, md->offset_vector,
1026             new_recursive.saved_max * sizeof(int));
1027       new_recursive.save_start = mstart;
1028       mstart = eptr;
1029
1030       /* OK, now we can do the recursion. For each top-level alternative we
1031       restore the offset and recursion data. */
1032
1033       DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1034       flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1035       do
1036         {
1037         RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1038           md, ims, eptrb, flags, RM6);
1039         if (rrc == MATCH_MATCH)
1040           {
1041           DPRINTF(("Recursion matched\n"));
1042           md->recursive = new_recursive.prevrec;
1043           if (new_recursive.offset_save != stacksave)
1044             (pcre_free)(new_recursive.offset_save);
1045           RRETURN(MATCH_MATCH);
1046           }
1047         else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1048           {
1049           DPRINTF(("Recursion gave error %d\n", rrc));
1050           RRETURN(rrc);
1051           }
1052
1053         md->recursive = &new_recursive;
1054         memcpy(md->offset_vector, new_recursive.offset_save,
1055             new_recursive.saved_max * sizeof(int));
1056         callpat += GET(callpat, 1);
1057         }
1058       while (*callpat == OP_ALT);
1059
1060       DPRINTF(("Recursion didn't match\n"));
1061       md->recursive = new_recursive.prevrec;
1062       if (new_recursive.offset_save != stacksave)
1063         (pcre_free)(new_recursive.offset_save);
1064       RRETURN(MATCH_NOMATCH);
1065       }
1066     /* Control never reaches here */
1067
1068     /* "Once" brackets are like assertion brackets except that after a match,
1069     the point in the subject string is not moved back. Thus there can never be
1070     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1071     Check the alternative branches in turn - the matching won't pass the KET
1072     for this kind of subpattern. If any one branch matches, we carry on as at
1073     the end of a normal bracket, leaving the subject pointer. */
1074
1075     case OP_ONCE:
1076     prev = ecode;
1077     saved_eptr = eptr;
1078
1079     do
1080       {
1081       RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1082       if (rrc == MATCH_MATCH) break;
1083       if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1084       ecode += GET(ecode,1);
1085       }
1086     while (*ecode == OP_ALT);
1087
1088     /* If hit the end of the group (which could be repeated), fail */
1089
1090     if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1091
1092     /* Continue as from after the assertion, updating the offsets high water
1093     mark, since extracts may have been taken. */
1094
1095     do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1096
1097     offset_top = md->end_offset_top;
1098     eptr = md->end_match_ptr;
1099
1100     /* For a non-repeating ket, just continue at this level. This also
1101     happens for a repeating ket if no characters were matched in the group.
1102     This is the forcible breaking of infinite loops as implemented in Perl
1103     5.005. If there is an options reset, it will get obeyed in the normal
1104     course of events. */
1105
1106     if (*ecode == OP_KET || eptr == saved_eptr)
1107       {
1108       ecode += 1+LINK_SIZE;
1109       break;
1110       }
1111
1112     /* The repeating kets try the rest of the pattern or restart from the
1113     preceding bracket, in the appropriate order. The second "call" of match()
1114     uses tail recursion, to avoid using another stack frame. We need to reset
1115     any options that changed within the bracket before re-running it, so
1116     check the next opcode. */
1117
1118     if (ecode[1+LINK_SIZE] == OP_OPT)
1119       {
1120       ims = (ims & ~PCRE_IMS) | ecode[4];
1121       DPRINTF(("ims set to %02lx at group repeat\n", ims));
1122       }
1123
1124     if (*ecode == OP_KETRMIN)
1125       {
1126       RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1127       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1128       ecode = prev;
1129       flags = 0;
1130       goto TAIL_RECURSE;
1131       }
1132     else  /* OP_KETRMAX */
1133       {
1134       RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1135       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1136       ecode += 1 + LINK_SIZE;
1137       flags = 0;
1138       goto TAIL_RECURSE;
1139       }
1140     /* Control never gets here */
1141
1142     /* An alternation is the end of a branch; scan along to find the end of the
1143     bracketed group and go to there. */
1144
1145     case OP_ALT:
1146     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1147     break;
1148
1149     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1150     that it may occur zero times. It may repeat infinitely, or not at all -
1151     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1152     repeat limits are compiled as a number of copies, with the optional ones
1153     preceded by BRAZERO or BRAMINZERO. */
1154
1155     case OP_BRAZERO:
1156       {
1157       next = ecode+1;
1158       RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1159       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1160       do next += GET(next,1); while (*next == OP_ALT);
1161       ecode = next + 1 + LINK_SIZE;
1162       }
1163     break;
1164
1165     case OP_BRAMINZERO:
1166       {
1167       next = ecode+1;
1168       do next += GET(next, 1); while (*next == OP_ALT);
1169       RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1170       if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1171       ecode++;
1172       }
1173     break;
1174
1175     /* End of a group, repeated or non-repeating. */
1176
1177     case OP_KET:
1178     case OP_KETRMIN:
1179     case OP_KETRMAX:
1180     prev