| 1 |
/************************************************* |
|---|
| 2 |
* Perl-Compatible Regular Expressions * |
|---|
| 3 |
*************************************************/ |
|---|
| 4 |
|
|---|
| 5 |
/* PCRE is a library of functions to support regular expressions whose syntax |
|---|
| 6 |
and semantics are as close as possible to those of the Perl 5 language. |
|---|
| 7 |
|
|---|
| 8 |
Written by Philip Hazel |
|---|
| 9 |
Copyright (c) 1997-2007 University of Cambridge |
|---|
| 10 |
|
|---|
| 11 |
----------------------------------------------------------------------------- |
|---|
| 12 |
Redistribution and use in source and binary forms, with or without |
|---|
| 13 |
modification, are permitted provided that the following conditions are met: |
|---|
| 14 |
|
|---|
| 15 |
* Redistributions of source code must retain the above copyright notice, |
|---|
| 16 |
this list of conditions and the following disclaimer. |
|---|
| 17 |
|
|---|
| 18 |
* Redistributions in binary form must reproduce the above copyright |
|---|
| 19 |
notice, this list of conditions and the following disclaimer in the |
|---|
| 20 |
documentation and/or other materials provided with the distribution. |
|---|
| 21 |
|
|---|
| 22 |
* Neither the name of the University of Cambridge nor the names of its |
|---|
| 23 |
contributors may be used to endorse or promote products derived from |
|---|
| 24 |
this software without specific prior written permission. |
|---|
| 25 |
|
|---|
| 26 |
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
|---|
| 27 |
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|---|
| 28 |
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|---|
| 29 |
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
|---|
| 30 |
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
|---|
| 31 |
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
|---|
| 32 |
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|---|
| 33 |
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
|---|
| 34 |
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
|---|
| 35 |
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
|---|
| 36 |
POSSIBILITY OF SUCH DAMAGE. |
|---|
| 37 |
----------------------------------------------------------------------------- |
|---|
| 38 |
*/ |
|---|
| 39 |
|
|---|
| 40 |
|
|---|
| 41 |
/* This module contains pcre_exec(), the externally visible function that does |
|---|
| 42 |
pattern matching using an NFA algorithm, trying to mimic Perl as closely as |
|---|
| 43 |
possible. There are also some static supporting functions. */ |
|---|
| 44 |
|
|---|
| 45 |
#include "local_config.h" |
|---|
| 46 |
|
|---|
| 47 |
#define NLBLOCK md /* Block containing newline information */ |
|---|
| 48 |
#define PSSTART start_subject /* Field containing processed string start */ |
|---|
| 49 |
#define PSEND end_subject /* Field containing processed string end */ |
|---|
| 50 |
|
|---|
| 51 |
#include "pcre_internal.h" |
|---|
| 52 |
|
|---|
| 53 |
/* Undefine some potentially clashing cpp symbols */ |
|---|
| 54 |
|
|---|
| 55 |
#undef min |
|---|
| 56 |
#undef max |
|---|
| 57 |
|
|---|
| 58 |
/* Flag bits for the match() function */ |
|---|
| 59 |
|
|---|
| 60 |
#define match_condassert 0x01 /* Called to check a condition assertion */ |
|---|
| 61 |
#define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */ |
|---|
| 62 |
|
|---|
| 63 |
/* Non-error returns from the match() function. Error returns are externally |
|---|
| 64 |
defined PCRE_ERROR_xxx codes, which are all negative. */ |
|---|
| 65 |
|
|---|
| 66 |
#define MATCH_MATCH 1 |
|---|
| 67 |
#define MATCH_NOMATCH 0 |
|---|
| 68 |
|
|---|
| 69 |
/* Special internal returns from the match() function. Make them sufficiently |
|---|
| 70 |
negative to avoid the external error codes. */ |
|---|
| 71 |
|
|---|
| 72 |
#define MATCH_COMMIT (-999) |
|---|
| 73 |
#define MATCH_PRUNE (-998) |
|---|
| 74 |
#define MATCH_SKIP (-997) |
|---|
| 75 |
#define MATCH_THEN (-996) |
|---|
| 76 |
|
|---|
| 77 |
/* Maximum number of ints of offset to save on the stack for recursive calls. |
|---|
| 78 |
If the offset vector is bigger, malloc is used. This should be a multiple of 3, |
|---|
| 79 |
because the offset vector is always a multiple of 3 long. */ |
|---|
| 80 |
|
|---|
| 81 |
#define REC_STACK_SAVE_MAX 30 |
|---|
| 82 |
|
|---|
| 83 |
/* Min and max values for the common repeats; for the maxima, 0 => infinity */ |
|---|
| 84 |
|
|---|
| 85 |
static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; |
|---|
| 86 |
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; |
|---|
| 87 |
|
|---|
| 88 |
|
|---|
| 89 |
|
|---|
| 90 |
#ifdef DEBUG |
|---|
| 91 |
/************************************************* |
|---|
| 92 |
* Debugging function to print chars * |
|---|
| 93 |
*************************************************/ |
|---|
| 94 |
|
|---|
| 95 |
/* Print a sequence of chars in printable format, stopping at the end of the |
|---|
| 96 |
subject if the requested. |
|---|
| 97 |
|
|---|
| 98 |
Arguments: |
|---|
| 99 |
p points to characters |
|---|
| 100 |
length number to print |
|---|
| 101 |
is_subject TRUE if printing from within md->start_subject |
|---|
| 102 |
md pointer to matching data block, if is_subject is TRUE |
|---|
| 103 |
|
|---|
| 104 |
Returns: nothing |
|---|
| 105 |
*/ |
|---|
| 106 |
|
|---|
| 107 |
static void |
|---|
| 108 |
pchars(const uschar *p, int length, BOOL is_subject, match_data *md) |
|---|
| 109 |
{ |
|---|
| 110 |
unsigned int c; |
|---|
| 111 |
if (is_subject && length > md->end_subject - p) length = md->end_subject - p; |
|---|
| 112 |
while (length-- > 0) |
|---|
| 113 |
if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c); |
|---|
| 114 |
} |
|---|
| 115 |
#endif |
|---|
| 116 |
|
|---|
| 117 |
|
|---|
| 118 |
|
|---|
| 119 |
/************************************************* |
|---|
| 120 |
* Match a back-reference * |
|---|
| 121 |
*************************************************/ |
|---|
| 122 |
|
|---|
| 123 |
/* If a back reference hasn't been set, the length that is passed is greater |
|---|
| 124 |
than the number of characters left in the string, so the match fails. |
|---|
| 125 |
|
|---|
| 126 |
Arguments: |
|---|
| 127 |
offset index into the offset vector |
|---|
| 128 |
eptr points into the subject |
|---|
| 129 |
length length to be matched |
|---|
| 130 |
md points to match data block |
|---|
| 131 |
ims the ims flags |
|---|
| 132 |
|
|---|
| 133 |
Returns: TRUE if matched |
|---|
| 134 |
*/ |
|---|
| 135 |
|
|---|
| 136 |
static BOOL |
|---|
| 137 |
match_ref(int offset, register USPTR eptr, int length, match_data *md, |
|---|
| 138 |
unsigned long int ims) |
|---|
| 139 |
{ |
|---|
| 140 |
USPTR p = md->start_subject + md->offset_vector[offset]; |
|---|
| 141 |
|
|---|
| 142 |
#ifdef DEBUG |
|---|
| 143 |
if (eptr >= md->end_subject) |
|---|
| 144 |
printf("matching subject <null>"); |
|---|
| 145 |
else |
|---|
| 146 |
{ |
|---|
| 147 |
printf("matching subject "); |
|---|
| 148 |
pchars(eptr, length, TRUE, md); |
|---|
| 149 |
} |
|---|
| 150 |
printf(" against backref "); |
|---|
| 151 |
pchars(p, length, FALSE, md); |
|---|
| 152 |
printf("\n"); |
|---|
| 153 |
#endif |
|---|
| 154 |
|
|---|
| 155 |
/* Always fail if not enough characters left */ |
|---|
| 156 |
|
|---|
| 157 |
if (length > md->end_subject - eptr) return FALSE; |
|---|
| 158 |
|
|---|
| 159 |
/* Separate the caselesss case for speed */ |
|---|
| 160 |
|
|---|
| 161 |
if ((ims & PCRE_CASELESS) != 0) |
|---|
| 162 |
{ |
|---|
| 163 |
while (length-- > 0) |
|---|
| 164 |
if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; |
|---|
| 165 |
} |
|---|
| 166 |
else |
|---|
| 167 |
{ while (length-- > 0) if (*p++ != *eptr++) return FALSE; } |
|---|
| 168 |
|
|---|
| 169 |
return TRUE; |
|---|
| 170 |
} |
|---|
| 171 |
|
|---|
| 172 |
|
|---|
| 173 |
|
|---|
| 174 |
/*************************************************************************** |
|---|
| 175 |
**************************************************************************** |
|---|
| 176 |
RECURSION IN THE match() FUNCTION |
|---|
| 177 |
|
|---|
| 178 |
The match() function is highly recursive, though not every recursive call |
|---|
| 179 |
increases the recursive depth. Nevertheless, some regular expressions can cause |
|---|
| 180 |
it to recurse to a great depth. I was writing for Unix, so I just let it call |
|---|
| 181 |
itself recursively. This uses the stack for saving everything that has to be |
|---|
| 182 |
saved for a recursive call. On Unix, the stack can be large, and this works |
|---|
| 183 |
fine. |
|---|
| 184 |
|
|---|
| 185 |
It turns out that on some non-Unix-like systems there are problems with |
|---|
| 186 |
programs that use a lot of stack. (This despite the fact that every last chip |
|---|
| 187 |
has oodles of memory these days, and techniques for extending the stack have |
|---|
| 188 |
been known for decades.) So.... |
|---|
| 189 |
|
|---|
| 190 |
There is a fudge, triggered by defining NO_RECURSE, which avoids recursive |
|---|
| 191 |
calls by keeping local variables that need to be preserved in blocks of memory |
|---|
| 192 |
obtained from malloc() instead instead of on the stack. Macros are used to |
|---|
| 193 |
achieve this so that the actual code doesn't look very different to what it |
|---|
| 194 |
always used to. |
|---|
| 195 |
|
|---|
| 196 |
The original heap-recursive code used longjmp(). However, it seems that this |
|---|
| 197 |
can be very slow on some operating systems. Following a suggestion from Stan |
|---|
| 198 |
Switzer, the use of longjmp() has been abolished, at the cost of having to |
|---|
| 199 |
provide a unique number for each call to RMATCH. There is no way of generating |
|---|
| 200 |
a sequence of numbers at compile time in C. I have given them names, to make |
|---|
| 201 |
them stand out more clearly. |
|---|
| 202 |
|
|---|
| 203 |
Crude tests on x86 Linux show a small speedup of around 5-8%. However, on |
|---|
| 204 |
FreeBSD, avoiding longjmp() more than halves the time taken to run the standard |
|---|
| 205 |
tests. Furthermore, not using longjmp() means that local dynamic variables |
|---|
| 206 |
don't have indeterminate values; this has meant that the frame size can be |
|---|
| 207 |
reduced because the result can be "passed back" by straight setting of the |
|---|
| 208 |
variable instead of being passed in the frame. |
|---|
| 209 |
**************************************************************************** |
|---|
| 210 |
***************************************************************************/ |
|---|
| 211 |
|
|---|
| 212 |
/* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN |
|---|
| 213 |
below must be updated in sync. */ |
|---|
| 214 |
|
|---|
| 215 |
enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10, |
|---|
| 216 |
RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20, |
|---|
| 217 |
RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30, |
|---|
| 218 |
RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40, |
|---|
| 219 |
RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50, |
|---|
| 220 |
RM51, RM52, RM53, RM54 }; |
|---|
| 221 |
|
|---|
| 222 |
/* These versions of the macros use the stack, as normal. There are debugging |
|---|
| 223 |
versions and production versions. Note that the "rw" argument of RMATCH isn't |
|---|
| 224 |
actuall used in this definition. */ |
|---|
| 225 |
|
|---|
| 226 |
#ifndef NO_RECURSE |
|---|
| 227 |
#define REGISTER register |
|---|
| 228 |
|
|---|
| 229 |
#ifdef DEBUG |
|---|
| 230 |
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \ |
|---|
| 231 |
{ \ |
|---|
| 232 |
printf("match() called in line %d\n", __LINE__); \ |
|---|
| 233 |
rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \ |
|---|
| 234 |
printf("to line %d\n", __LINE__); \ |
|---|
| 235 |
} |
|---|
| 236 |
#define RRETURN(ra) \ |
|---|
| 237 |
{ \ |
|---|
| 238 |
printf("match() returned %d from line %d ", ra, __LINE__); \ |
|---|
| 239 |
return ra; \ |
|---|
| 240 |
} |
|---|
| 241 |
#else |
|---|
| 242 |
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \ |
|---|
| 243 |
rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1) |
|---|
| 244 |
#define RRETURN(ra) return ra |
|---|
| 245 |
#endif |
|---|
| 246 |
|
|---|
| 247 |
#else |
|---|
| 248 |
|
|---|
| 249 |
|
|---|
| 250 |
/* These versions of the macros manage a private stack on the heap. Note that |
|---|
| 251 |
the "rd" argument of RMATCH isn't actually used in this definition. It's the md |
|---|
| 252 |
argument of match(), which never changes. */ |
|---|
| 253 |
|
|---|
| 254 |
#define REGISTER |
|---|
| 255 |
|
|---|
| 256 |
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\ |
|---|
| 257 |
{\ |
|---|
| 258 |
heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\ |
|---|
| 259 |
frame->Xwhere = rw; \ |
|---|
| 260 |
newframe->Xeptr = ra;\ |
|---|
| 261 |
newframe->Xecode = rb;\ |
|---|
| 262 |
newframe->Xmstart = mstart;\ |
|---|
| 263 |
newframe->Xoffset_top = rc;\ |
|---|
| 264 |
newframe->Xims = re;\ |
|---|
| 265 |
newframe->Xeptrb = rf;\ |
|---|
| 266 |
newframe->Xflags = rg;\ |
|---|
| 267 |
newframe->Xrdepth = frame->Xrdepth + 1;\ |
|---|
| 268 |
newframe->Xprevframe = frame;\ |
|---|
| 269 |
frame = newframe;\ |
|---|
| 270 |
DPRINTF(("restarting from line %d\n", __LINE__));\ |
|---|
| 271 |
goto HEAP_RECURSE;\ |
|---|
| 272 |
L_##rw:\ |
|---|
| 273 |
DPRINTF(("jumped back to line %d\n", __LINE__));\ |
|---|
| 274 |
} |
|---|
| 275 |
|
|---|
| 276 |
#define RRETURN(ra)\ |
|---|
| 277 |
{\ |
|---|
| 278 |
heapframe *newframe = frame;\ |
|---|
| 279 |
frame = newframe->Xprevframe;\ |
|---|
| 280 |
(pcre_stack_free)(newframe);\ |
|---|
| 281 |
if (frame != NULL)\ |
|---|
| 282 |
{\ |
|---|
| 283 |
rrc = ra;\ |
|---|
| 284 |
goto HEAP_RETURN;\ |
|---|
| 285 |
}\ |
|---|
| 286 |
return ra;\ |
|---|
| 287 |
} |
|---|
| 288 |
|
|---|
| 289 |
|
|---|
| 290 |
/* Structure for remembering the local variables in a private frame */ |
|---|
| 291 |
|
|---|
| 292 |
typedef struct heapframe { |
|---|
| 293 |
struct heapframe *Xprevframe; |
|---|
| 294 |
|
|---|
| 295 |
/* Function arguments that may change */ |
|---|
| 296 |
|
|---|
| 297 |
const uschar *Xeptr; |
|---|
| 298 |
const uschar *Xecode; |
|---|
| 299 |
const uschar *Xmstart; |
|---|
| 300 |
int Xoffset_top; |
|---|
| 301 |
long int Xims; |
|---|
| 302 |
eptrblock *Xeptrb; |
|---|
| 303 |
int Xflags; |
|---|
| 304 |
unsigned int Xrdepth; |
|---|
| 305 |
|
|---|
| 306 |
/* Function local variables */ |
|---|
| 307 |
|
|---|
| 308 |
const uschar *Xcallpat; |
|---|
| 309 |
const uschar *Xcharptr; |
|---|
| 310 |
const uschar *Xdata; |
|---|
| 311 |
const uschar *Xnext; |
|---|
| 312 |
const uschar *Xpp; |
|---|
| 313 |
const uschar *Xprev; |
|---|
| 314 |
const uschar *Xsaved_eptr; |
|---|
| 315 |
|
|---|
| 316 |
recursion_info Xnew_recursive; |
|---|
| 317 |
|
|---|
| 318 |
BOOL Xcur_is_word; |
|---|
| 319 |
BOOL Xcondition; |
|---|
| 320 |
BOOL Xprev_is_word; |
|---|
| 321 |
|
|---|
| 322 |
unsigned long int Xoriginal_ims; |
|---|
| 323 |
|
|---|
| 324 |
#ifdef SUPPORT_UCP |
|---|
| 325 |
int Xprop_type; |
|---|
| 326 |
int Xprop_value; |
|---|
| 327 |
int Xprop_fail_result; |
|---|
| 328 |
int Xprop_category; |
|---|
| 329 |
int Xprop_chartype; |
|---|
| 330 |
int Xprop_script; |
|---|
| 331 |
int Xoclength; |
|---|
| 332 |
uschar Xocchars[8]; |
|---|
| 333 |
#endif |
|---|
| 334 |
|
|---|
| 335 |
int Xctype; |
|---|
| 336 |
unsigned int Xfc; |
|---|
| 337 |
int Xfi; |
|---|
| 338 |
int Xlength; |
|---|
| 339 |
int Xmax; |
|---|
| 340 |
int Xmin; |
|---|
| 341 |
int Xnumber; |
|---|
| 342 |
int Xoffset; |
|---|
| 343 |
int Xop; |
|---|
| 344 |
int Xsave_capture_last; |
|---|
| 345 |
int Xsave_offset1, Xsave_offset2, Xsave_offset3; |
|---|
| 346 |
int Xstacksave[REC_STACK_SAVE_MAX]; |
|---|
| 347 |
|
|---|
| 348 |
eptrblock Xnewptrb; |
|---|
| 349 |
|
|---|
| 350 |
/* Where to jump back to */ |
|---|
| 351 |
|
|---|
| 352 |
int Xwhere; |
|---|
| 353 |
|
|---|
| 354 |
} heapframe; |
|---|
| 355 |
|
|---|
| 356 |
#endif |
|---|
| 357 |
|
|---|
| 358 |
|
|---|
| 359 |
/*************************************************************************** |
|---|
| 360 |
***************************************************************************/ |
|---|
| 361 |
|
|---|
| 362 |
|
|---|
| 363 |
|
|---|
| 364 |
/************************************************* |
|---|
| 365 |
* Match from current position * |
|---|
| 366 |
*************************************************/ |
|---|
| 367 |
|
|---|
| 368 |
/* This function is called recursively in many circumstances. Whenever it |
|---|
| 369 |
returns a negative (error) response, the outer incarnation must also return the |
|---|
| 370 |
same response. |
|---|
| 371 |
|
|---|
| 372 |
Performance note: It might be tempting to extract commonly used fields from the |
|---|
| 373 |
md structure (e.g. utf8, end_subject) into individual variables to improve |
|---|
| 374 |
performance. Tests using gcc on a SPARC disproved this; in the first case, it |
|---|
| 375 |
made performance worse. |
|---|
| 376 |
|
|---|
| 377 |
Arguments: |
|---|
| 378 |
eptr pointer to current character in subject |
|---|
| 379 |
ecode pointer to current position in compiled code |
|---|
| 380 |
mstart pointer to the current match start position (can be modified |
|---|
| 381 |
by encountering \K) |
|---|
| 382 |
offset_top current top pointer |
|---|
| 383 |
md pointer to "static" info for the match |
|---|
| 384 |
ims current /i, /m, and /s options |
|---|
| 385 |
eptrb pointer to chain of blocks containing eptr at start of |
|---|
| 386 |
brackets - for testing for empty matches |
|---|
| 387 |
flags can contain |
|---|
| 388 |
match_condassert - this is an assertion condition |
|---|
| 389 |
match_cbegroup - this is the start of an unlimited repeat |
|---|
| 390 |
group that can match an empty string |
|---|
| 391 |
rdepth the recursion depth |
|---|
| 392 |
|
|---|
| 393 |
Returns: MATCH_MATCH if matched ) these values are >= 0 |
|---|
| 394 |
MATCH_NOMATCH if failed to match ) |
|---|
| 395 |
a negative PCRE_ERROR_xxx value if aborted by an error condition |
|---|
| 396 |
(e.g. stopped by repeated call or recursion limit) |
|---|
| 397 |
*/ |
|---|
| 398 |
|
|---|
| 399 |
static int |
|---|
| 400 |
match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart, |
|---|
| 401 |
int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb, |
|---|
| 402 |
int flags, unsigned int rdepth) |
|---|
| 403 |
{ |
|---|
| 404 |
/* These variables do not need to be preserved over recursion in this function, |
|---|
| 405 |
so they can be ordinary variables in all cases. Mark some of them with |
|---|
| 406 |
"register" because they are used a lot in loops. */ |
|---|
| 407 |
|
|---|
| 408 |
register int rrc; /* Returns from recursive calls */ |
|---|
| 409 |
register int i; /* Used for loops not involving calls to RMATCH() */ |
|---|
| 410 |
register unsigned int c; /* Character values not kept over RMATCH() calls */ |
|---|
| 411 |
register BOOL utf8; /* Local copy of UTF-8 flag for speed */ |
|---|
| 412 |
|
|---|
| 413 |
BOOL minimize, possessive; /* Quantifier options */ |
|---|
| 414 |
|
|---|
| 415 |
/* When recursion is not being used, all "local" variables that have to be |
|---|
| 416 |
preserved over calls to RMATCH() are part of a "frame" which is obtained from |
|---|
| 417 |
heap storage. Set up the top-level frame here; others are obtained from the |
|---|
| 418 |
heap whenever RMATCH() does a "recursion". See the macro definitions above. */ |
|---|
| 419 |
|
|---|
| 420 |
#ifdef NO_RECURSE |
|---|
| 421 |
heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe)); |
|---|
| 422 |
frame->Xprevframe = NULL; /* Marks the top level */ |
|---|
| 423 |
|
|---|
| 424 |
/* Copy in the original argument variables */ |
|---|
| 425 |
|
|---|
| 426 |
frame->Xeptr = eptr; |
|---|
| 427 |
frame->Xecode = ecode; |
|---|
| 428 |
frame->Xmstart = mstart; |
|---|
| 429 |
frame->Xoffset_top = offset_top; |
|---|
| 430 |
frame->Xims = ims; |
|---|
| 431 |
frame->Xeptrb = eptrb; |
|---|
| 432 |
frame->Xflags = flags; |
|---|
| 433 |
frame->Xrdepth = rdepth; |
|---|
| 434 |
|
|---|
| 435 |
/* This is where control jumps back to to effect "recursion" */ |
|---|
| 436 |
|
|---|
| 437 |
HEAP_RECURSE: |
|---|
| 438 |
|
|---|
| 439 |
/* Macros make the argument variables come from the current frame */ |
|---|
| 440 |
|
|---|
| 441 |
#define eptr frame->Xeptr |
|---|
| 442 |
#define ecode frame->Xecode |
|---|
| 443 |
#define mstart frame->Xmstart |
|---|
| 444 |
#define offset_top frame->Xoffset_top |
|---|
| 445 |
#define ims frame->Xims |
|---|
| 446 |
#define eptrb frame->Xeptrb |
|---|
| 447 |
#define flags frame->Xflags |
|---|
| 448 |
#define rdepth frame->Xrdepth |
|---|
| 449 |
|
|---|
| 450 |
/* Ditto for the local variables */ |
|---|
| 451 |
|
|---|
| 452 |
#ifdef SUPPORT_UTF8 |
|---|
| 453 |
#define charptr frame->Xcharptr |
|---|
| 454 |
#endif |
|---|
| 455 |
#define callpat frame->Xcallpat |
|---|
| 456 |
#define data frame->Xdata |
|---|
| 457 |
#define next frame->Xnext |
|---|
| 458 |
#define pp frame->Xpp |
|---|
| 459 |
#define prev frame->Xprev |
|---|
| 460 |
#define saved_eptr frame->Xsaved_eptr |
|---|
| 461 |
|
|---|
| 462 |
#define new_recursive frame->Xnew_recursive |
|---|
| 463 |
|
|---|
| 464 |
#define cur_is_word frame->Xcur_is_word |
|---|
| 465 |
#define condition frame->Xcondition |
|---|
| 466 |
#define prev_is_word frame->Xprev_is_word |
|---|
| 467 |
|
|---|
| 468 |
#define original_ims frame->Xoriginal_ims |
|---|
| 469 |
|
|---|
| 470 |
#ifdef SUPPORT_UCP |
|---|
| 471 |
#define prop_type frame->Xprop_type |
|---|
| 472 |
#define prop_value frame->Xprop_value |
|---|
| 473 |
#define prop_fail_result frame->Xprop_fail_result |
|---|
| 474 |
#define prop_category frame->Xprop_category |
|---|
| 475 |
#define prop_chartype frame->Xprop_chartype |
|---|
| 476 |
#define prop_script frame->Xprop_script |
|---|
| 477 |
#define oclength frame->Xoclength |
|---|
| 478 |
#define occhars frame->Xocchars |
|---|
| 479 |
#endif |
|---|
| 480 |
|
|---|
| 481 |
#define ctype frame->Xctype |
|---|
| 482 |
#define fc frame->Xfc |
|---|
| 483 |
#define fi frame->Xfi |
|---|
| 484 |
#define length frame->Xlength |
|---|
| 485 |
#define max frame->Xmax |
|---|
| 486 |
#define min frame->Xmin |
|---|
| 487 |
#define number frame->Xnumber |
|---|
| 488 |
#define offset frame->Xoffset |
|---|
| 489 |
#define op frame->Xop |
|---|
| 490 |
#define save_capture_last frame->Xsave_capture_last |
|---|
| 491 |
#define save_offset1 frame->Xsave_offset1 |
|---|
| 492 |
#define save_offset2 frame->Xsave_offset2 |
|---|
| 493 |
#define save_offset3 frame->Xsave_offset3 |
|---|
| 494 |
#define stacksave frame->Xstacksave |
|---|
| 495 |
|
|---|
| 496 |
#define newptrb frame->Xnewptrb |
|---|
| 497 |
|
|---|
| 498 |
/* When recursion is being used, local variables are allocated on the stack and |
|---|
| 499 |
get preserved during recursion in the normal way. In this environment, fi and |
|---|
| 500 |
i, and fc and c, can be the same variables. */ |
|---|
| 501 |
|
|---|
| 502 |
#else /* NO_RECURSE not defined */ |
|---|
| 503 |
#define fi i |
|---|
| 504 |
#define fc c |
|---|
| 505 |
|
|---|
| 506 |
|
|---|
| 507 |
#ifdef SUPPORT_UTF8 /* Many of these variables are used only */ |
|---|
| 508 |
const uschar *charptr; /* in small blocks of the code. My normal */ |
|---|
| 509 |
#endif /* style of coding would have declared */ |
|---|
| 510 |
const uschar *callpat; /* them within each of those blocks. */ |
|---|
| 511 |
const uschar *data; /* However, in order to accommodate the */ |
|---|
| 512 |
const uschar *next; /* version of this code that uses an */ |
|---|
| 513 |
USPTR pp; /* external "stack" implemented on the */ |
|---|
| 514 |
const uschar *prev; /* heap, it is easier to declare them all */ |
|---|
| 515 |
USPTR saved_eptr; /* here, so the declarations can be cut */ |
|---|
| 516 |
/* out in a block. The only declarations */ |
|---|
| 517 |
recursion_info new_recursive; /* within blocks below are for variables */ |
|---|
| 518 |
/* that do not have to be preserved over */ |
|---|
| 519 |
BOOL cur_is_word; /* a recursive call to RMATCH(). */ |
|---|
| 520 |
BOOL condition; |
|---|
| 521 |
BOOL prev_is_word; |
|---|
| 522 |
|
|---|
| 523 |
unsigned long int original_ims; |
|---|
| 524 |
|
|---|
| 525 |
#ifdef SUPPORT_UCP |
|---|
| 526 |
int prop_type; |
|---|
| 527 |
int prop_value; |
|---|
| 528 |
int prop_fail_result; |
|---|
| 529 |
int prop_category; |
|---|
| 530 |
int prop_chartype; |
|---|
| 531 |
int prop_script; |
|---|
| 532 |
int oclength; |
|---|
| 533 |
uschar occhars[8]; |
|---|
| 534 |
#endif |
|---|
| 535 |
|
|---|
| 536 |
int ctype; |
|---|
| 537 |
int length; |
|---|
| 538 |
int max; |
|---|
| 539 |
int min; |
|---|
| 540 |
int number; |
|---|
| 541 |
int offset; |
|---|
| 542 |
int op; |
|---|
| 543 |
int save_capture_last; |
|---|
| 544 |
int save_offset1, save_offset2, save_offset3; |
|---|
| 545 |
int stacksave[REC_STACK_SAVE_MAX]; |
|---|
| 546 |
|
|---|
| 547 |
eptrblock newptrb; |
|---|
| 548 |
#endif /* NO_RECURSE */ |
|---|
| 549 |
|
|---|
| 550 |
/* These statements are here to stop the compiler complaining about unitialized |
|---|
| 551 |
variables. */ |
|---|
| 552 |
|
|---|
| 553 |
#ifdef SUPPORT_UCP |
|---|
| 554 |
prop_value = 0; |
|---|
| 555 |
prop_fail_result = 0; |
|---|
| 556 |
#endif |
|---|
| 557 |
|
|---|
| 558 |
|
|---|
| 559 |
/* This label is used for tail recursion, which is used in a few cases even |
|---|
| 560 |
when NO_RECURSE is not defined, in order to reduce the amount of stack that is |
|---|
| 561 |
used. Thanks to Ian Taylor for noticing this possibility and sending the |
|---|
| 562 |
original patch. */ |
|---|
| 563 |
|
|---|
| 564 |
TAIL_RECURSE: |
|---|
| 565 |
|
|---|
| 566 |
/* OK, now we can get on with the real code of the function. Recursive calls |
|---|
| 567 |
are specified by the macro RMATCH and RRETURN is used to return. When |
|---|
| 568 |
NO_RECURSE is *not* defined, these just turn into a recursive call to match() |
|---|
| 569 |
and a "return", respectively (possibly with some debugging if DEBUG is |
|---|
| 570 |
defined). However, RMATCH isn't like a function call because it's quite a |
|---|
| 571 |
complicated macro. It has to be used in one particular way. This shouldn't, |
|---|
| 572 |
however, impact performance when true recursion is being used. */ |
|---|
| 573 |
|
|---|
| 574 |
#ifdef SUPPORT_UTF8 |
|---|
| 575 |
utf8 = md->utf8; /* Local copy of the flag */ |
|---|
| 576 |
#else |
|---|
| 577 |
utf8 = FALSE; |
|---|
| 578 |
#endif |
|---|
| 579 |
|
|---|
| 580 |
/* First check that we haven't called match() too many times, or that we |
|---|
| 581 |
haven't exceeded the recursive call limit. */ |
|---|
| 582 |
|
|---|
| 583 |
if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT); |
|---|
| 584 |
if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT); |
|---|
| 585 |
|
|---|
| 586 |
original_ims = ims; /* Save for resetting on ')' */ |
|---|
| 587 |
|
|---|
| 588 |
/* At the start of a group with an unlimited repeat that may match an empty |
|---|
| 589 |
string, the match_cbegroup flag is set. When this is the case, add the current |
|---|
| 590 |
subject pointer to the chain of such remembered pointers, to be checked when we |
|---|
| 591 |
hit the closing ket, in order to break infinite loops that match no characters. |
|---|
| 592 |
When match() is called in other circumstances, don't add to the chain. The |
|---|
| 593 |
match_cbegroup flag must NOT be used with tail recursion, because the memory |
|---|
| 594 |
block that is used is on the stack, so a new one may be required for each |
|---|
| 595 |
match(). */ |
|---|
| 596 |
|
|---|
| 597 |
if ((flags & match_cbegroup) != 0) |
|---|
| 598 |
{ |
|---|
| 599 |
newptrb.epb_saved_eptr = eptr; |
|---|
| 600 |
newptrb.epb_prev = eptrb; |
|---|
| 601 |
eptrb = &newptrb; |
|---|
| 602 |
} |
|---|
| 603 |
|
|---|
| 604 |
/* Now start processing the opcodes. */ |
|---|
| 605 |
|
|---|
| 606 |
for (;;) |
|---|
| 607 |
{ |
|---|
| 608 |
minimize = possessive = FALSE; |
|---|
| 609 |
op = *ecode; |
|---|
| 610 |
|
|---|
| 611 |
/* For partial matching, remember if we ever hit the end of the subject after |
|---|
| 612 |
matching at least one subject character. */ |
|---|
| 613 |
|
|---|
| 614 |
if (md->partial && |
|---|
| 615 |
eptr >= md->end_subject && |
|---|
| 616 |
eptr > mstart) |
|---|
| 617 |
md->hitend = TRUE; |
|---|
| 618 |
|
|---|
| 619 |
switch(op) |
|---|
| 620 |
{ |
|---|
| 621 |
case OP_FAIL: |
|---|
| 622 |
RRETURN(MATCH_NOMATCH); |
|---|
| 623 |
|
|---|
| 624 |
case OP_PRUNE: |
|---|
| 625 |
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, |
|---|
| 626 |
ims, eptrb, flags, RM51); |
|---|
| 627 |
if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|---|
| 628 |
RRETURN(MATCH_PRUNE); |
|---|
| 629 |
|
|---|
| 630 |
case OP_COMMIT: |
|---|
| 631 |
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, |
|---|
| 632 |
ims, eptrb, flags, RM52); |
|---|
| 633 |
if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|---|
| 634 |
RRETURN(MATCH_COMMIT); |
|---|
| 635 |
|
|---|
| 636 |
case OP_SKIP: |
|---|
| 637 |
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, |
|---|
| 638 |
ims, eptrb, flags, RM53); |
|---|
| 639 |
if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|---|
| 640 |
md->start_match_ptr = eptr; /* Pass back current position */ |
|---|
| 641 |
RRETURN(MATCH_SKIP); |
|---|
| 642 |
|
|---|
| 643 |
case OP_THEN: |
|---|
| 644 |
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, |
|---|
| 645 |
ims, eptrb, flags, RM54); |
|---|
| 646 |
if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|---|
| 647 |
RRETURN(MATCH_THEN); |
|---|
| 648 |
|
|---|
| 649 |
/* Handle a capturing bracket. If there is space in the offset vector, save |
|---|
| 650 |
the current subject position in the working slot at the top of the vector. |
|---|
| 651 |
We mustn't change the current values of the data slot, because they may be |
|---|
| 652 |
set from a previous iteration of this group, and be referred to by a |
|---|
| 653 |
reference inside the group. |
|---|
| 654 |
|
|---|
| 655 |
If the bracket fails to match, we need to restore this value and also the |
|---|
| 656 |
values of the final offsets, in case they were set by a previous iteration |
|---|
| 657 |
of the same bracket. |
|---|
| 658 |
|
|---|
| 659 |
If there isn't enough space in the offset vector, treat this as if it were |
|---|
| 660 |
a non-capturing bracket. Don't worry about setting the flag for the error |
|---|
| 661 |
case here; that is handled in the code for KET. */ |
|---|
| 662 |
|
|---|
| 663 |
case OP_CBRA: |
|---|
| 664 |
case OP_SCBRA: |
|---|
| 665 |
number = GET2(ecode, 1+LINK_SIZE); |
|---|
| 666 |
offset = number << 1; |
|---|
| 667 |
|
|---|
| 668 |
#ifdef DEBUG |
|---|
| 669 |
printf("start bracket %d\n", number); |
|---|
| 670 |
printf("subject="); |
|---|
| 671 |
pchars(eptr, 16, TRUE, md); |
|---|
| 672 |
printf("\n"); |
|---|
| 673 |
#endif |
|---|
| 674 |
|
|---|
| 675 |
if (offset < md->offset_max) |
|---|
| 676 |
{ |
|---|
| 677 |
save_offset1 = md->offset_vector[offset]; |
|---|
| 678 |
save_offset2 = md->offset_vector[offset+1]; |
|---|
| 679 |
save_offset3 = md->offset_vector[md->offset_end - number]; |
|---|
| 680 |
save_capture_last = md->capture_last; |
|---|
| 681 |
|
|---|
| 682 |
DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3)); |
|---|
| 683 |
md->offset_vector[md->offset_end - number] = eptr - md->start_subject; |
|---|
| 684 |
|
|---|
| 685 |
flags = (op == OP_SCBRA)? match_cbegroup : 0; |
|---|
| 686 |
do |
|---|
| 687 |
{ |
|---|
| 688 |
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, |
|---|
| 689 |
ims, eptrb, flags, RM1); |
|---|
| 690 |
if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); |
|---|
| 691 |
md->capture_last = save_capture_last; |
|---|
| 692 |
ecode += GET(ecode, 1); |
|---|
| 693 |
} |
|---|
| 694 |
while (*ecode == OP_ALT); |
|---|
| 695 |
|
|---|
| 696 |
DPRINTF(("bracket %d failed\n", number)); |
|---|
| 697 |
|
|---|
| 698 |
md->offset_vector[offset] = save_offset1; |
|---|
| 699 |
md->offset_vector[offset+1] = save_offset2; |
|---|
| 700 |
md->offset_vector[md->offset_end - number] = save_offset3; |
|---|
| 701 |
|
|---|
| 702 |
RRETURN(MATCH_NOMATCH); |
|---|
| 703 |
} |
|---|
| 704 |
|
|---|
| 705 |
/* FALL THROUGH ... Insufficient room for saving captured contents. Treat |
|---|
| 706 |
as a non-capturing bracket. */ |
|---|
| 707 |
|
|---|
| 708 |
/* VVVVVVVVVVVVVVVVVVVVVVVVV */ |
|---|
| 709 |
/* VVVVVVVVVVVVVVVVVVVVVVVVV */ |
|---|
| 710 |
|
|---|
| 711 |
DPRINTF(("insufficient capture room: treat as non-capturing\n")); |
|---|
| 712 |
|
|---|
| 713 |
/* VVVVVVVVVVVVVVVVVVVVVVVVV */ |
|---|
| 714 |
/* VVVVVVVVVVVVVVVVVVVVVVVVV */ |
|---|
| 715 |
|
|---|
| 716 |
/* Non-capturing bracket. Loop for all the alternatives. When we get to the |
|---|
| 717 |
final alternative within the brackets, we would return the result of a |
|---|
| 718 |
recursive call to match() whatever happened. We can reduce stack usage by |
|---|
| 719 |
turning this into a tail recursion, except in the case when match_cbegroup |
|---|
| 720 |
is set.*/ |
|---|
| 721 |
|
|---|
| 722 |
case OP_BRA: |
|---|
| 723 |
case OP_SBRA: |
|---|
| 724 |
DPRINTF(("start non-capturing bracket\n")); |
|---|
| 725 |
flags = (op >= OP_SBRA)? match_cbegroup : 0; |
|---|
| 726 |
for (;;) |
|---|
| 727 |
{ |
|---|
| 728 |
if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */ |
|---|
| 729 |
{ |
|---|
| 730 |
if (flags == 0) /* Not a possibly empty group */ |
|---|
| 731 |
{ |
|---|
| 732 |
ecode += _pcre_OP_lengths[*ecode]; |
|---|
| 733 |
DPRINTF(("bracket 0 tail recursion\n")); |
|---|
| 734 |
goto TAIL_RECURSE; |
|---|
| 735 |
} |
|---|
| 736 |
|
|---|
| 737 |
/* Possibly empty group; can't use tail recursion. */ |
|---|
| 738 |
|
|---|
| 739 |
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, |
|---|
| 740 |
eptrb, flags, RM48); |
|---|
| 741 |
RRETURN(rrc); |
|---|
| 742 |
} |
|---|
| 743 |
|
|---|
| 744 |
/* For non-final alternatives, continue the loop for a NOMATCH result; |
|---|
| 745 |
otherwise return. */ |
|---|
| 746 |
|
|---|
| 747 |
RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims, |
|---|
| 748 |
eptrb, flags, RM2); |
|---|
| 749 |
if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); |
|---|
| 750 |
ecode += GET(ecode, 1); |
|---|
| 751 |
} |
|---|
| 752 |
/* Control never reaches here. */ |
|---|
| 753 |
|
|---|
| 754 |
/* Conditional group: compilation checked that there are no more than |
|---|
| 755 |
two branches. If the condition is false, skipping the first branch takes us |
|---|
| 756 |
past the end if there is only one branch, but that's OK because that is |
|---|
| 757 |
exactly what going to the ket would do. As there is only one branch to be |
|---|
| 758 |
obeyed, we can use tail recursion to avoid using another stack frame. */ |
|---|
| 759 |
|
|---|
| 760 |
case OP_COND: |
|---|
| 761 |
case OP_SCOND: |
|---|
| 762 |
if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */ |
|---|
| 763 |
{ |
|---|
| 764 |
offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/ |
|---|
| 765 |
condition = md->recursive != NULL && |
|---|
| 766 |
(offset == RREF_ANY || offset == md->recursive->group_num); |
|---|
| 767 |
ecode += condition? 3 : GET(ecode, 1); |
|---|
| 768 |
} |
|---|
| 769 |
|
|---|
| 770 |
else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */ |
|---|
| 771 |
{ |
|---|
| 772 |
offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */ |
|---|
| 773 |
condition = offset < offset_top && md->offset_vector[offset] >= 0; |
|---|
| 774 |
ecode += condition? 3 : GET(ecode, 1); |
|---|
| 775 |
} |
|---|
| 776 |
|
|---|
| 777 |
else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */ |
|---|
| 778 |
{ |
|---|
| 779 |
condition = FALSE; |
|---|
| 780 |
ecode += GET(ecode, 1); |
|---|
| 781 |
} |
|---|
| 782 |
|
|---|
| 783 |
/* The condition is an assertion. Call match() to evaluate it - setting |
|---|
| 784 |
the final argument match_condassert causes it to stop at the end of an |
|---|
| 785 |
assertion. */ |
|---|
| 786 |
|
|---|
| 787 |
else |
|---|
| 788 |
{ |
|---|
| 789 |
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, |
|---|
| 790 |
match_condassert, RM3); |
|---|
| 791 |
if (rrc == MATCH_MATCH) |
|---|
| 792 |
{ |
|---|
| 793 |
condition = TRUE; |
|---|
| 794 |
ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2); |
|---|
| 795 |
while (*ecode == OP_ALT) ecode += GET(ecode, 1); |
|---|
| 796 |
} |
|---|
| 797 |
else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) |
|---|
| 798 |
{ |
|---|
| 799 |
RRETURN(rrc); /* Need braces because of following else */ |
|---|
| 800 |
} |
|---|
| 801 |
else |
|---|
| 802 |
{ |
|---|
| 803 |
condition = FALSE; |
|---|
| 804 |
ecode += GET(ecode, 1); |
|---|
| 805 |
} |
|---|
| 806 |
} |
|---|
| 807 |
|
|---|
| 808 |
/* We are now at the branch that is to be obeyed. As there is only one, |
|---|
| 809 |
we can use tail recursion to avoid using another stack frame, except when |
|---|
| 810 |
match_cbegroup is required for an unlimited repeat of a possibly empty |
|---|
| 811 |
group. If the second alternative doesn't exist, we can just plough on. */ |
|---|
| 812 |
|
|---|
| 813 |
if (condition || *ecode == OP_ALT) |
|---|
| 814 |
{ |
|---|
| 815 |
ecode += 1 + LINK_SIZE; |
|---|
| 816 |
if (op == OP_SCOND) /* Possibly empty group */ |
|---|
| 817 |
{ |
|---|
| 818 |
RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49); |
|---|
| 819 |
RRETURN(rrc); |
|---|
| 820 |
} |
|---|
| 821 |
else /* Group must match something */ |
|---|
| 822 |
{ |
|---|
| 823 |
flags = 0; |
|---|
| 824 |
goto TAIL_RECURSE; |
|---|
| 825 |
} |
|---|
| 826 |
} |
|---|
| 827 |
else /* Condition false & no 2nd alternative */ |
|---|
| 828 |
{ |
|---|
| 829 |
ecode += 1 + LINK_SIZE; |
|---|
| 830 |
} |
|---|
| 831 |
break; |
|---|
| 832 |
|
|---|
| 833 |
|
|---|
| 834 |
/* End of the pattern, either real or forced. If we are in a top-level |
|---|
| 835 |
recursion, we should restore the offsets appropriately and continue from |
|---|
| 836 |
after the call. */ |
|---|
| 837 |
|
|---|
| 838 |
case OP_ACCEPT: |
|---|
| 839 |
case OP_END: |
|---|
| 840 |
if (md->recursive != NULL && md->recursive->group_num == 0) |
|---|
| 841 |
{ |
|---|
| 842 |
recursion_info *rec = md->recursive; |
|---|
| 843 |
DPRINTF(("End of pattern in a (?0) recursion\n")); |
|---|
| 844 |
md->recursive = rec->prevrec; |
|---|
| 845 |
memmove(md->offset_vector, rec->offset_save, |
|---|
| 846 |
rec->saved_max * sizeof(int)); |
|---|
| 847 |
mstart = rec->save_start; |
|---|
| 848 |
ims = original_ims; |
|---|
| 849 |
ecode = rec->after_call; |
|---|
| 850 |
break; |
|---|
| 851 |
} |
|---|
| 852 |
|
|---|
| 853 |
/* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty |
|---|
| 854 |
string - backtracking will then try other alternatives, if any. */ |
|---|
| 855 |
|
|---|
| 856 |
if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH); |
|---|
| 857 |
md->end_match_ptr = eptr; /* Record where we ended */ |
|---|
| 858 |
md->end_offset_top = offset_top; /* and how many extracts were taken */ |
|---|
| 859 |
md->start_match_ptr = mstart; /* and the start (\K can modify) */ |
|---|
| 860 |
RRETURN(MATCH_MATCH); |
|---|
| 861 |
|
|---|
| 862 |
/* Change option settings */ |
|---|
| 863 |
|
|---|
| 864 |
case OP_OPT: |
|---|
| 865 |
ims = ecode[1]; |
|---|
| 866 |
ecode += 2; |
|---|
| 867 |
DPRINTF(("ims set to %02lx\n", ims)); |
|---|
| 868 |
break; |
|---|
| 869 |
|
|---|
| 870 |
/* Assertion brackets. Check the alternative branches in turn - the |
|---|
| 871 |
matching won't pass the KET for an assertion. If any one branch matches, |
|---|
| 872 |
the assertion is true. Lookbehind assertions have an OP_REVERSE item at the |
|---|
| 873 |
start of each branch to move the current point backwards, so the code at |
|---|
| 874 |
this level is identical to the lookahead case. */ |
|---|
| 875 |
|
|---|
| 876 |
case OP_ASSERT: |
|---|
| 877 |
case OP_ASSERTBACK: |
|---|
| 878 |
do |
|---|
| 879 |
{ |
|---|
| 880 |
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0, |
|---|
| 881 |
RM4); |
|---|
| 882 |
if (rrc == MATCH_MATCH) break; |
|---|
| 883 |
if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); |
|---|
| 884 |
ecode += GET(ecode, 1); |
|---|
| 885 |
} |
|---|
| 886 |
while (*ecode == OP_ALT); |
|---|
| 887 |
if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH); |
|---|
| 888 |
|
|---|
| 889 |
/* If checking an assertion for a condition, return MATCH_MATCH. */ |
|---|
| 890 |
|
|---|
| 891 |
if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH); |
|---|
| 892 |
|
|---|
| 893 |
/* Continue from after the assertion, updating the offsets high water |
|---|
| 894 |
mark, since extracts may have been taken during the assertion. */ |
|---|
| 895 |
|
|---|
| 896 |
do ecode += GET(ecode,1); while (*ecode == OP_ALT); |
|---|
| 897 |
ecode += 1 + LINK_SIZE; |
|---|
| 898 |
offset_top = md->end_offset_top; |
|---|
| 899 |
continue; |
|---|
| 900 |
|
|---|
| 901 |
/* Negative assertion: all branches must fail to match */ |
|---|
| 902 |
|
|---|
| 903 |
case OP_ASSERT_NOT: |
|---|
| 904 |
case OP_ASSERTBACK_NOT: |
|---|
| 905 |
do |
|---|
| 906 |
{ |
|---|
| 907 |
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0, |
|---|
| 908 |
RM5); |
|---|
| 909 |
if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH); |
|---|
| 910 |
if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); |
|---|
| 911 |
ecode += GET(ecode,1); |
|---|
| 912 |
} |
|---|
| 913 |
while (*ecode == OP_ALT); |
|---|
| 914 |
|
|---|
| 915 |
if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH); |
|---|
| 916 |
|
|---|
| 917 |
ecode += 1 + LINK_SIZE; |
|---|
| 918 |
continue; |
|---|
| 919 |
|
|---|
| 920 |
/* Move the subject pointer back. This occurs only at the start of |
|---|
| 921 |
each branch of a lookbehind assertion. If we are too close to the start to |
|---|
| 922 |
move back, this match function fails. When working with UTF-8 we move |
|---|
| 923 |
back a number of characters, not bytes. */ |
|---|
| 924 |
|
|---|
| 925 |
case OP_REVERSE: |
|---|
| 926 |
#ifdef SUPPORT_UTF8 |
|---|
| 927 |
if (utf8) |
|---|
| 928 |
{ |
|---|
| 929 |
i = GET(ecode, 1); |
|---|
| 930 |
while (i-- > 0) |
|---|
| 931 |
{ |
|---|
| 932 |
eptr--; |
|---|
| 933 |
if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); |
|---|
| 934 |
BACKCHAR(eptr); |
|---|
| 935 |
} |
|---|
| 936 |
} |
|---|
| 937 |
else |
|---|
| 938 |
#endif |
|---|
| 939 |
|
|---|
| 940 |
/* No UTF-8 support, or not in UTF-8 mode: count is byte count */ |
|---|
| 941 |
|
|---|
| 942 |
{ |
|---|
| 943 |
eptr -= GET(ecode, 1); |
|---|
| 944 |
if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH); |
|---|
| 945 |
} |
|---|
| 946 |
|
|---|
| 947 |
/* Skip to next op code */ |
|---|
| 948 |
|
|---|
| 949 |
ecode += 1 + LINK_SIZE; |
|---|
| 950 |
break; |
|---|
| 951 |
|
|---|
| 952 |
/* The callout item calls an external function, if one is provided, passing |
|---|
| 953 |
details of the match so far. This is mainly for debugging, though the |
|---|
| 954 |
function is able to force a failure. */ |
|---|
| 955 |
|
|---|
| 956 |
case OP_CALLOUT: |
|---|
| 957 |
if (pcre_callout != NULL) |
|---|
| 958 |
{ |
|---|
| 959 |
pcre_callout_block cb; |
|---|
| 960 |
cb.version = 1; /* Version 1 of the callout block */ |
|---|
| 961 |
cb.callout_number = ecode[1]; |
|---|
| 962 |
cb.offset_vector = md->offset_vector; |
|---|
| 963 |
cb.subject = (PCRE_SPTR)md->start_subject; |
|---|
| 964 |
cb.subject_length = md->end_subject - md->start_subject; |
|---|
| 965 |
cb.start_match = mstart - md->start_subject; |
|---|
| 966 |
cb.current_position = eptr - md->start_subject; |
|---|
| 967 |
cb.pattern_position = GET(ecode, 2); |
|---|
| 968 |
cb.next_item_length = GET(ecode, 2 + LINK_SIZE); |
|---|
| 969 |
cb.capture_top = offset_top/2; |
|---|
| 970 |
cb.capture_last = md->capture_last; |
|---|
| 971 |
cb.callout_data = md->callout_data; |
|---|
| 972 |
if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH); |
|---|
| 973 |
if (rrc < 0) RRETURN(rrc); |
|---|
| 974 |
} |
|---|
| 975 |
ecode += 2 + 2*LINK_SIZE; |
|---|
| 976 |
break; |
|---|
| 977 |
|
|---|
| 978 |
/* Recursion either matches the current regex, or some subexpression. The |
|---|
| 979 |
offset data is the offset to the starting bracket from the start of the |
|---|
| 980 |
whole pattern. (This is so that it works from duplicated subpatterns.) |
|---|
| 981 |
|
|---|
| 982 |
If there are any capturing brackets started but not finished, we have to |
|---|
| 983 |
save their starting points and reinstate them after the recursion. However, |
|---|
| 984 |
we don't know how many such there are (offset_top records the completed |
|---|
| 985 |
total) so we just have to save all the potential data. There may be up to |
|---|
| 986 |
65535 such values, which is too large to put on the stack, but using malloc |
|---|
| 987 |
for small numbers seems expensive. As a compromise, the stack is used when |
|---|
| 988 |
there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc |
|---|
| 989 |
is used. A problem is what to do if the malloc fails ... there is no way of |
|---|
| 990 |
returning to the top level with an error. Save the top REC_STACK_SAVE_MAX |
|---|
| 991 |
values on the stack, and accept that the rest may be wrong. |
|---|
| 992 |
|
|---|
| 993 |
There are also other values that have to be saved. We use a chained |
|---|
| 994 |
sequence of blocks that actually live on the stack. Thanks to Robin Houston |
|---|
| 995 |
for the original version of this logic. */ |
|---|
| 996 |
|
|---|
| 997 |
case OP_RECURSE: |
|---|
| 998 |
{ |
|---|
| 999 |
callpat = md->start_code + GET(ecode, 1); |
|---|
| 1000 |
new_recursive.group_num = (callpat == md->start_code)? 0 : |
|---|
| 1001 |
GET2(callpat, 1 + LINK_SIZE); |
|---|
| 1002 |
|
|---|
| 1003 |
/* Add to "recursing stack" */ |
|---|
| 1004 |
|
|---|
| 1005 |
new_recursive.prevrec = md->recursive; |
|---|
| 1006 |
md->recursive = &new_recursive; |
|---|
| 1007 |
|
|---|
| 1008 |
/* Find where to continue from afterwards */ |
|---|
| 1009 |
|
|---|
| 1010 |
ecode += 1 + LINK_SIZE; |
|---|
| 1011 |
new_recursive.after_call = ecode; |
|---|
| 1012 |
|
|---|
| 1013 |
/* Now save the offset data. */ |
|---|
| 1014 |
|
|---|
| 1015 |
new_recursive.saved_max = md->offset_end; |
|---|
| 1016 |
if (new_recursive.saved_max <= REC_STACK_SAVE_MAX) |
|---|
| 1017 |
new_recursive.offset_save = stacksave; |
|---|
| 1018 |
else |
|---|
| 1019 |
{ |
|---|
| 1020 |
new_recursive.offset_save = |
|---|
| 1021 |
(int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int)); |
|---|
| 1022 |
if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY); |
|---|
| 1023 |
} |
|---|
| 1024 |
|
|---|
| 1025 |
memcpy(new_recursive.offset_save, md->offset_vector, |
|---|
| 1026 |
new_recursive.saved_max * sizeof(int)); |
|---|
| 1027 |
new_recursive.save_start = mstart; |
|---|
| 1028 |
mstart = eptr; |
|---|
| 1029 |
|
|---|
| 1030 |
/* OK, now we can do the recursion. For each top-level alternative we |
|---|
| 1031 |
restore the offset and recursion data. */ |
|---|
| 1032 |
|
|---|
| 1033 |
DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); |
|---|
| 1034 |
flags = (*callpat >= OP_SBRA)? match_cbegroup : 0; |
|---|
| 1035 |
do |
|---|
| 1036 |
{ |
|---|
| 1037 |
RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top, |
|---|
| 1038 |
md, ims, eptrb, flags, RM6); |
|---|
| 1039 |
if (rrc == MATCH_MATCH) |
|---|
| 1040 |
{ |
|---|
| 1041 |
DPRINTF(("Recursion matched\n")); |
|---|
| 1042 |
md->recursive = new_recursive.prevrec; |
|---|
| 1043 |
if (new_recursive.offset_save != stacksave) |
|---|
| 1044 |
(pcre_free)(new_recursive.offset_save); |
|---|
| 1045 |
RRETURN(MATCH_MATCH); |
|---|
| 1046 |
} |
|---|
| 1047 |
else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) |
|---|
| 1048 |
{ |
|---|
| 1049 |
DPRINTF(("Recursion gave error %d\n", rrc)); |
|---|
| 1050 |
RRETURN(rrc); |
|---|
| 1051 |
} |
|---|
| 1052 |
|
|---|
| 1053 |
md->recursive = &new_recursive; |
|---|
| 1054 |
memcpy(md->offset_vector, new_recursive.offset_save, |
|---|
| 1055 |
new_recursive.saved_max * sizeof(int)); |
|---|
| 1056 |
callpat += GET(callpat, 1); |
|---|
| 1057 |
} |
|---|
| 1058 |
while (*callpat == OP_ALT); |
|---|
| 1059 |
|
|---|
| 1060 |
DPRINTF(("Recursion didn't match\n")); |
|---|
| 1061 |
md->recursive = new_recursive.prevrec; |
|---|
| 1062 |
if (new_recursive.offset_save != stacksave) |
|---|
| 1063 |
(pcre_free)(new_recursive.offset_save); |
|---|
| 1064 |
RRETURN(MATCH_NOMATCH); |
|---|
| 1065 |
} |
|---|
| 1066 |
/* Control never reaches here */ |
|---|
| 1067 |
|
|---|
| 1068 |
/* "Once" brackets are like assertion brackets except that after a match, |
|---|
| 1069 |
the point in the subject string is not moved back. Thus there can never be |
|---|
| 1070 |
a move back into the brackets. Friedl calls these "atomic" subpatterns. |
|---|
| 1071 |
Check the alternative branches in turn - the matching won't pass the KET |
|---|
| 1072 |
for this kind of subpattern. If any one branch matches, we carry on as at |
|---|
| 1073 |
the end of a normal bracket, leaving the subject pointer. */ |
|---|
| 1074 |
|
|---|
| 1075 |
case OP_ONCE: |
|---|
| 1076 |
prev = ecode; |
|---|
| 1077 |
saved_eptr = eptr; |
|---|
| 1078 |
|
|---|
| 1079 |
do |
|---|
| 1080 |
{ |
|---|
| 1081 |
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7); |
|---|
| 1082 |
if (rrc == MATCH_MATCH) break; |
|---|
| 1083 |
if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); |
|---|
| 1084 |
ecode += GET(ecode,1); |
|---|
| 1085 |
} |
|---|
| 1086 |
while (*ecode == OP_ALT); |
|---|
| 1087 |
|
|---|
| 1088 |
/* If hit the end of the group (which could be repeated), fail */ |
|---|
| 1089 |
|
|---|
| 1090 |
if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH); |
|---|
| 1091 |
|
|---|
| 1092 |
/* Continue as from after the assertion, updating the offsets high water |
|---|
| 1093 |
mark, since extracts may have been taken. */ |
|---|
| 1094 |
|
|---|
| 1095 |
do ecode += GET(ecode, 1); while (*ecode == OP_ALT); |
|---|
| 1096 |
|
|---|
| 1097 |
offset_top = md->end_offset_top; |
|---|
| 1098 |
eptr = md->end_match_ptr; |
|---|
| 1099 |
|
|---|
| 1100 |
/* For a non-repeating ket, just continue at this level. This also |
|---|
| 1101 |
happens for a repeating ket if no characters were matched in the group. |
|---|
| 1102 |
This is the forcible breaking of infinite loops as implemented in Perl |
|---|
| 1103 |
5.005. If there is an options reset, it will get obeyed in the normal |
|---|
| 1104 |
course of events. */ |
|---|
| 1105 |
|
|---|
| 1106 |
if (*ecode == OP_KET || eptr == saved_eptr) |
|---|
| 1107 |
{ |
|---|
| 1108 |
ecode += 1+LINK_SIZE; |
|---|
| 1109 |
break; |
|---|
| 1110 |
} |
|---|
| 1111 |
|
|---|
| 1112 |
/* The repeating kets try the rest of the pattern or restart from the |
|---|
| 1113 |
preceding bracket, in the appropriate order. The second "call" of match() |
|---|
| 1114 |
uses tail recursion, to avoid using another stack frame. We need to reset |
|---|
| 1115 |
any options that changed within the bracket before re-running it, so |
|---|
| 1116 |
check the next opcode. */ |
|---|
| 1117 |
|
|---|
| 1118 |
if (ecode[1+LINK_SIZE] == OP_OPT) |
|---|
| 1119 |
{ |
|---|
| 1120 |
ims = (ims & ~PCRE_IMS) | ecode[4]; |
|---|
| 1121 |
DPRINTF(("ims set to %02lx at group repeat\n", ims)); |
|---|
| 1122 |
} |
|---|
| 1123 |
|
|---|
| 1124 |
if (*ecode == OP_KETRMIN) |
|---|
| 1125 |
{ |
|---|
| 1126 |
RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8); |
|---|
| 1127 |
if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|---|
| 1128 |
ecode = prev; |
|---|
| 1129 |
flags = 0; |
|---|
| 1130 |
goto TAIL_RECURSE; |
|---|
| 1131 |
} |
|---|
| 1132 |
else /* OP_KETRMAX */ |
|---|
| 1133 |
{ |
|---|
| 1134 |
RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9); |
|---|
| 1135 |
if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|---|
| 1136 |
ecode += 1 + LINK_SIZE; |
|---|
| 1137 |
flags = 0; |
|---|
| 1138 |
goto TAIL_RECURSE; |
|---|
| 1139 |
} |
|---|
| 1140 |
/* Control never gets here */ |
|---|
| 1141 |
|
|---|
| 1142 |
/* An alternation is the end of a branch; scan along to find the end of the |
|---|
| 1143 |
bracketed group and go to there. */ |
|---|
| 1144 |
|
|---|
| 1145 |
case OP_ALT: |
|---|
| 1146 |
do ecode += GET(ecode,1); while (*ecode == OP_ALT); |
|---|
| 1147 |
break; |
|---|
| 1148 |
|
|---|
| 1149 |
/* BRAZERO and BRAMINZERO occur just before a bracket group, indicating |
|---|
| 1150 |
that it may occur zero times. It may repeat infinitely, or not at all - |
|---|
| 1151 |
i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper |
|---|
| 1152 |
repeat limits are compiled as a number of copies, with the optional ones |
|---|
| 1153 |
preceded by BRAZERO or BRAMINZERO. */ |
|---|
| 1154 |
|
|---|
| 1155 |
case OP_BRAZERO: |
|---|
| 1156 |
{ |
|---|
| 1157 |
next = ecode+1; |
|---|
| 1158 |
RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10); |
|---|
| 1159 |
if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|---|
| 1160 |
do next += GET(next,1); while (*next == OP_ALT); |
|---|
| 1161 |
ecode = next + 1 + LINK_SIZE; |
|---|
| 1162 |
} |
|---|
| 1163 |
break; |
|---|
| 1164 |
|
|---|
| 1165 |
case OP_BRAMINZERO: |
|---|
| 1166 |
{ |
|---|
| 1167 |
next = ecode+1; |
|---|
| 1168 |
do next += GET(next, 1); while (*next == OP_ALT); |
|---|
| 1169 |
RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11); |
|---|
| 1170 |
if (rrc != MATCH_NOMATCH) RRETURN(rrc); |
|---|
| 1171 |
ecode++; |
|---|
| 1172 |
} |
|---|
| 1173 |
break; |
|---|
| 1174 |
|
|---|
| 1175 |
/* End of a group, repeated or non-repeating. */ |
|---|
| 1176 |
|
|---|
| 1177 |
case OP_KET: |
|---|
| 1178 |
case OP_KETRMIN: |
|---|
| 1179 |
case OP_KETRMAX: |
|---|
| 1180 |
prev |
|---|