root/cherokee/trunk/cherokee/pcre/pcre_compile.c

Revision 908, 186.0 kB (checked in by alo, 1 year ago)

--

<
Line 
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8                        Written by Philip Hazel
9            Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15     * Redistributions of source code must retain the above copyright notice,
16       this list of conditions and the following disclaimer.
17
18     * Redistributions in binary form must reproduce the above copyright
19       notice, this list of conditions and the following disclaimer in the
20       documentation and/or other materials provided with the distribution.
21
22     * Neither the name of the University of Cambridge nor the names of its
23       contributors may be used to endorse or promote products derived from
24       this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43
44
45 #include "local_config.h"
46
47 #define NLBLOCK cd             /* Block containing newline information */
48 #define PSSTART start_pattern  /* Field containing processed string start */
49 #define PSEND   end_pattern    /* Field containing processed string end */
50
51 #include "pcre_internal.h"
52
53
54
55 /* Macro for setting individual bits in class bitmaps. */
56
57 #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
58
59 /* Maximum length value to check against when making sure that the integer that
60 holds the compiled pattern length does not overflow. We make it a bit less than
61 INT_MAX to allow for adding in group terminating bytes, so that we don't have
62 to check them every time. */
63
64 #define OFLOW_MAX (INT_MAX - 20)
65
66
67 /*************************************************
68 *      Code parameters and static tables         *
69 *************************************************/
70
71 /* This value specifies the size of stack workspace that is used during the
72 first pre-compile phase that determines how much memory is required. The regex
73 is partly compiled into this space, but the compiled parts are discarded as
74 soon as they can be, so that hopefully there will never be an overrun. The code
75 does, however, check for an overrun. The largest amount I've seen used is 218,
76 so this number is very generous.
77
78 The same workspace is used during the second, actual compile phase for
79 remembering forward references to groups so that they can be filled in at the
80 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
81 is 4 there is plenty of room. */
82
83 #define COMPILE_WORK_SIZE (4096)
84
85
86 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
87 are simple data values; negative values are for special things like \d and so
88 on. Zero means further processing is needed (for things like \x), or the escape
89 is invalid. */
90
91 #ifndef EBCDIC  /* This is the "normal" table for ASCII systems */
92 static const short int escapes[] = {
93      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
94      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
95    '@', -ESC_A, -ESC_B, -ESC_C, -ESC_D, -ESC_E,      0, -ESC_G,   /* @ - G */
96 -ESC_H,      0,      0, -ESC_K,      0,      0,      0,      0,   /* H - O */
97 -ESC_P, -ESC_Q, -ESC_R, -ESC_S,      0,      0, -ESC_V, -ESC_W,   /* P - W */
98 -ESC_X,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
99    '`',      7, -ESC_b,      0, -ESC_d,  ESC_e,  ESC_f,      0,   /* ` - g */
100 -ESC_h,      0,      0, -ESC_k,      0,      0,  ESC_n,      0,   /* h - o */
101 -ESC_p,      0,  ESC_r, -ESC_s,  ESC_tee,    0, -ESC_v, -ESC_w,   /* p - w */
102      0,      0, -ESC_z                                            /* x - z */
103 };
104
105 #else           /* This is the "abnormal" table for EBCDIC systems */
106 static const short int escapes[] = {
107 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
108 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
109 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
110 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
111 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
112 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
113 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
114 /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
115 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
116 /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
117 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
118 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
119 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
120 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
121 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
122 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
123 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
124 /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
125 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
126 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
127 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
128 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
129 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
130 };
131 #endif
132
133
134 /* Table of special "verbs" like (*PRUNE) */
135
136 typedef struct verbitem {
137   const char *name;
138   int   len;
139   int   op;
140 } verbitem;
141
142 static verbitem verbs[] = {
143   { "ACCEPT", 6, OP_ACCEPT },
144   { "COMMIT", 6, OP_COMMIT },
145   { "F",      1, OP_FAIL },
146   { "FAIL",   4, OP_FAIL },
147   { "PRUNE",  5, OP_PRUNE },
148   { "SKIP",   4, OP_SKIP  },
149   { "THEN",   4, OP_THEN  }
150 };
151
152 static int verbcount = sizeof(verbs)/sizeof(verbitem);
153
154
155 /* Tables of names of POSIX character classes and their lengths. The list is
156 terminated by a zero length entry. The first three must be alpha, lower, upper,
157 as this is assumed for handling case independence. */
158
159 static const char *const posix_names[] = {
160   "alpha", "lower", "upper",
161   "alnum", "ascii", "blank", "cntrl", "digit", "graph",
162   "print", "punct", "space", "word",  "xdigit" };
163
164 static const uschar posix_name_lengths[] = {
165   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
166
167 /* Table of class bit maps for each POSIX class. Each class is formed from a
168 base map, with an optional addition or removal of another map. Then, for some
169 classes, there is some additional tweaking: for [:blank:] the vertical space
170 characters are removed, and for [:alpha:] and [:alnum:] the underscore
171 character is removed. The triples in the table consist of the base map offset,
172 second map offset or -1 if no second map, and a non-negative value for map
173 addition or a negative value for map subtraction (if there are two maps). The
174 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
175 remove vertical space characters, 2 => remove underscore. */
176
177 static const int posix_class_maps[] = {
178   cbit_word,  cbit_digit, -2,             /* alpha */
179   cbit_lower, -1,          0,             /* lower */
180   cbit_upper, -1,          0,             /* upper */
181   cbit_word,  -1,          2,             /* alnum - word without underscore */
182   cbit_print, cbit_cntrl,  0,             /* ascii */
183   cbit_space, -1,          1,             /* blank - a GNU extension */
184   cbit_cntrl, -1,          0,             /* cntrl */
185   cbit_digit, -1,          0,             /* digit */
186   cbit_graph, -1,          0,             /* graph */
187   cbit_print, -1,          0,             /* print */
188   cbit_punct, -1,          0,             /* punct */
189   cbit_space, -1,          0,             /* space */
190   cbit_word,  -1,          0,             /* word - a Perl extension */
191   cbit_xdigit,-1,          0              /* xdigit */
192 };
193
194
195 #define STRING(a)  # a
196 #define XSTRING(s) STRING(s)
197
198 /* The texts of compile-time error messages. These are "char *" because they
199 are passed to the outside world. Do not ever re-use any error number, because
200 they are documented. Always add a new error instead. Messages marked DEAD below
201 are no longer used. */
202
203 static const char *error_texts[] = {
204   "no error",
205   "\\ at end of pattern",
206   "\\c at end of pattern",
207   "unrecognized character follows \\",
208   "numbers out of order in {} quantifier",
209   /* 5 */
210   "number too big in {} quantifier",
211   "missing terminating ] for character class",
212   "invalid escape sequence in character class",
213   "range out of order in character class",
214   "nothing to repeat",
215   /* 10 */
216   "operand of unlimited repeat could match the empty string",  /** DEAD **/
217   "internal error: unexpected repeat",
218   "unrecognized character after (?",
219   "POSIX named classes are supported only within a class",
220   "missing )",
221   /* 15 */
222   "reference to non-existent subpattern",
223   "erroffset passed as NULL",
224   "unknown option bit(s) set",
225   "missing ) after comment",
226   "parentheses nested too deeply",  /** DEAD **/
227   /* 20 */
228   "regular expression is too large",
229   "failed to get memory",
230   "unmatched parentheses",
231   "internal error: code overflow",
232   "unrecognized character after (?<",
233   /* 25 */
234   "lookbehind assertion is not fixed length",
235   "malformed number or name after (?(",
236   "conditional group contains more than two branches",
237   "assertion expected after (?(",
238   "(?R or (?[+-]digits must be followed by )",
239   /* 30 */
240   "unknown POSIX class name",
241   "POSIX collating elements are not supported",
242   "this version of PCRE is not compiled with PCRE_UTF8 support",
243   "spare error",  /** DEAD **/
244   "character value in \\x{...} sequence is too large",
245   /* 35 */
246   "invalid condition (?(0)",
247   "\\C not allowed in lookbehind assertion",
248   "PCRE does not support \\L, \\l, \\N, \\U, or \\u",
249   "number after (?C is > 255",
250   "closing ) for (?C expected",
251   /* 40 */
252   "recursive call could loop indefinitely",
253   "unrecognized character after (?P",
254   "syntax error in subpattern name (missing terminator)",
255   "two named subpatterns have the same name",
256   "invalid UTF-8 string",
257   /* 45 */
258   "support for \\P, \\p, and \\X has not been compiled",
259   "malformed \\P or \\p sequence",
260   "unknown property name after \\P or \\p",
261   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)",
262   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")",
263   /* 50 */
264   "repeated subpattern is too long",    /** DEAD **/
265   "octal value is greater than \\377 (not in UTF-8 mode)",
266   "internal error: overran compiling workspace",
267   "internal error: previously-checked referenced subpattern not found",
268   "DEFINE group contains more than one branch",
269   /* 55 */
270   "repeating a DEFINE group is not allowed",
271   "inconsistent NEWLINE options",
272   "\\g is not followed by a braced name or an optionally braced non-zero number",
273   "(?+ or (?- or (?(+ or (?(- must be followed by a non-zero number",
274   "(*VERB) with an argument is not supported",
275   /* 60 */
276   "(*VERB) not recognized",
277   "number is too big"
278 };
279
280
281 /* Table to identify digits and hex digits. This is used when compiling
282 patterns. Note that the tables in chartables are dependent on the locale, and
283 may mark arbitrary characters as digits - but the PCRE compiling code expects
284 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
285 a private table here. It costs 256 bytes, but it is a lot faster than doing
286 character value tests (at least in some simple cases I timed), and in some
287 applications one wants PCRE to compile efficiently as well as match
288 efficiently.
289
290 For convenience, we use the same bit definitions as in chartables:
291
292   0x04   decimal digit
293   0x08   hexadecimal digit
294
295 Then we can use ctype_digit and ctype_xdigit in the code. */
296
297 #ifndef EBCDIC  /* This is the "normal" case, for ASCII systems */
298 static const unsigned char digitab[] =
299   {
300   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
301   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
302   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
303   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
304   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
305   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
306   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
307   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
308   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
309   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
310   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
311   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
312   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
313   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
314   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
315   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
316   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
317   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
318   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
319   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
320   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
321   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
322   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
323   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
324   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
325   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
326   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
327   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
328   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
329   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
330   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
331   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
332
333 #else           /* This is the "abnormal" case, for EBCDIC systems */
334 static const unsigned char digitab[] =
335   {
336   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
337   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
338   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
339   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
340   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
341   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
342   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
343   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
344   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
345   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
346   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
347   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
348   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
349   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
350   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
351   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
352   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
353   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
354   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
355   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
356   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
357   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
358   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
359   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
360   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
361   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
362   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
363   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
364   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
365   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
366   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
367   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
368
369 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
370   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
371   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
372   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
373   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
374   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
375   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
376   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
377   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
378   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
379   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
380   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
381   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
382   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
383   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
384   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
385   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
386   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
387   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
388   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
389   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
390   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
391   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
392   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
393   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
394   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
395   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
396   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
397   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
398   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
399   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
400   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
401   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
402 #endif
403
404
405 /* Definition to allow mutual recursion */
406
407 static BOOL
408   compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
409     int *, int *, branch_chain *, compile_data *, int *);
410
411
412
413 /*************************************************
414 *            Handle escapes                      *
415 *************************************************/
416
417 /* This function is called when a \ has been encountered. It either returns a
418 positive value for a simple escape such as \n, or a negative value which
419 encodes one of the more complicated things such as \d. A backreference to group
420 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
421 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
422 ptr is pointing at the \. On exit, it is on the final character of the escape
423 sequence.
424
425 Arguments:
426   ptrptr         points to the pattern position pointer
427   errorcodeptr   points to the errorcode variable
428   bracount       number of previous extracting brackets
429   options        the options bits
430   isclass        TRUE if inside a character class
431
432 Returns:         zero or positive => a data character
433                  negative => a special escape sequence
434                  on error, errorcodeptr is set
435 */
436
437 static int
438 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
439   int options, BOOL isclass)
440 {
441 BOOL utf8 = (options & PCRE_UTF8) != 0;
442 const uschar *ptr = *ptrptr + 1;
443 int c, i;
444
445 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
446 ptr--;                            /* Set pointer back to the last byte */
447
448 /* If backslash is at the end of the pattern, it's an error. */
449
450 if (c == 0) *errorcodeptr = ERR1;
451
452 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
453 a table. A non-zero result is something that can be returned immediately.
454 Otherwise further processing may be required. */
455
456 #ifndef EBCDIC  /* ASCII coding */
457 else if (c < '0' || c > 'z') {}                           /* Not alphameric */
458 else if ((i = escapes[c - '0']) != 0) c = i;
459
460 #else           /* EBCDIC coding */
461 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphameric */
462 else if ((i = escapes[c - 0x48]) != 0)  c = i;
463 #endif
464
465 /* Escapes that need further processing, or are illegal. */
466
467 else
468   {
469   const uschar *oldptr;
470   BOOL braced, negated;
471
472   switch (c)
473     {
474     /* A number of Perl escapes are not handled by PCRE. We give an explicit
475     error. */
476
477     case 'l':
478     case 'L':
479     case 'N':
480     case 'u':
481     case 'U':
482     *errorcodeptr = ERR37;
483     break;
484
485     /* \g must be followed by a number, either plain or braced. If positive, it
486     is an absolute backreference. If negative, it is a relative backreference.
487     This is a Perl 5.10 feature. Perl 5.10 also supports \g{name} as a
488     reference to a named group. This is part of Perl's movement towards a
489     unified syntax for back references. As this is synonymous with \k{name}, we
490     fudge it up by pretending it really was \k. */
491
492     case 'g':
493     if (ptr[1] == '{')
494       {
495       const uschar *p;
496       for (p = ptr+2; *p != 0 && *p != '}'; p++)
497         if (*p != '-' && (digitab[*p] & ctype_digit) == 0) break;
498       if (*p != 0 && *p != '}')
499         {
500         c = -ESC_k;
501         break;
502         }
503       braced = TRUE;
504       ptr++;
505       }
506     else braced = FALSE;
507
508     if (ptr[1] == '-')
509       {
510       negated = TRUE;
511       ptr++;
512       }
513     else negated = FALSE;
514
515     c = 0;
516     while ((digitab[ptr[1]] & ctype_digit) != 0)
517       c = c * 10 + *(++ptr) - '0';
518
519     if (c < 0)
520       {
521       *errorcodeptr = ERR61;
522       break;
523       }
524
525     if (c == 0 || (braced && *(++ptr) != '}'))
526       {
527       *errorcodeptr = ERR57;
528       break;
529       }
530
531     if (negated)
532       {
533       if (c > bracount)
534         {
535         *errorcodeptr = ERR15;
536         break;
537         }
538       c = bracount - (c - 1);
539       }
540
541     c = -(ESC_REF + c);
542     break;
543
544     /* The handling of escape sequences consisting of a string of digits
545     starting with one that is not zero is not straightforward. By experiment,
546     the way Perl works seems to be as follows:
547
548     Outside a character class, the digits are read as a decimal number. If the
549     number is less than 10, or if there are that many previous extracting
550     left brackets, then it is a back reference. Otherwise, up to three octal
551     digits are read to form an escaped byte. Thus \123 is likely to be octal
552     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
553     value is greater than 377, the least significant 8 bits are taken. Inside a
554     character class, \ followed by a digit is always an octal number. */
555
556     case '1': case '2': case '3': case '4': case '5':
557     case '6': case '7': case '8': case '9':
558
559     if (!isclass)
560       {
561       oldptr = ptr;
562       c -= '0';
563       while ((digitab[ptr[1]] & ctype_digit) != 0)
564         c = c * 10 + *(++ptr) - '0';
565       if (c < 0)
566         {
567         *errorcodeptr = ERR61;
568         break;
569         }
570       if (c < 10 || c <= bracount)
571         {
572         c = -(ESC_REF + c);
573         break;
574         }
575       ptr = oldptr;      /* Put the pointer back and fall through */
576       }
577
578     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
579     generates a binary zero byte and treats the digit as a following literal.
580     Thus we have to pull back the pointer by one. */
581
582     if ((c = *ptr) >= '8')
583       {
584       ptr--;
585       c = 0;
586       break;
587       }
588
589     /* \0 always starts an octal number, but we may drop through to here with a
590     larger first octal digit. The original code used just to take the least
591     significant 8 bits of octal numbers (I think this is what early Perls used
592     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
593     than 3 octal digits. */
594
595     case '0':
596     c -= '0';
597     while(i++ < 2 && ptr[1] >= '0' && ptr[1] <= '7')
598         c = c * 8 + *(++ptr) - '0';
599     if (!utf8 && c > 255) *errorcodeptr = ERR51;
600     break;
601
602     /* \x is complicated. \x{ddd} is a character number which can be greater
603     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
604     treated as a data character. */
605
606     case 'x':
607     if (ptr[1] == '{')
608       {
609       const uschar *pt = ptr + 2;
610       int count = 0;
611
612       c = 0;
613       while ((digitab[*pt] & ctype_xdigit) != 0)
614         {
615         register int cc = *pt++;
616         if (c == 0 && cc == '0') continue;     /* Leading zeroes */
617         count++;
618
619 #ifndef EBCDIC  /* ASCII coding */
620         if (cc >= 'a') cc -= 32;               /* Convert to upper case */
621         c = (c << 4) + cc - ((cc < 'A')? '0' : ('A' - 10));
622 #else           /* EBCDIC coding */
623         if (cc >= 'a' && cc <= 'z') cc += 64;  /* Convert to upper case */
624         c = (c << 4) + cc - ((cc >= '0')? '0' : ('A' - 10));
625 #endif
626         }
627
628       if (*pt == '}')
629         {
630         if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
631         ptr = pt;
632         break;
633         }
634
635       /* If the sequence of hex digits does not end with '}', then we don't
636       recognize this construct; fall through to the normal \x handling. */
637       }
638
639     /* Read just a single-byte hex-defined char */
640
641     c = 0;
642     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
643       {
644       int cc;                               /* Some compilers don't like ++ */
645       cc = *(++ptr);                        /* in initializers */
646 #ifndef EBCDIC  /* ASCII coding */
647       if (cc >= 'a') cc -= 32;              /* Convert to upper case */
648       c = c * 16 + cc - ((cc < 'A')? '0' : ('A' - 10));
649 #else           /* EBCDIC coding */
650       if (cc <= 'z') cc += 64;              /* Convert to upper case */
651       c = c * 16 + cc - ((cc >= '0')? '0' : ('A' - 10));
652 #endif
653       }
654     break;
655
656     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
657     This coding is ASCII-specific, but then the whole concept of \cx is
658     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
659
660     case 'c':
661     c = *(++ptr);
662     if (c == 0)
663       {
664       *errorcodeptr = ERR2;
665       break;
666       }
667
668 #ifndef EBCDIC  /* ASCII coding */
669     if (c >= 'a' && c <= 'z') c -= 32;
670     c ^= 0x40;
671 #else           /* EBCDIC coding */
672     if (c >= 'a' && c <= 'z') c += 64;
673     c ^= 0xC0;
674 #endif
675     break;
676
677     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
678     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
679     for Perl compatibility, it is a literal. This code looks a bit odd, but
680     there used to be some cases other than the default, and there may be again
681     in future, so I haven't "optimized" it. */
682
683     default:
684     if ((options & PCRE_EXTRA) != 0) switch(c)
685       {
686       default:
687       *errorcodeptr = ERR3;
688       break;
689       }
690     break;
691     }
692   }
693
694 *ptrptr = ptr;
695 return c;
696 }
697
698
699
700 #ifdef SUPPORT_UCP
701 /*************************************************
702 *               Handle \P and \p                 *
703 *************************************************/
704
705 /* This function is called after \P or \p has been encountered, provided that
706 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
707 pointing at the P or p. On exit, it is pointing at the final character of the
708 escape sequence.
709
710 Argument:
711   ptrptr         points to the pattern position pointer
712   negptr         points to a boolean that is set TRUE for negation else FALSE
713   dptr           points to an int that is set to the detailed property value
714   errorcodeptr   points to the error code variable
715
716 Returns:         type value from ucp_type_table, or -1 for an invalid type
717 */
718
719 static int
720 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
721 {
722 int c, i, bot, top;
723 const uschar *ptr = *ptrptr;
724 char name[32];
725
726 c = *(++ptr);
727 if (c == 0) goto ERROR_RETURN;
728
729 *negptr = FALSE;
730
731 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
732 negation. */
733
734 if (c == '{')
735   {
736   if (ptr[1] == '^')
737     {
738     *negptr = TRUE;
739     ptr++;
740     }
741   for (i = 0; i < (int)sizeof(name) - 1; i++)
742     {
743     c = *(++ptr);
744     if (c == 0) goto ERROR_RETURN;
745     if (c == '}') break;
746     name[i] = c;
747     }
748   if (c !='}') goto ERROR_RETURN;
749   name[i] = 0;
750   }
751
752 /* Otherwise there is just one following character */
753
754 else
755   {
756   name[0] = c;
757   name[1] = 0;
758   }
759
760 *ptrptr = ptr;
761
762 /* Search for a recognized property name using binary chop */
763
764 bot = 0;
765 top = _pcre_utt_size;
766
767 while (bot < top)
768   {
769   i = (bot + top) >> 1;
770   c = strcmp(name, _pcre_utt[i].name);
771   if (c == 0)
772     {
773     *dptr = _pcre_utt[i].value;
774     return _pcre_utt[i].type;
775     }
776   if (c > 0) bot = i + 1; else top = i;
777   }
778
779 *errorcodeptr = ERR47;
780 *ptrptr = ptr;
781 return -1;
782
783 ERROR_RETURN:
784 *errorcodeptr = ERR46;
785 *ptrptr = ptr;
786 return -1;
787 }
788 #endif
789
790
791
792
793 /*************************************************
794 *            Check for counted repeat            *
795 *************************************************/
796
797 /* This function is called when a '{' is encountered in a place where it might
798 start a quantifier. It looks ahead to see if it really is a quantifier or not.
799 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
800 where the ddds are digits.
801
802 Arguments:
803   p         pointer to the first char after '{'
804
805 Returns:    TRUE or FALSE
806 */
807
808 static BOOL
809 is_counted_repeat(const uschar *p)
810 {
811 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
812 while ((digitab[*p] & ctype_digit) != 0) p++;
813 if (*p == '}') return TRUE;
814
815 if (*p++ != ',') return FALSE;
816 if (*p == '}') return TRUE;
817
818 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
819 while ((digitab[*p] & ctype_digit) != 0) p++;
820
821 return (*p == '}');
822 }
823
824
825
826 /*************************************************
827 *         Read repeat counts                     *
828 *************************************************/
829
830 /* Read an item of the form {n,m} and return the values. This is called only
831 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
832 so the syntax is guaranteed to be correct, but we need to check the values.
833
834 Arguments:
835   p              pointer to first char after '{'
836   minp           pointer to int for min
837   maxp           pointer to int for max
838                  returned as -1 if no max
839   errorcodeptr   points to error code variable
840
841 Returns:         pointer to '}' on success;
842                  current ptr on error, with errorcodeptr set non-zero
843 */
844
845 static const uschar *
846 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
847 {
848 int min = 0;
849 int max = -1;
850
851 /* Read the minimum value and do a paranoid check: a negative value indicates
852 an integer overflow. */
853
854 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
855 if (min < 0 || min > 65535)
856   {
857   *errorcodeptr = ERR5;
858   return p;
859   }
860
861 /* Read the maximum value if there is one, and again do a paranoid on its size.
862 Also, max must not be less than min. */
863
864 if (*p == '}') max = min; else
865   {
866   if (*(++p) != '}')
867     {
868     max = 0;
869     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
870     if (max < 0 || max > 65535)
871       {
872       *errorcodeptr = ERR5;
873       return p;
874       }
875     if (max < min)
876       {
877       *errorcodeptr = ERR4;
878       return p;
879       }
880     }
881   }
882
883 /* Fill in the required variables, and pass back the pointer to the terminating
884 '}'. */
885
886 *minp = min;
887 *maxp = max;
888 return p;
889 }
890
891
892
893 /*************************************************
894 *       Find forward referenced subpattern       *
895 *************************************************/
896
897 /* This function scans along a pattern's text looking for capturing
898 subpatterns, and counting them. If it finds a named pattern that matches the
899 name it is given, it returns its number. Alternatively, if the name is NULL, it
900 returns when it reaches a given numbered subpattern. This is used for forward
901 references to subpatterns. We know that if (?P< is encountered, the name will
902 be terminated by '>' because that is checked in the first pass.
903
904 Arguments:
905   ptr          current position in the pattern
906   count        current count of capturing parens so far encountered
907   name         name to seek, or NULL if seeking a numbered subpattern
908   lorn         name length, or subpattern number if name is NULL
909   xmode        TRUE if we are in /x mode
910
911 Returns:       the number of the named subpattern, or -1 if not found
912 */
913
914 static int
915 find_parens(const uschar *ptr, int count, const uschar *name, int lorn,
916   BOOL xmode)
917 {
918 const uschar *thisname;
919
920 for (; *ptr != 0; ptr++)
921   {
922   int term;
923
924   /* Skip over backslashed characters and also entire \Q...\E */
925
926   if (*ptr == '\\')
927     {
928     if (*(++ptr) == 0) return -1;
929     if (*ptr == 'Q') for (;;)
930       {
931       while (*(++ptr) != 0 && *ptr != '\\');
932       if (*ptr == 0) return -1;
933       if (*(++ptr) == 'E') break;
934       }
935     continue;
936     }
937
938   /* Skip over character classes */
939
940   if (*ptr == '[')
941     {
942     while (*(++ptr) != ']')
943       {
944       if (*ptr == 0) return -1;
945       if (*ptr == '\\')
946         {
947         if (*(++ptr) == 0) return -1;
948         if (*ptr == 'Q') for (;;)
949           {
950           while (*(++ptr) != 0 && *ptr != '\\');
951           if (*ptr == 0) return -1;
952           if (*(++ptr) == 'E') break;
953           }
954         continue;
955         }
956       }
957     continue;
958     }
959
960   /* Skip comments in /x mode */
961
962   if (xmode && *ptr == '#')
963     {
964     while (*(++ptr) != 0 && *ptr != '\n');
965     if (*ptr == 0) return -1;
966     continue;
967     }
968
969   /* An opening parens must now be a real metacharacter */
970
971   if (*ptr != '(') continue;
972   if (ptr[1] != '?' && ptr[1] != '*')
973     {
974     count++;
975     if (name == NULL && count == lorn) return count;
976     continue;
977     }
978
979   ptr += 2;
980   if (*ptr == 'P') ptr++;                      /* Allow optional P */
981
982   /* We have to disambiguate (?<! and (?<= from (?<name> */
983
984   if ((*ptr != '<' || ptr[1] == '!' || ptr[1] == '=') &&
985        *ptr != '\'')
986     continue;
987
988   count++;
989
990   if (name == NULL && count == lorn) return count;
991   term = *ptr++;
992   if (term == '<') term = '>';
993   thisname = ptr;
994   while (*ptr != term) ptr++;
995   if (name != NULL && lorn == ptr - thisname &&
996       strncmp((const char *)name, (const char *)thisname, lorn) == 0)
997     return count;
998   }
999
1000 return -1;
1001 }
1002
1003
1004
1005 /*************************************************
1006 *      Find first significant op code            *
1007 *************************************************/
1008
1009 /* This is called by several functions that scan a compiled expression looking
1010 for a fixed first character, or an anchoring op code etc. It skips over things
1011 that do not influence this. For some calls, a change of option is important.
1012 For some calls, it makes sense to skip negative forward and all backward
1013 assertions, and also the \b assertion; for others it does not.
1014
1015 Arguments:
1016   code         pointer to the start of the group
1017   options      pointer to external options
1018   optbit       the option bit whose changing is significant, or
1019                  zero if none are
1020   skipassert   TRUE if certain assertions are to be skipped
1021
1022 Returns:       pointer to the first significant opcode
1023 */
1024
1025 static const uschar*
1026 first_significant_code(const uschar *code, int *options, int optbit,
1027   BOOL skipassert)
1028 {
1029 for (;;)
1030   {
1031   switch ((int)*code)
1032     {
1033     case OP_OPT:
1034     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1035       *options = (int)code[1];
1036     code += 2;
1037     break;
1038
1039     case OP_ASSERT_NOT:
1040     case OP_ASSERTBACK:
1041     case OP_ASSERTBACK_NOT:
1042     if (!skipassert) return code;
1043     do code += GET(code, 1); while (*code == OP_ALT);
1044     code += _pcre_OP_lengths[*code];
1045     break;
1046
1047     case OP_WORD_BOUNDARY:
1048     case OP_NOT_WORD_BOUNDARY:
1049     if (!skipassert) return code;
1050     /* Fall through */
1051
1052     case OP_CALLOUT:
1053     case OP_CREF:
1054     case OP_RREF:
1055     case OP_DEF:
1056     code += _pcre_OP_lengths[*code];
1057     break;
1058
1059     default:
1060     return code;
1061     }
1062   }
1063 /* Control never reaches here */
1064 }
1065
1066
1067
1068
1069 /*************************************************
1070 *        Find the fixed length of a pattern      *
1071 *************************************************/
1072
1073 /* Scan a pattern and compute the fixed length of subject that will match it,
1074 if the length is fixed. This is needed for dealing with backward assertions.
1075 In UTF8 mode, the result is in characters rather than bytes.
1076
1077 Arguments:
1078   code     points to the start of the pattern (the bracket)
1079   options  the compiling options
1080
1081 Returns:   the fixed length, or -1 if there is no fixed length,
1082              or -2 if \C was encountered
1083 */
1084
1085 static int
1086 find_fixedlength(uschar *code, int options)
1087 {
1088 int length = -1;
1089
1090 register int branchlength = 0;
1091 register uschar *cc = code + 1 + LINK_SIZE;
1092
1093 /* Scan along the opcodes for this branch. If we get to the end of the
1094 branch, check the length against that of the other branches. */
1095
1096 for (;;)
1097   {
1098   int d;
1099   register int op = *cc;
1100   switch (op)
1101     {
1102     case OP_CBRA:
1103     case OP_BRA:
1104     case OP_ONCE:
1105     case OP_COND:
1106     d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1107     if (d < 0) return d;
1108     branchlength += d;
1109     do cc += GET(cc, 1); while (*cc == OP_ALT);
1110     cc += 1 + LINK_SIZE;
1111     break;
1112
1113     /* Reached end of a branch; if it's a ket it is the end of a nested
1114     call. If it's ALT it is an alternation in a nested call. If it is
1115     END it's the end of the outer call. All can be handled by the same code. */
1116
1117     case OP_ALT:
1118     case OP_KET:
1119     case OP_KETRMAX:
1120     case OP_KETRMIN:
1121     case OP_END:
1122     if (length < 0) length = branchlength;
1123       else if (length != branchlength) return -1;
1124     if (*cc != OP_ALT) return length;
1125     cc += 1 + LINK_SIZE;
1126     branchlength = 0;
1127     break;
1128
1129     /* Skip over assertive subpatterns */
1130
1131     case OP_ASSERT:
1132     case OP_ASSERT_NOT:
1133     case OP_ASSERTBACK:
1134     case OP_ASSERTBACK_NOT:
1135     do cc += GET(cc, 1); while (*cc == OP_ALT);
1136     /* Fall through */
1137
1138     /* Skip over things that don't match chars */
1139
1140     case OP_REVERSE:
1141     case OP_CREF:
1142     case OP_RREF:
1143     case OP_DEF:
1144     case OP_OPT:
1145     case OP_CALLOUT:
1146     case OP_SOD:
1147     case OP_SOM:
1148     case OP_EOD:
1149     case OP_EODN:
1150     case OP_CIRC:
1151     case OP_DOLL:
1152     case OP_NOT_WORD_BOUNDARY:
1153     case OP_WORD_BOUNDARY:
1154     cc += _pcre_OP_lengths[*cc];
1155     break;
1156
1157     /* Handle literal characters */
1158
1159     case OP_CHAR:
1160     case OP_CHARNC:
1161     case OP_NOT:
1162     branchlength++;
1163     cc += 2;
1164 #ifdef SUPPORT_UTF8
1165     if ((options & PCRE_UTF8) != 0)
1166       {
1167       while ((*cc & 0xc0) == 0x80) cc++;
1168       }
1169 #endif
1170     break;