--- internal.h.orig Sat Mar 30 15:18:04 2002 +++ internal.h Fri Apr 19 16:28:26 2002 @@ -229,6 +229,8 @@ OP_CLASS, /* Match a character class */ OP_REF, /* Match a back reference */ OP_RECURSE, /* Match this pattern recursively */ + OP_NRECURSE, /* "Call" the numbered subpattern at this point. If used + inside that subpattern, can induce recursion. */ OP_ALT, /* Start of alternation */ OP_KET, /* End of group that doesn't have an unbounded repeat */ @@ -283,7 +285,7 @@ #define ERR7 "invalid escape sequence in character class" #define ERR8 "range out of order in character class" #define ERR9 "nothing to repeat" -#define ERR10 "operand of unlimited repeat could match the empty string" +#define ERR10 "Recursive call could loop indefinitely" #define ERR11 "internal error: unexpected repeat" #define ERR12 "unrecognized character after (?" #define ERR13 "unused error" @@ -302,13 +304,14 @@ #define ERR26 "malformed number after (?(" #define ERR27 "conditional group contains more than two branches" #define ERR28 "assertion expected after (?(" -#define ERR29 "(?p must be followed by )" +#define ERR29 "(?R or (?nnn must be followed by )" #define ERR30 "unknown POSIX class name" #define ERR31 "POSIX collating elements are not supported" #define ERR32 "this version of PCRE is not compiled with PCRE_UTF8 support" #define ERR33 "characters with values > 255 are not yet supported in classes" #define ERR34 "character value in \\x{...} sequence is too large" #define ERR35 "invalid condition (?(0)" +#define ERR36 "subexpression call (?nnn) of a non-open subexpression" /* All character handling must be done as unsigned characters. Otherwise there are problems with top-bit-set characters and functions such as isspace(). @@ -341,6 +344,27 @@ uschar start_bits[32]; } real_pcre_extra; +/* Represents a not yet closed bracketed group at compile time. */ + +typedef struct bragroup { + int branum; /* Number of the group (or 0 if non-capturing) */ + const uschar *start; /* Position in compiled pattern where group starts */ + BOOL maybe_empty; /* Could the segment of regex between the start of the + group and the start of the next - or the current pos - + match the empty string? */ + struct bragroup *outer; /* Pointer to immediately containing group */ +} bragroup; + +/* Linked list (used FIFO style) representing recursion in the regex. */ + +typedef struct recursion_info { + struct recursion_info *prev; /* Points to the previous recursion record (or NULL) */ + int group_num; /* Number of group that was called */ + const uschar *after_call; /* "Return value": points after the call in the expr */ + const uschar *save_start; /* Old value of md->start_match */ + int *offset_save; /* Pointer to start of saved offsets */ + int saved_max; /* Number of saved offsets */ +} recursion_info; /* Structure for passing "static" information around between the functions doing the compiling, so that they are thread-safe. */ @@ -374,6 +398,7 @@ const uschar *start_match; /* Start of this match attempt */ const uschar *end_match_ptr; /* Subject position at end match */ int end_offset_top; /* Highwater mark at end of match */ + recursion_info *recursive; /* Stacklike linked list of recursion return addresses */ } match_data; /* Bit definitions for entries in the pcre_ctypes table. */ --- pcre.c.orig Sat Mar 30 15:02:29 2002 +++ pcre.c Fri Apr 19 15:28:51 2002 @@ -95,7 +95,7 @@ "*", "*?", "+", "+?", "?", "??", "{", "{", "{", "*", "*?", "+", "+?", "?", "??", "{", "{", "{", "*", "*?", "+", "+?", "?", "??", "{", "{", - "class", "Ref", "Recurse", + "class", "Ref", "Recurse", "(?nnn)", "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref", "Brazero", "Braminzero", "Branumber", "Bra" @@ -156,7 +156,7 @@ static BOOL compile_regex(int, int, int *, uschar **, const uschar **, const char **, - BOOL, int, int *, int *, compile_data *); + BOOL, int, int *, int *, bragroup *, compile_data *); /* Structure for building a chain of data that actually lives on the stack, for holding the values of the subject pointer at the start of each @@ -998,7 +998,7 @@ static BOOL compile_branch(int options, int *brackets, uschar **codeptr, const uschar **ptrptr, const char **errorptr, int *optchanged, - int *reqchar, int *countlits, compile_data *cd) + int *reqchar, int *countlits, bragroup *bragr, compile_data *cd) { int repeat_type, op_type; int repeat_min, repeat_max; @@ -1014,6 +1014,7 @@ const uschar *tempptr; uschar *previous = NULL; uschar class[32]; +BOOL prev_maybe_empty = FALSE; /* Set up the default and non-default settings for greediness */ @@ -1025,6 +1026,10 @@ *reqchar = prevreqchar = -1; *countlits = 0; +/* The branch might be empty so far */ + +bragr->maybe_empty = TRUE; + /* Switch on next character until the end of the branch */ for (;; ptr++) @@ -1075,6 +1080,8 @@ case '.': previous = code; *code++ = OP_ANY; + prev_maybe_empty = bragr->maybe_empty; + bragr->maybe_empty = FALSE; break; /* Character classes. These always build a 32-byte bitmap of the permitted @@ -1085,6 +1092,8 @@ case '[': previous = code; *code++ = OP_CLASS; + prev_maybe_empty = bragr->maybe_empty; + bragr->maybe_empty = FALSE; /* If the first character is '^', set the negation flag and skip it. */ @@ -1391,6 +1400,15 @@ if (ptr[1] == '?') { repeat_type = greedy_non_default; ptr++; } else repeat_type = greedy_default; + + /* If the last item determined that this subpattern couldn't match + the empty string so far; well, maybe it really still could! */ + + if (repeat_min == 0) + { + DPRINTF(("Undoing last maybe_empty change...\n")); + bragr->maybe_empty = prev_maybe_empty; + } /* If previous was a string of characters, chop off the last one and use it as the subject of the repeat. If there was only one character, we can @@ -1485,7 +1503,7 @@ *code++ = (repeat_min & 255); } - /* If the mininum is 1 and the previous item was a character string, + /* If the minimum is 1 and the previous item was a character string, we either have to put back the item that got cancelled if the string length was 1, or add the character back onto the end of a longer string. For a character type nothing need be done; it will just get @@ -1537,6 +1555,13 @@ code = previous; goto END_REPEAT; } + + if (repeat_min == 0) + { + DPRINTF(("Undoing last maybe_empty change...\n")); + bragr->maybe_empty = prev_maybe_empty; + } + if (repeat_min == 0 && repeat_max == -1) *code++ = OP_CRSTAR + repeat_type; else if (repeat_min == 1 && repeat_max == -1) @@ -1739,6 +1764,8 @@ { int set, unset; int *optset; + bragroup *bg; + BOOL left_recursion; /* scratch variable used in (?R) and (?nn) */ switch (*(++ptr)) { @@ -1807,10 +1834,90 @@ break; case 'R': /* Pattern recursion */ + previous = code; /* Can be repeated */ + + left_recursion = TRUE; + for(bg = bragr; bg; bg = bg->outer) + if (!bg->maybe_empty) left_recursion = FALSE; + if (left_recursion) + { + *errorptr = ERR10; + goto FAILED; + } + + prev_maybe_empty = bragr->maybe_empty; /* No change */ + + /* The BRA and KET are necessary so that we don't have to do + special handling for (?R)* and the like. */ + + *code++ = OP_BRA; + *code++ = '\0'; + *code++ = '\004'; + *code++ = OP_RECURSE; + + *code++ = OP_KET; + *code++ = '\0'; + *code++ = '\004'; + ptr++; continue; + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + { + int num = 0; + previous = code; /* Can be repeated */ + while (*ptr >= '0' && *ptr <= '9') + { + num = 10*num + (*ptr - '0'); + ptr++; + } + if (*ptr != ')') + { + *errorptr = ERR12; + goto FAILED; + } + + left_recursion = TRUE; + for(bg = bragr; bg; bg = bg->outer) + { + if (!bg->maybe_empty) left_recursion = FALSE; + if (bg->branum == num) break; + } + if (!bg) + { + *errorptr = ERR36; + goto FAILED; + } + if (left_recursion) + { + *errorptr = ERR10; + goto FAILED; + } + + prev_maybe_empty = bragr->maybe_empty; + + /* The BRA and KET are necessary so that we don't have to do + special handling for (?1)* and the like. */ + + *code++ = OP_BRA; + *code++ = '\0'; + *code++ = (uschar) (6 + sizeof(uschar *)); + + *code++ = OP_NRECURSE; + *code++ = num << 8; + *code++ = num & 255; + + *((const uschar **)code) = bg->start; + code += sizeof(uschar *); + + *code++ = OP_KET; + *code++ = '\0'; + *code++ = (uschar) (6 + sizeof(uschar *)); + } + continue; + default: /* Option setting */ set = unset = 0; optset = &set; @@ -1892,6 +1999,7 @@ to pass its address because some compilers complain otherwise. Pass in a new setting for the ims options if they have changed. */ + prev_maybe_empty = bragr->maybe_empty; previous = (bravalue >= OP_ONCE)? code : NULL; *code = bravalue; tempcode = code; @@ -1909,6 +2017,7 @@ skipbytes, /* Skip over OP_COND/OP_BRANUMBER */ &subreqchar, /* For possible last char */ &subcountlits, /* For literal count */ + bragr, /* Bracketed groups */ cd)) /* Tables block */ goto FAILED; @@ -1989,6 +2098,7 @@ { int number = -c - ESC_REF; previous = code; + prev_maybe_empty = bragr->maybe_empty; *code++ = OP_REF; *code++ = number >> 8; *code++ = number & 255; @@ -1996,6 +2106,8 @@ else { previous = (-c > ESC_b && -c < ESC_Z)? code : NULL; + prev_maybe_empty = bragr->maybe_empty; + bragr->maybe_empty = FALSE; *code++ = -c; } continue; @@ -2016,6 +2128,9 @@ *code = OP_CHARS; code += 2; length = 0; + prev_maybe_empty = bragr->maybe_empty; + bragr->maybe_empty = FALSE; + DPRINTF(("bragr->maybe_empty = FALSE;\n")); do { @@ -2118,6 +2233,7 @@ skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER) reqchar -> place to put the last required character, or a negative number countlits -> place to put the shortest literal count of any branch + outer -> immediately containing bracketed group cd points to the data block with tables pointers Returns: TRUE on success @@ -2126,7 +2242,7 @@ static BOOL compile_regex(int options, int optchanged, int *brackets, uschar **codeptr, const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes, - int *reqchar, int *countlits, compile_data *cd) + int *reqchar, int *countlits, bragroup *outer, compile_data *cd) { const uschar *ptr = *ptrptr; uschar *code = *codeptr; @@ -2135,6 +2251,15 @@ uschar *reverse_count = NULL; int oldoptions = options & PCRE_IMS; int branchreqchar, branchcountlits; +bragroup bragr; +BOOL group_maybe_empty = FALSE; + +if (*start_bracket > OP_BRA) + bragr.branum = *brackets; +else bragr.branum = 0; +bragr.start = code; +/* bragr.maybe_empty = TRUE; -- No need, reset by compile_branch */ +bragr.outer = outer; *reqchar = -1; *countlits = INT_MAX; @@ -2168,7 +2293,7 @@ /* Now compile the branch */ if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged, - &branchreqchar, &branchcountlits, cd)) + &branchreqchar, &branchcountlits, &bragr, cd)) { *ptrptr = ptr; return FALSE; @@ -2217,6 +2342,10 @@ reverse_count[0] = (length >> 8); reverse_count[1] = length & 255; } + + /* If any branch could be empty, so could the group */ + + if (bragr.maybe_empty) group_maybe_empty = TRUE; /* Reached end of expression, either ')' or end of pattern. Insert a terminating ket and the length of the whole bracketed item, and return, @@ -2236,6 +2365,8 @@ } *codeptr = code; *ptrptr = ptr; + if (!group_maybe_empty && *start_bracket >= OP_ONCE && outer) + outer->maybe_empty = FALSE; return TRUE; } @@ -2493,6 +2624,7 @@ compile_data compile_block; int brastack[BRASTACK_SIZE]; uschar bralenstack[BRASTACK_SIZE]; +uschar *subpatternoff_stack[16]; /* Subpattern offsets on stack */ #ifdef DEBUG uschar *code_base, *code_end; @@ -2730,7 +2862,7 @@ break; /* A recursive call to the regex is an extension, to provide the - facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */ + facility which can be obtained by (?{perl-code}) in Perl 5.6. */ case 'R': if (ptr[3] != ')') @@ -2739,8 +2871,30 @@ goto PCRE_ERROR_RETURN; } ptr += 3; - length += 1; + length += 7; /* BRA(3) + RECURSE(1) + KET(3) */ break; + + /* A subexpression invocation. This generalises the (?R) construct + above. */ + + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + { + int refnum = 0; + while (*ptr >= '0' && *ptr <= '9') + { + refnum = 10*refnum + (*ptr - '0'); + ptr++; + } + if (ptr[3] != ')') + { + *errorptr = ERR29; + goto PCRE_ERROR_RETURN; + } + ptr += 3; + length += 3 + sizeof(uschar *) + 6; /* +6 for BRA + KET */ + break; + } /* Lookbehinds are in Perl from version 5.005 */ @@ -3076,7 +3230,7 @@ *code = OP_BRA; bracount = 0; (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, 0, - &reqchar, &countlits, &compile_block); + &reqchar, &countlits, NULL, &compile_block); re->top_bracket = bracount; re->top_backref = top_backref; @@ -3293,6 +3447,16 @@ if (*code == OP_NOTMINUPTO) printf("?"); code += 3; break; + + case OP_RECURSE: + printf(" (?R)"); + break; + + case OP_NRECURSE: + printf(" (?%d) -> %d", (code[1] << 8) + code[2], + *((uschar **)&code[3]) - code_base); + code += (2 + sizeof(uschar *)); + break; case OP_REF: printf(" \\%d", (code[1] << 8) | code[2]); @@ -3424,7 +3588,7 @@ if (length > md->end_subject - eptr) return FALSE; -/* Separate the caselesss case for speed */ +/* Separate the caseless case for speed */ if ((ims & PCRE_CASELESS) != 0) { @@ -3560,7 +3724,7 @@ switch(op) { case OP_BRA: /* Non-capturing bracket: optimized */ - DPRINTF(("start bracket 0\n")); + DPRINTF(("start bracket 0 (%d)\n", ecode - md->start_pattern)); do { if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup)) @@ -3610,10 +3774,29 @@ ecode += 3; break; - /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched - an empty string - recursion will then try other alternatives, if any. */ + /* End of the pattern. */ case OP_END: + /* If we were called recursively, we should restore the offsets + appropriately and continue from after the call. */ + + if (md->recursive && md->recursive->group_num == 0) + { + recursion_info *rec = md->recursive; + + DPRINTF(("Hit the end in a (?R) recursion\n")); + md->recursive = rec->prev; + memmove(md->offset_vector, rec->offset_save, rec->saved_max * sizeof(int)); + md->start_match = rec->save_start; + ims = original_ims; + ecode = rec->after_call; + break; + } + + /* Otherwise we really are at the end. If PCRE_NOTEMPTY is set, fail if we + have matched an empty string - recursion will then try other alternatives, + if any. */ + if (md->notempty && eptr == md->start_match) return FALSE; md->end_match_ptr = eptr; /* Record where we ended */ md->end_offset_top = offset_top; /* and how many extracts were taken */ @@ -3693,52 +3876,99 @@ ecode += 3; break; - /* Recursion matches the current regex, nested. If there are any capturing - brackets started but not finished, we have to save their starting points - and reinstate them after the recursion. However, we don't know how many - such there are (offset_top records the completed total) so we just have - to save all the potential data. There may be up to 99 such values, which - is a bit large to put on the stack, but using malloc for small numbers - seems expensive. As a compromise, the stack is used when there are fewer - than 16 values to store; otherwise malloc is used. A problem is what to do - if the malloc fails ... there is no way of returning to the top level with - an error. Save the top 15 values on the stack, and accept that the rest - may be wrong. */ + /* Recursion matches either the current regex, or some subexpression. + If there are any capturing brackets started but not finished, we have to + save their starting points and reinstate them after the recursion. However, + we don't know how many such there are (offset_top records the completed total) + so we just have to save all the potential data. There may be up to 65535 such + values, which is too large to put on the stack, but using malloc for small + numbers seems expensive. As a compromise, the stack is used when there are + fewer than 16 values to store; otherwise malloc is used. */ case OP_RECURSE: + case OP_NRECURSE: { BOOL rc; - int *save; + recursion_info new_recursive; int stacksave[15]; + const uschar *callpat, *savestart; - c = md->offset_max; + new_recursive.prev = md->recursive; + md->recursive = &new_recursive; - if (c < 16) save = stacksave; else + if (op == OP_NRECURSE) { - save = (int *)(pcre_malloc)((c+1) * sizeof(int)); - if (save == NULL) + new_recursive.group_num = (ecode[1] << 8) + ecode[2]; + callpat = *((uschar **) &ecode[3]); + } + else + { + new_recursive.group_num = 0; + callpat = md->start_pattern; + } + + if (*callpat == OP_BRAZERO) callpat++; + + ecode++; + if (op == OP_NRECURSE) ecode += 2 + sizeof(uschar *); + new_recursive.after_call = ecode; + + new_recursive.saved_max = md->offset_end; + + if (new_recursive.saved_max < 16) new_recursive.offset_save = stacksave; else + { + new_recursive.offset_save = (int *) + (pcre_malloc)(new_recursive.saved_max * sizeof(int)); + if (new_recursive.offset_save == NULL) { - save = stacksave; - c = 15; + /* Warning: This may cause INCORRECT RESULTS if we run out of memory + here, because we won't be restoring all the stored strings correctly. + We either need proper run-time error handling or, at the very least, + some way to warn the user. Could we just spit a message to stderr? + + Returning error values would be very tedious because of the recursion; + and Philip Hazel says that longjmp() - in many ways the obvious solution - + has previously caused problems on some platforms. + + 2002-04-15 */ + + DPRINTF(("malloc() failed! Results may be wrong!\n")); + new_recursive.offset_save = stacksave; + new_recursive.saved_max = 15; } } - for (i = 1; i <= c; i++) - save[i] = md->offset_vector[md->offset_end - i]; - rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb, - match_isgroup); - for (i = 1; i <= c; i++) - md->offset_vector[md->offset_end - i] = save[i]; - if (save != stacksave) (pcre_free)(save); - if (!rc) return FALSE; + memmove(new_recursive.offset_save, md->offset_vector, + new_recursive.saved_max * sizeof(int)); + new_recursive.save_start = md->start_match; + md->start_match = eptr; - /* In case the recursion has set more capturing values, save the final - number, then move along the subject till after the recursive match, - and advance one byte in the pattern code. */ + DPRINTF(("Recursing into group %d\n", new_recursive.group_num)); - offset_top = md->end_offset_top; - eptr = md->end_match_ptr; - ecode++; + do + { + if (match(eptr, callpat+3, offset_top, md, ims, eptrb, match_isgroup)) + { + if (new_recursive.offset_save != stacksave) + (pcre_free) (new_recursive.offset_save); + return TRUE; + } + + md->recursive = &new_recursive; + memmove(md->offset_vector, new_recursive.offset_save, + new_recursive.saved_max * sizeof(int)); + callpat += (callpat[1] << 8) + callpat[2]; + } + while (*callpat == OP_ALT); + + DPRINTF(("Recursion didn't match\n")); + md->recursive = new_recursive.prev; + if (new_recursive.offset_save != stacksave) + (pcre_free) (new_recursive.offset_save); + return FALSE; + + + /* This point should never be reached */ } break; @@ -3883,7 +4113,7 @@ offset = number << 1; #ifdef DEBUG - printf("end bracket %d", number); + printf("end bracket %d (%d)", number, ecode - md->start_pattern); printf("\n"); #endif @@ -3896,6 +4126,22 @@ md->offset_vector[offset+1] = eptr - md->start_subject; if (offset_top <= offset) offset_top = offset + 2; } + + /* If we were called recursively, we should restore the offsets + appropriately and continue from after the call. */ + + if (md->recursive && md->recursive->group_num == number) + { + recursion_info *rec = md->recursive; + + DPRINTF(("Recursion (%d) succeeded - continuing\n", number)); + md->recursive = rec->prev; + md->start_match = rec->save_start; + memmove(md->offset_vector, rec->offset_save, rec->saved_max * sizeof(int)); + ecode = rec->after_call; + ims = original_ims; + break; + } } } @@ -3929,7 +4175,8 @@ else /* OP_KETRMAX */ { if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) || - match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE; + match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) + return TRUE; } } return FALSE; @@ -4927,6 +5174,8 @@ match_block.lcc = re->tables + lcc_offset; match_block.ctypes = re->tables + ctypes_offset; + +match_block.recursive = NULL; /* The ims options can vary during the matching as a result of the presence of (?ims) items in the pattern. They are kept in a local variable so that --- pcretest.c.orig Mon Apr 15 23:47:31 2002 +++ pcretest.c Fri Apr 19 15:24:24 2002 @@ -157,7 +157,7 @@ "*", "*?", "+", "+?", "?", "??", "{", "{", "{", "*", "*?", "+", "+?", "?", "??", "{", "{", "{", "*", "*?", "+", "+?", "?", "??", "{", "{", - "class", "Ref", "Recurse", + "class", "Ref", "Recurse", "(?nnn)", "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not", "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref", "Brazero", "Braminzero", "Branumber", "Bra" @@ -288,6 +288,16 @@ fprintf(outfile, "%d}", (code[1] << 8) + code[2]); if (*code == OP_NOTMINUPTO) fprintf(outfile, "?"); code += 3; + break; + + case OP_RECURSE: + fprintf(outfile, " (?R)"); + break; + + case OP_NRECURSE: + fprintf(outfile, " (?%d) -> %d", (code[1] << 8) + code[2], + *((uschar **)&code[3]) - ((real_pcre *)re)->code); + code += (2 + sizeof(uschar *)); break; case OP_REF: --- testdata/testoutput2.orig Fri Apr 19 19:14:10 2002 +++ testdata/testoutput2 Fri Apr 19 19:14:26 2002 @@ -1787,7 +1787,7 @@ 0: (abcd(xyz

qrs)123) 1: abcd(xyz

qrs)123 2: 123 - 3:

qrs + 3: /\( ( ( (?>[^()]+) | ((?R)) )* ) \) /x Capturing subpattern count = 3