freebsd/contrib/awk/re.c

/*
 * re.c - compile regular expressions.
 */

/* 
 * Copyright (C) 1991-2000 the Free Software Foundation, Inc.
 * 
 * This file is part of GAWK, the GNU implementation of the
 * AWK Programming Language.
 * 
 * GAWK is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * GAWK is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
 */

#include "awk.h"

/* regex syntax bits; assigned in resetup() and passed to re_set_syntax() and dfasyntax() */
static reg_syntax_t syn;

/*
 * make_regexp --- generate compiled regular expressions
 *
 * s, len	the regexp source text and its length
 * ignorecase	nonzero: compile with casetable as the translate table
 * dfa		nonzero (and ignorecase zero): also build the dfa matcher
 *
 * Returns a freshly allocated Regexp; refree() releases it.  A pattern
 * that re_compile_pattern() rejects is reported through fatal().
 */

Regexp *
make_regexp(s, len, ignorecase, dfa)
char *s;
size_t len;
int ignorecase;
int dfa;
{
	Regexp *rp;
	const char *rerr;
	char *src = s;
	char *temp;
	char *end = s + len;
	register char *dest;
	register int c, c2;

	/* Handle escaped characters first. */

	/*
	 * Build a copy of the string (in dest) with the
	 * escaped characters translated, and generate the regex
	 * from that.
	 *
	 * NOTE(review): the len + 2 sizing presumes that parse_escape()
	 * always advances src past at least the escape letter, so that no
	 * escape yields more output than it consumed input -- confirm
	 * against parse_escape().
	 */
	emalloc(dest, char *, len + 2, "make_regexp");
	temp = dest;

	while (src < end) {
		if (*src == '\\') {
			/*
			 * NOTE(review): when the backslash is the last
			 * character this reads s[len]; presumably callers
			 * always pass a '\0'-terminated string -- confirm.
			 */
			c = *++src;
			switch (c) {
			case 'a':
			case 'b':
			case 'f':
			case 'n':
			case 'r':
			case 't':
			case 'v':
			case 'x':
			case '0':
			case '1':
			case '2':
			case '3':
			case '4':
			case '5':
			case '6':
			case '7':
				c2 = parse_escape(&src);
				if (c2 < 0)
					cant_happen();
				/*
				 * Unix awk treats octal (and hex?) chars
				 * literally in re's, so escape regexp
				 * metacharacters.
				 *
				 * strchr() also matches the terminating '\0'
				 * of the metacharacter list, so a NUL coming
				 * from \0 or \x0 has to be ruled out first;
				 * it is not a metacharacter.
				 */
				if (do_traditional && ! do_posix && (isdigit(c) || c == 'x')
				    && c2 != '\0'
				    && strchr("()|*+?.^$\\[]", c2) != NULL)
					*dest++ = '\\';
				*dest++ = (char) c2;
				break;
			case '8':
			case '9':	/* a\9b not valid */
				*dest++ = c;
				src++;
				break;
			case 'y':	/* normally \b */
				/* gnu regex op */
				if (! do_traditional) {
					*dest++ = '\\';
					*dest++ = 'b';
					src++;
					break;
				}
				/* else, fall through */
			default:
				/* pass the escape through for the regex code to interpret */
				*dest++ = '\\';
				*dest++ = (char) c;
				src++;
				break;
			} /* switch */
		} else
			*dest++ = *src++;	/* not '\\' */
	} /* while */

	*dest = '\0' ;	/* Only necessary if we print dest ? */
	emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");
	memset((char *) rp, 0, sizeof(*rp));
	rp->pat.allocated = 0;	/* regex will allocate the buffer */
	emalloc(rp->pat.fastmap, char *, 256, "make_regexp");

	if (ignorecase)
		rp->pat.translate = casetable;
	else
		rp->pat.translate = NULL;
	len = dest - temp;	/* length of the translated pattern */
	if ((rerr = re_compile_pattern(temp, len, &(rp->pat))) != NULL)
		fatal("%s: /%s/", rerr, temp);

	/* gack. this must be done *after* re_compile_pattern */
	rp->pat.newline_anchor = FALSE; /* don't get \n in middle of string */

	/* the dfa matcher is only built for case-sensitive patterns */
	if (dfa && ! ignorecase) {
		dfacomp(temp, len, &(rp->dfareg), TRUE);
		rp->dfa = TRUE;
	} else
		rp->dfa = FALSE;

	free(temp);
	return rp;
}

/*
 * research --- do a regexp search. use dfa if possible
 *
 * Looks at len bytes of str beginning at offset start.  When rp carries
 * a dfa matcher it runs first: no dfa hit gives -1, and a dfa hit gives 1
 * unless the caller (need_start) or dfaexec() (try_backref) asks for the
 * full regex search.  In every remaining case the value of re_search()
 * is returned.  Without a dfa matcher a NULL str also gives -1.
 */

int
research(rp, str, start, len, need_start)
Regexp *rp;
register char *str;
int start;
register size_t len;
int need_start;
{
	int try_backref;
	int result;

	if (rp->dfa) {
		char saved;
		char *hit;
		int count = 0;

		/*
		 * dfa likes to stick a '\n' right after the matched
		 * text, so remember that byte and put it back afterwards.
		 */
		saved = str[start+len];
		hit = dfaexec(&(rp->dfareg), str+start, str+start+len, TRUE,
					&count, &try_backref);
		str[start+len] = saved;

		/* a dfa miss is final, even when need_start is set */
		if (hit == NULL)
			return -1;
		/* a dfa hit is enough when nobody needs the regex answer */
		if (! need_start && ! try_backref)
			return 1;
	} else if (str == NULL)
		return -1;

	result = re_search(&(rp->pat), str, start+len,
			start, len, &(rp->regs));
#ifdef C_ALLOCA
	/* recover any space from C based alloca */
	(void) alloca(0);
#endif
	return result;
}

/*
 * refree --- free up the dynamic memory used by a compiled regexp
 *
 * Releases the dfa state (if one was built), the register arrays (if
 * present), the fastmap, the pattern buffer, and finally rp itself.
 */

void
refree(rp)
Regexp *rp;
{
	if (rp->dfa)
		dfafree(&(rp->dfareg));

	if (rp->regs.end != NULL)
		free(rp->regs.end);
	if (rp->regs.start != NULL)
		free(rp->regs.start);

	free(rp->pat.fastmap);
	free(rp->pat.buffer);

	free(rp);
}

/*
 * dfaerror --- print an error message for the dfa routines
 *
 * The message s is handed to fatal() as the argument of a "%s" format,
 * never as the format string itself.
 */

void
dfaerror(s)
const char *s;
{
	fatal("%s", s);
}

/*
 * re_update --- recompile a dynamic regexp
 *
 * Returns the compiled regexp for node t, rebuilding it when needed.
 * The CASE bits of t->re_flags record the IGNORECASE value used for the
 * last compile.  If that value is unchanged, a CONST regexp is returned
 * as is, and a dynamic one is returned as is when its expression still
 * evaluates to the saved text (t->re_text).  Otherwise the old compiled
 * regexp is freed and a new one is built from the current text with the
 * current IGNORECASE.
 */

Regexp *
re_update(t)
NODE *t;
{
	NODE *t1;

	if ((t->re_flags & CASE) == IGNORECASE) {
		/* same case setting as the last compile */
		if ((t->re_flags & CONST) != 0)
			return t->re_reg;
		t1 = force_string(tree_eval(t->re_exp));
		if (t->re_text != NULL) {
			if (cmp_nodes(t->re_text, t1) == 0) {
				/* text unchanged too: reuse the compiled form */
				free_temp(t1);
				return t->re_reg;
			}
			unref(t->re_text);
		}
		t->re_text = dupnode(t1);
		free_temp(t1);
	}
	if (t->re_reg != NULL)
		refree(t->re_reg);

	/*
	 * re_cnt is handed to make_regexp() as its dfa argument.  A
	 * positive count grows on every recompile and drops to zero
	 * once it passes 10, after which no dfa matcher is requested.
	 */
	if (t->re_cnt > 0)
		t->re_cnt++;
	if (t->re_cnt > 10)
		t->re_cnt = 0;
	if (t->re_text == NULL || (t->re_flags & CASE) != IGNORECASE) {
		t1 = force_string(tree_eval(t->re_exp));
		/*
		 * Drop the reference to any previously saved text before
		 * replacing it, as the branch above does; otherwise every
		 * change of the case setting leaks the old copy.
		 */
		if (t->re_text != NULL)
			unref(t->re_text);
		t->re_text = dupnode(t1);
		free_temp(t1);
	}
	t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen,
				IGNORECASE, t->re_cnt);
	/* remember which case setting this compile used */
	t->re_flags &= ~CASE;
	t->re_flags |= IGNORECASE;
	return t->re_reg;
}

/* resetup --- choose what kind of regexps we match */

void
resetup()
{
	if (do_posix)
		syn = RE_SYNTAX_POSIX_AWK;	/* strict POSIX re's */
	else if (do_traditional)
		syn = RE_SYNTAX_AWK;		/* traditional Unix awk re's */
	else
		syn = RE_SYNTAX_GNU_AWK;	/* POSIX re's + GNU ops */

	/*
	 * Interval expressions are off by default, since it's likely to
	 * break too many old programs to have them on.
	 */
	if (do_intervals)
		syn |= RE_INTERVALS;

	(void) re_set_syntax(syn);
	dfasyntax(syn, FALSE);
}

/*
 * avoid_dfa --- FIXME: temporary kludge function until we have a new dfa.c
 *
 * Returns TRUE when the regexp text of re contains a '^' or '$' anywhere
 * (escaped or bracketed ones are not told apart) and the len bytes at
 * str contain a newline; returns FALSE otherwise.
 */

int
avoid_dfa(re, str, len)
NODE *re;
char *str;
size_t len;
{
	char *pat;
	int patlen;
	int k;
	int has_anchor = FALSE;
	char *limit;

	/* constant regexps keep their text in re_exp, dynamic ones in re_text */
	if ((re->re_flags & CONST) == 0) {
		pat = re->re_text->stptr;
		patlen = re->re_text->stlen;
	} else {
		pat = re->re_exp->stptr;
		patlen = re->re_exp->stlen;
	}

	/* stop at the first anchor character */
	for (k = 0; k < patlen && ! has_anchor; k++)
		if (pat[k] == '^' || pat[k] == '$')
			has_anchor = TRUE;

	if (! has_anchor)
		return FALSE;

	/* an anchor is present: the answer now depends on a newline in the data */
	limit = str + len;
	while (str < limit)
		if (*str++ == '\n')
			return TRUE;

	return FALSE;
}
Virgin import of GNU awk 3.0.3 1997-10-14 18:17:11 +00:00			`/*`
			`* re.c - compile regular expressions.`
			`*/`

			`/*`
Update vendor branch to gawk-3.0.6. 2000-08-15 10:02:07 +00:00			`* Copyright (C) 1991-2000 the Free Software Foundation, Inc.`
Virgin import of GNU awk 3.0.3 1997-10-14 18:17:11 +00:00			`*`
			`* This file is part of GAWK, the GNU implementation of the`
			`* AWK Programming Language.`
			`*`
			`* GAWK is free software; you can redistribute it and/or modify`
			`* it under the terms of the GNU General Public License as published by`
			`* the Free Software Foundation; either version 2 of the License, or`
			`* (at your option) any later version.`
			`*`
			`* GAWK is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`* GNU General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU General Public License`
			`* along with this program; if not, write to the Free Software`
			`* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA`
			`*/`

			`#include "awk.h"`

			`static reg_syntax_t syn;`

			`/* make_regexp --- generate compiled regular expressions */`

			`Regexp *`
			`make_regexp(s, len, ignorecase, dfa)`
			`char *s;`
			`size_t len;`
			`int ignorecase;`
			`int dfa;`
			`{`
			`Regexp *rp;`
			`const char *rerr;`
			`char *src = s;`
			`char *temp;`
			`char *end = s + len;`
			`register char *dest;`
			`register int c, c2;`

			`/* Handle escaped characters first. */`

			`/*`
			`* Build a copy of the string (in dest) with the`
			`* escaped characters translated, and generate the regex`
			`* from that.`
			`*/`
			`emalloc(dest, char *, len + 2, "make_regexp");`
			`temp = dest;`

			`while (src < end) {`
			`if (*src == '\\') {`
			`c = *++src;`
			`switch (c) {`
			`case 'a':`
			`case 'b':`
			`case 'f':`
			`case 'n':`
			`case 'r':`
			`case 't':`
			`case 'v':`
			`case 'x':`
			`case '0':`
			`case '1':`
			`case '2':`
			`case '3':`
			`case '4':`
			`case '5':`
			`case '6':`
			`case '7':`
			`c2 = parse_escape(&src);`
			`if (c2 < 0)`
			`cant_happen();`
			`/*`
			`* Unix awk treats octal (and hex?) chars`
			`* literally in re's, so escape regexp`
			`* metacharacters.`
			`*/`
			`if (do_traditional && ! do_posix && (isdigit(c) \|\| c == 'x')`
			`&& strchr("()\|*+?.^$\\[]", c2) != NULL)`
			`*dest++ = '\\';`
			`*dest++ = (char) c2;`
			`break;`
			`case '8':`
			`case '9': /* a\9b not valid */`
			`*dest++ = c;`
			`src++;`
			`break;`
			`case 'y': /* normally \b */`
			`/* gnu regex op */`
			`if (! do_traditional) {`
			`*dest++ = '\\';`
			`*dest++ = 'b';`
			`src++;`
			`break;`
			`}`
			`/* else, fall through */`
			`default:`
			`*dest++ = '\\';`
			`*dest++ = (char) c;`
			`src++;`
			`break;`
			`} /* switch */`
			`} else`
			`*dest++ = *src++; /* not '\\' */`
			`} /* for */`

			`*dest = '\0' ; /* Only necessary if we print dest ? */`
			`emalloc(rp, Regexp *, sizeof(*rp), "make_regexp");`
			`memset((char *) rp, 0, sizeof(*rp));`
			`rp->pat.allocated = 0; /* regex will allocate the buffer */`
			`emalloc(rp->pat.fastmap, char *, 256, "make_regexp");`

			`if (ignorecase)`
			`rp->pat.translate = casetable;`
			`else`
			`rp->pat.translate = NULL;`
			`len = dest - temp;`
			`if ((rerr = re_compile_pattern(temp, len, &(rp->pat))) != NULL)`
			`fatal("%s: /%s/", rerr, temp);`

			`/* gack. this must be done *after* re_compile_pattern */`
			`rp->pat.newline_anchor = FALSE; /* don't get \n in middle of string */`
			`if (dfa && ! ignorecase) {`
			`dfacomp(temp, len, &(rp->dfareg), TRUE);`
			`rp->dfa = TRUE;`
			`} else`
			`rp->dfa = FALSE;`

			`free(temp);`
			`return rp;`
			`}`

			`/* research --- do a regexp search. use dfa if possible */`

			`int`
			`research(rp, str, start, len, need_start)`
			`Regexp *rp;`
			`register char *str;`
			`int start;`
			`register size_t len;`
			`int need_start;`
			`{`
			`char *ret = str;`
			`int try_backref;`

			`/*`
			`* Always do dfa search if can; if it fails, then even if`
			`* need_start is true, we won't bother with the regex search.`
			`*/`
			`if (rp->dfa) {`
			`char save;`
			`int count = 0;`

			`/*`
			`* dfa likes to stick a '\n' right after the matched`
			`* text. So we just save and restore the character.`
			`*/`
			`save = str[start+len];`
			`ret = dfaexec(&(rp->dfareg), str+start, str+start+len, TRUE,`
			`&count, &try_backref);`
			`str[start+len] = save;`
			`}`
			`if (ret) {`
			`if (need_start \|\| rp->dfa == FALSE \|\| try_backref) {`
			`int result = re_search(&(rp->pat), str, start+len,`
			`start, len, &(rp->regs));`
			`/* recover any space from C based alloca */`
			`#ifdef C_ALLOCA`
			`(void) alloca(0);`
			`#endif`
			`return result;`
			`} else`
			`return 1;`
			`} else`
			`return -1;`
			`}`

			`/* refree --- free up the dynamic memory used by a compiled regexp */`

			`void`
			`refree(rp)`
			`Regexp *rp;`
			`{`
			`free(rp->pat.buffer);`
			`free(rp->pat.fastmap);`
			`if (rp->regs.start)`
			`free(rp->regs.start);`
			`if (rp->regs.end)`
			`free(rp->regs.end);`
			`if (rp->dfa)`
			`dfafree(&(rp->dfareg));`
			`free(rp);`
			`}`

			`/* dfaerror --- print an error message for the dfa routines */`

			`void`
			`dfaerror(s)`
			`const char *s;`
			`{`
			`fatal("%s", s);`
			`}`

			`/* re_update --- recompile a dynamic regexp */`

			`Regexp *`
			`re_update(t)`
			`NODE *t;`
			`{`
			`NODE *t1;`

			`if ((t->re_flags & CASE) == IGNORECASE) {`
			`if ((t->re_flags & CONST) != 0)`
			`return t->re_reg;`
			`t1 = force_string(tree_eval(t->re_exp));`
			`if (t->re_text != NULL) {`
			`if (cmp_nodes(t->re_text, t1) == 0) {`
			`free_temp(t1);`
			`return t->re_reg;`
			`}`
			`unref(t->re_text);`
			`}`
			`t->re_text = dupnode(t1);`
			`free_temp(t1);`
			`}`
			`if (t->re_reg != NULL)`
			`refree(t->re_reg);`
			`if (t->re_cnt > 0)`
			`t->re_cnt++;`
			`if (t->re_cnt > 10)`
			`t->re_cnt = 0;`
Update vendor branch to gawk-3.0.6. 2000-08-15 10:02:07 +00:00			`if (t->re_text == NULL \|\| (t->re_flags & CASE) != IGNORECASE) {`
Virgin import of GNU awk 3.0.3 1997-10-14 18:17:11 +00:00			`t1 = force_string(tree_eval(t->re_exp));`
			`t->re_text = dupnode(t1);`
			`free_temp(t1);`
			`}`
			`t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen,`
			`IGNORECASE, t->re_cnt);`
			`t->re_flags &= ~CASE;`
			`t->re_flags \|= IGNORECASE;`
			`return t->re_reg;`
			`}`

			`/* resetup --- choose what kind of regexps we match */`

			`void`
			`resetup()`
			`{`
			`if (do_posix)`
			`syn = RE_SYNTAX_POSIX_AWK; /* strict POSIX re's */`
			`else if (do_traditional)`
			`syn = RE_SYNTAX_AWK; /* traditional Unix awk re's */`
			`else`
			`syn = RE_SYNTAX_GNU_AWK; /* POSIX re's + GNU ops */`

			`/*`
			`* Interval expressions are off by default, since it's likely to`
			`* break too many old programs to have them on.`
			`*/`
			`if (do_intervals)`
			`syn \|= RE_INTERVALS;`

			`(void) re_set_syntax(syn);`
			`dfasyntax(syn, FALSE);`
			`}`

			`/* avoid_dfa --- FIXME: temporary kludge function until we have a new dfa.c */`

			`int`
			`avoid_dfa(re, str, len)`
			`NODE *re;`
			`char *str;`
			`size_t len;`
			`{`
			`char *restr;`
			`int relen;`
			`int anchor, i;`
			`char *end;`

			`if ((re->re_flags & CONST) != 0) {`
			`restr = re->re_exp->stptr;`
			`relen = re->re_exp->stlen;`
			`} else {`
			`restr = re->re_text->stptr;`
			`relen = re->re_text->stlen;`
			`}`

			`for (anchor = FALSE, i = 0; i < relen; i++) {`
			`if (restr[i] == '^' \|\| restr[i] == '$') {`
			`anchor = TRUE;`
			`break;`
			`}`
			`}`
			`if (! anchor)`
			`return FALSE;`

			`for (end = str + len; str < end; str++)`
			`if (*str == '\n')`
			`return TRUE;`

			`return FALSE;`
			`}`