439 lines
10 KiB
C
439 lines
10 KiB
C
/* ambig.c -
|
|
Content model ambiguity checking.
|
|
|
|
Written by James Clark (jjc@jclark.com).
|
|
*/
|
|
/*
|
|
This uses the construction in pp8-9 of [1], extended to deal with AND
|
|
groups.
|
|
|
|
Note that it is not correct for the purposes of ambiguity analysis to
|
|
handle AND groups by turning them into an OR group of SEQ groups
|
|
(consider (a&b?)).
|
|
|
|
We build an automaton for the entire content model by adding the
|
|
following case for AND:
|
|
|
|
nullable(v) := nullable(left child) and nullable(right child)
|
|
if nullable(right child) then
|
|
for each x in last(left child) do
|
|
follow(v,x) = follow(left child,x) U first(right child);
|
|
if nullable(left child) then
|
|
for each x in last(right child) do
|
|
follow(v,x) = follow(right child,x) U first(left child);
|
|
first(v) := first(left child) U first(right child);
|
|
last(v) := first(left child) U first(right child);
|
|
|
|
We also build an automaton for each AND group by building automata for
|
|
each of the members of the AND group using the above procedure and
|
|
then combine the members using:
|
|
|
|
for each x in last(left child) do
|
|
follow(v,x) = follow(left child,x) U first(right child);
|
|
for each x in last(right child) do
|
|
follow(v,x) = follow(right child,x) U first(left child);
|
|
first(v) := first(left child) U first(right child);
|
|
|
|
The content model is ambiguous just in case one of these automata is
|
|
non-deterministic. (Note that when checking determinism we need to
|
|
check the `first' set as well as all the `follow' sets.)
|
|
|
|
Why is this correct? Consider a primitive token in a member of an AND
|
|
group. There are two worst cases for ambiguity: firstly, when none of
|
|
the other members of AND group have been matched; secondly, when just
|
|
the nullable members remain to be matched. The first case is not
|
|
affected by context of the AND group (unless the first case is
|
|
identical to the second case.)
|
|
|
|
Note that inclusions are not relevant for the purposes of determining
|
|
the ambiguity of content models. Otherwise the case in clause
|
|
11.2.5.1:
|
|
|
|
An element that can satisfy an element in the content model is
|
|
considered to do so, even if the element is also an inclusion.
|
|
|
|
could never arise.
|
|
|
|
[1] Anne Brueggemann-Klein, Regular Expressions into Finite Automata,
|
|
Universitaet Freiburg, Institut fur Informatik, 33 July 1991.
|
|
*/
|
|
|
|
#include "sgmlincl.h"
|
|
|
|
/* Sets of states are represented by 0-terminated, ordered lists of
|
|
indexes in gbuf. */
|
|
|
|
#define MAXSTATES (GRPGTCNT+2)
|
|
#define listcat(x, y) strcat((char *)(x), (char *)(y))
|
|
#define listcpy(x, y) strcpy((char *)(x), (char *)(y))
|
|
|
|
/* Information about a content token. */
|
|
|
|
struct contoken {
|
|
UNCH size;
|
|
UNCH nullable;
|
|
UNCH *first;
|
|
UNCH *last;
|
|
};
|
|
|
|
static VOID contoken P((int, int, struct contoken *));
|
|
static VOID andgroup P((int, int, struct contoken *));
|
|
static VOID orgroup P((int, int, struct contoken *));
|
|
static VOID seqgroup P((int, int, struct contoken *));
|
|
static VOID andambig P((int));
|
|
static int listambig P((UNCH *));
|
|
static VOID listmerge P((UNCH *, UNCH *));
|
|
static struct contoken *newcontoken P((void));
|
|
static VOID freecontoken P((struct contoken *));
|
|
|
|
|
|
/* Dynamically allocated vector of follow sets. */
|
|
|
|
static UNCH **follow;
|
|
static UNCH *mergebuf; /* for use by listmerge */
|
|
|
|
/* Set to non-zero if the content model is ambiguous. */
|
|
|
|
static int ambigsw;
|
|
|
|
/* Check the current content model (in gbuf) for ambiguity. */
|
|
|
|
VOID ambig()
|
|
{
|
|
struct contoken *s;
|
|
int i;
|
|
|
|
if (!follow) {
|
|
/* We can't allocate everything in one chunk, because that would
|
|
overflow a 16-bit unsigned if GRPGTCNT was 253. */
|
|
UNCH *ptr;
|
|
follow = (UNCH **)rmalloc(MAXSTATES*sizeof(UNCH *));
|
|
follow[0] = 0;
|
|
ptr = (UNCH *)rmalloc((MAXSTATES - 1)*MAXSTATES);
|
|
for (i = 1; i < MAXSTATES; i++) {
|
|
follow[i] = ptr;
|
|
ptr += MAXSTATES;
|
|
}
|
|
mergebuf = (UNCH *)rmalloc(MAXSTATES);
|
|
}
|
|
|
|
for (i = 1; i < MAXSTATES; i++)
|
|
follow[i][0] = 0;
|
|
|
|
ambigsw = 0;
|
|
|
|
s = newcontoken();
|
|
contoken(1, 1, s);
|
|
|
|
ambigsw = ambigsw || listambig(s->first);
|
|
|
|
freecontoken(s);
|
|
|
|
for (i = 1; !ambigsw && i < MAXSTATES; i++)
|
|
if (listambig(follow[i]))
|
|
ambigsw = 1;
|
|
|
|
if (ambigsw)
|
|
mderr(137, (UNCH *)0, (UNCH *)0);
|
|
}
|
|
|
|
/* Free memory used for ambiguity checking. */
|
|
|
|
VOID ambigfree()
|
|
{
|
|
if (follow) {
|
|
frem((UNIV)follow[1]);
|
|
frem((UNIV)follow);
|
|
frem((UNIV)mergebuf);
|
|
follow = 0;
|
|
}
|
|
}
|
|
|
|
/* Determine whether a list of primitive content tokens (each
|
|
represented by its index in gbuf) is ambiguous. */
|
|
|
|
static
|
|
int listambig(list)
|
|
UNCH *list;
|
|
{
|
|
UNCH *p;
|
|
int chars = 0;
|
|
int rc = 0;
|
|
|
|
for (p = list; *p; p++) {
|
|
if ((gbuf[*p].ttype & TTMASK) == TTETD) {
|
|
struct etd *e = gbuf[*p].tu.thetd;
|
|
if (e->mark) {
|
|
rc = 1;
|
|
break;
|
|
}
|
|
e->mark = 1;
|
|
}
|
|
else {
|
|
assert((gbuf[*p].ttype & TTMASK) == TTCHARS);
|
|
if (chars) {
|
|
rc = 1;
|
|
break;
|
|
}
|
|
chars = 1;
|
|
}
|
|
}
|
|
|
|
for (p = list; *p; p++)
|
|
if ((gbuf[*p].ttype & TTMASK) == TTETD)
|
|
gbuf[*p].tu.thetd->mark = 0;
|
|
|
|
return rc;
|
|
}
|
|
|
|
|
|
/* Analyze a content token. The `checkand' argument is needed to ensure
|
|
that the algorithm is not exponential in the AND-group nesting depth.
|
|
*/
|
|
|
|
static
|
|
VOID contoken(m, checkand, res)
|
|
int m; /* Index of content token in gbuf */
|
|
int checkand; /* Non-zero if AND groups should be checked */
|
|
struct contoken *res; /* Result */
|
|
{
|
|
UNCH flags = gbuf[m].ttype;
|
|
switch (flags & TTMASK) {
|
|
case TTCHARS:
|
|
case TTETD:
|
|
res->first[0] = m;
|
|
res->first[1] = 0;
|
|
res->last[0] = m;
|
|
res->last[1] = 0;
|
|
res->size = 1;
|
|
res->nullable = 0;
|
|
break;
|
|
case TTAND:
|
|
if (checkand)
|
|
andambig(m);
|
|
andgroup(m, checkand, res);
|
|
break;
|
|
case TTOR:
|
|
orgroup(m, checkand, res);
|
|
break;
|
|
case TTSEQ:
|
|
seqgroup(m, checkand, res);
|
|
break;
|
|
default:
|
|
abort();
|
|
}
|
|
if (flags & TREP) {
|
|
UNCH *p;
|
|
for (p = res->last; *p; p++)
|
|
listmerge(follow[*p], res->first);
|
|
}
|
|
if (flags & TOPT)
|
|
res->nullable = 1;
|
|
}
|
|
|
|
/* Check an AND group for ambiguity. */
|
|
|
|
static
|
|
VOID andambig(m)
|
|
int m;
|
|
{
|
|
int i, tnum;
|
|
int lim;
|
|
struct contoken *curr;
|
|
struct contoken *next;
|
|
|
|
tnum = gbuf[m].tu.tnum;
|
|
assert(tnum > 0);
|
|
curr = newcontoken();
|
|
next = newcontoken();
|
|
contoken(m + 1, 0, curr);
|
|
i = m + 1 + curr->size;
|
|
curr->size += 1;
|
|
for (--tnum; tnum > 0; --tnum) {
|
|
UNCH *p;
|
|
contoken(i, 0, next);
|
|
curr->size += next->size;
|
|
i += next->size;
|
|
for (p = curr->last; *p; p++)
|
|
listcat(follow[*p], next->first);
|
|
for (p = next->last; *p; p++)
|
|
listmerge(follow[*p], curr->first);
|
|
listcat(curr->first, next->first);
|
|
listcat(curr->last, next->last);
|
|
}
|
|
lim = m + curr->size;
|
|
for (i = m + 1; i < lim; i++) {
|
|
if (listambig(follow[i]))
|
|
ambigsw = 1;
|
|
follow[i][0] = 0;
|
|
}
|
|
freecontoken(curr);
|
|
freecontoken(next);
|
|
}
|
|
|
|
/* Handle an AND group. */
|
|
|
|
static
|
|
VOID andgroup(m, checkand, res)
|
|
int m;
|
|
int checkand;
|
|
struct contoken *res;
|
|
{
|
|
int i, tnum;
|
|
/* union of the first sets of nullable members of the group */
|
|
UNCH *nullablefirst;
|
|
struct contoken *next;
|
|
|
|
tnum = gbuf[m].tu.tnum;
|
|
assert(tnum > 0);
|
|
contoken(m + 1, checkand, res);
|
|
nullablefirst = (UNCH *)rmalloc(MAXSTATES);
|
|
if (res->nullable)
|
|
listcpy(nullablefirst, res->first);
|
|
else
|
|
nullablefirst[0] = 0;
|
|
i = m + 1 + res->size;
|
|
res->size += 1;
|
|
next = newcontoken();
|
|
for (--tnum; tnum > 0; --tnum) {
|
|
UNCH *p;
|
|
contoken(i, checkand, next);
|
|
res->size += next->size;
|
|
i += next->size;
|
|
if (next->nullable)
|
|
for (p = res->last; *p; p++)
|
|
listcat(follow[*p], next->first);
|
|
for (p = next->last; *p; p++)
|
|
listmerge(follow[*p], nullablefirst);
|
|
listcat(res->first, next->first);
|
|
if (next->nullable)
|
|
listcat(nullablefirst, next->first);
|
|
listcat(res->last, next->last);
|
|
res->nullable &= next->nullable;
|
|
}
|
|
frem((UNIV)nullablefirst);
|
|
freecontoken(next);
|
|
}
|
|
|
|
/* Handle a SEQ group. */
|
|
|
|
static
|
|
VOID seqgroup(m, checkand, res)
|
|
int m;
|
|
int checkand;
|
|
struct contoken *res;
|
|
{
|
|
int i, tnum;
|
|
struct contoken *next;
|
|
|
|
tnum = gbuf[m].tu.tnum;
|
|
assert(tnum > 0);
|
|
contoken(m + 1, checkand, res);
|
|
i = m + 1 + res->size;
|
|
res->size += 1;
|
|
next = newcontoken();
|
|
for (--tnum; tnum > 0; --tnum) {
|
|
UNCH *p;
|
|
contoken(i, checkand, next);
|
|
res->size += next->size;
|
|
i += next->size;
|
|
for (p = res->last; *p; p++)
|
|
listcat(follow[*p], next->first);
|
|
if (res->nullable)
|
|
listcat(res->first, next->first);
|
|
if (next->nullable)
|
|
listcat(res->last, next->last);
|
|
else
|
|
listcpy(res->last, next->last);
|
|
res->nullable &= next->nullable;
|
|
}
|
|
freecontoken(next);
|
|
}
|
|
|
|
/* Handle an OR group. */
|
|
|
|
static
|
|
VOID orgroup(m, checkand, res)
|
|
int m;
|
|
int checkand;
|
|
struct contoken *res;
|
|
{
|
|
int i, tnum;
|
|
struct contoken *next;
|
|
|
|
tnum = gbuf[m].tu.tnum;
|
|
assert(tnum > 0);
|
|
contoken(m + 1, checkand, res);
|
|
i = m + 1 + res->size;
|
|
res->size += 1;
|
|
next = newcontoken();
|
|
for (--tnum; tnum > 0; --tnum) {
|
|
contoken(i, checkand, next);
|
|
res->size += next->size;
|
|
i += next->size;
|
|
listcat(res->first, next->first);
|
|
listcat(res->last, next->last);
|
|
res->nullable |= next->nullable;
|
|
}
|
|
freecontoken(next);
|
|
}
|
|
|
|
|
|
/* Merge the second ordered list into the first. */
|
|
|
|
static
|
|
VOID listmerge(p, b)
|
|
UNCH *p, *b;
|
|
{
|
|
UNCH *a = mergebuf;
|
|
|
|
strcpy((char *)a, (char *)p);
|
|
|
|
for (;;) {
|
|
if (*a) {
|
|
if (*b) {
|
|
if (*a < *b)
|
|
*p++ = *a++;
|
|
else if (*a > *b)
|
|
*p++ = *b++;
|
|
else
|
|
a++;
|
|
}
|
|
else
|
|
*p++ = *a++;
|
|
}
|
|
else if (*b)
|
|
*p++ = *b++;
|
|
else
|
|
break;
|
|
}
|
|
*p = '\0';
|
|
}
|
|
|
|
static
|
|
struct contoken *newcontoken()
|
|
{
|
|
struct contoken *p = (struct contoken *)rmalloc(sizeof(struct contoken)
|
|
+ MAXSTATES*2);
|
|
p->first = (UNCH *)(p + 1);
|
|
p->last = p->first + MAXSTATES;
|
|
return p;
|
|
}
|
|
|
|
static
|
|
VOID freecontoken(p)
|
|
struct contoken *p;
|
|
{
|
|
frem((UNIV)p);
|
|
}
|
|
|
|
/*
|
|
Local Variables:
|
|
c-indent-level: 5
|
|
c-continued-statement-offset: 5
|
|
c-brace-offset: -5
|
|
c-argdecl-indent: 0
|
|
c-label-offset: -5
|
|
End:
|
|
*/
|