1
0
mirror of https://git.savannah.gnu.org/git/emacs.git synced 2024-12-15 09:47:20 +00:00

(boyer_moore): Use zero as marker value for a possible

match instead of depending on overflow behavior.
This commit is contained in:
Andreas Schwab 2009-04-16 09:30:45 +00:00
parent 05fcb8daf6
commit f4646fffc2
2 changed files with 73 additions and 97 deletions

View File

@ -1,3 +1,8 @@
2009-04-16 Andreas Schwab <schwab@linux-m68k.org>
* search.c (boyer_moore): Use zero as marker value for a possible
match instead of depending on overflow behavior.
2009-04-16 Chong Yidong <cyd@stupidchicken.com>
* keyboard.c (adjust_point_for_property): Disable 2009-02-12

View File

@ -1729,9 +1729,8 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
{
int direction = ((n > 0) ? 1 : -1);
register int dirlen;
int infinity, limit, stride_for_teases = 0;
register int *BM_tab;
int *BM_tab_base;
int limit, stride_for_teases = 0;
int BM_tab[0400];
register unsigned char *cursor, *p_limit;
register int i, j;
unsigned char *pat, *pat_end;
@ -1747,37 +1746,28 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
int translate_prev_byte3 = 0;
int translate_prev_byte4 = 0;
BM_tab = (int *) alloca (0400 * sizeof (int));
/* The general approach is that we are going to maintain that we know
the first (closest to the present position, in whatever direction
we're searching) character that could possibly be the last
(furthest from present position) character of a valid match. We
advance the state of our knowledge by looking at that character
and seeing whether it indeed matches the last character of the
pattern. If it does, we take a closer look. If it does not, we
move our pointer (to putative last characters) as far as is
logically possible. This amount of movement, which I call a
stride, will be the length of the pattern if the actual character
appears nowhere in the pattern, otherwise it will be the distance
from the last occurrence of that character to the end of the
pattern. If the amount is zero we have a possible match. */
/* The general approach is that we are going to maintain that we know */
/* the first (closest to the present position, in whatever direction */
/* we're searching) character that could possibly be the last */
/* (furthest from present position) character of a valid match. We */
/* advance the state of our knowledge by looking at that character */
/* and seeing whether it indeed matches the last character of the */
/* pattern. If it does, we take a closer look. If it does not, we */
/* move our pointer (to putative last characters) as far as is */
/* logically possible. This amount of movement, which I call a */
/* stride, will be the length of the pattern if the actual character */
/* appears nowhere in the pattern, otherwise it will be the distance */
/* from the last occurrence of that character to the end of the */
/* pattern. */
/* As a coding trick, an enormous stride is coded into the table for */
/* characters that match the last character. This allows use of only */
/* a single test, a test for having gone past the end of the */
/* permissible match region, to test for both possible matches (when */
/* the stride goes past the end immediately) and failure to */
/* match (where you get nudged past the end one stride at a time). */
/* Here we make a "mickey mouse" BM table. The stride of the search */
/* is determined only by the last character of the putative match. */
/* If that character does not match, we will stride the proper */
/* distance to propose a match that superimposes it on the last */
/* instance of a character that matches it (per trt), or misses */
/* it entirely if there is none. */
/* Here we make a "mickey mouse" BM table. The stride of the search
is determined only by the last character of the putative match.
If that character does not match, we will stride the proper
distance to propose a match that superimposes it on the last
instance of a character that matches it (per trt), or misses
it entirely if there is none. */
dirlen = len_byte * direction;
infinity = dirlen - (lim_byte + pos_byte + len_byte + len_byte) * direction;
/* Record position after the end of the pattern. */
pat_end = base_pat + len_byte;
@ -1787,23 +1777,14 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
if (direction < 0)
base_pat = pat_end - 1;
BM_tab_base = BM_tab;
BM_tab += 0400;
j = dirlen; /* to get it in a register */
/* A character that does not appear in the pattern induces a */
/* stride equal to the pattern length. */
while (BM_tab_base != BM_tab)
{
*--BM_tab = j;
*--BM_tab = j;
*--BM_tab = j;
*--BM_tab = j;
}
/* A character that does not appear in the pattern induces a
stride equal to the pattern length. */
for (i = 0; i < 0400; i++)
BM_tab[i] = dirlen;
/* We use this for translation, instead of TRT itself.
We fill this in to handle the characters that actually
occur in the pattern. Others don't matter anyway! */
bzero (simple_translate, sizeof simple_translate);
for (i = 0; i < 0400; i++)
simple_translate[i] = i;
@ -1828,12 +1809,10 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
}
i = 0;
while (i != infinity)
while (i != dirlen)
{
unsigned char *ptr = base_pat + i;
i += direction;
if (i == dirlen)
i = infinity;
if (! NILP (trt))
{
/* If the byte currently looking at is the last of a
@ -1861,7 +1840,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
else
j = *ptr;
if (i == infinity)
if (i == dirlen)
stride_for_teases = BM_tab[j];
BM_tab[j] = dirlen - i;
@ -1894,17 +1873,16 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
{
j = *ptr;
if (i == infinity)
if (i == dirlen)
stride_for_teases = BM_tab[j];
BM_tab[j] = dirlen - i;
}
/* stride_for_teases tells how much to stride if we get a */
/* match on the far character but are subsequently */
/* disappointed, by recording what the stride would have been */
/* for that character if the last character had been */
/* different. */
/* stride_for_teases tells how much to stride if we get a
match on the far character but are subsequently
disappointed, by recording what the stride would have been
for that character if the last character had been
different. */
}
infinity = dirlen - infinity;
pos_byte += dirlen - ((direction > 0) ? direction : 0);
/* loop invariant - POS_BYTE points at where last char (first
char if reverse) of pattern would align in a possible match. */
@ -1948,43 +1926,34 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
p_limit = BYTE_POS_ADDR (limit);
p2 = (cursor = BYTE_POS_ADDR (pos_byte));
/* In this loop, pos + cursor - p2 is the surrogate for pos */
/* In this loop, pos + cursor - p2 is the surrogate for pos. */
while (1) /* use one cursor setting as long as i can */
{
if (direction > 0) /* worth duplicating */
{
/* Use signed comparison if appropriate
to make cursor+infinity sure to be > p_limit.
Assuming that the buffer lies in a range of addresses
that are all "positive" (as ints) or all "negative",
either kind of comparison will work as long
as we don't step by infinity. So pick the kind
that works when we do step by infinity. */
if ((EMACS_INT) (p_limit + infinity) > (EMACS_INT) p_limit)
while ((EMACS_INT) cursor <= (EMACS_INT) p_limit)
cursor += BM_tab[*cursor];
else
while ((EMACS_UINT) cursor <= (EMACS_UINT) p_limit)
while (cursor <= p_limit)
{
if (BM_tab[*cursor] == 0)
goto hit;
cursor += BM_tab[*cursor];
}
}
else
{
if ((EMACS_INT) (p_limit + infinity) < (EMACS_INT) p_limit)
while ((EMACS_INT) cursor >= (EMACS_INT) p_limit)
cursor += BM_tab[*cursor];
else
while ((EMACS_UINT) cursor >= (EMACS_UINT) p_limit)
while (cursor >= p_limit)
{
if (BM_tab[*cursor] == 0)
goto hit;
cursor += BM_tab[*cursor];
}
}
/* If you are here, cursor is beyond the end of the searched region. */
/* This can happen if you match on the far character of the pattern, */
/* because the "stride" of that character is infinity, a number able */
/* to throw you well beyond the end of the search. It can also */
/* happen if you fail to match within the permitted region and would */
/* otherwise try a character beyond that region */
if ((cursor - p_limit) * direction <= len_byte)
break; /* a small overrun is genuine */
cursor -= infinity; /* large overrun = hit */
/* If you are here, cursor is beyond the end of the
searched region. You fail to match within the
permitted region and would otherwise try a character
beyond that region. */
break;
hit:
i = dirlen - direction;
if (! NILP (trt))
{
@ -2056,8 +2025,8 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
pos_byte += cursor - p2;
}
else
/* Now we'll pick up a clump that has to be done the hard */
/* way because it covers a discontinuity */
/* Now we'll pick up a clump that has to be done the hard
way because it covers a discontinuity. */
{
limit = ((direction > 0)
? BUFFER_CEILING_OF (pos_byte - dirlen + 1)
@ -2069,19 +2038,21 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
and still be valid for a possible match. */
while (1)
{
/* This loop can be coded for space rather than */
/* speed because it will usually run only once. */
/* (the reach is at most len + 21, and typically */
/* does not exceed len) */
/* This loop can be coded for space rather than
speed because it will usually run only once.
(the reach is at most len + 21, and typically
does not exceed len). */
while ((limit - pos_byte) * direction >= 0)
pos_byte += BM_tab[FETCH_BYTE (pos_byte)];
/* now run the same tests to distinguish going off the */
/* end, a match or a phony match. */
if ((pos_byte - limit) * direction <= len_byte)
break; /* ran off the end */
/* Found what might be a match.
Set POS_BYTE back to last (first if reverse) pos. */
pos_byte -= infinity;
{
int ch = FETCH_BYTE (pos_byte);
if (BM_tab[ch] == 0)
goto hit2;
pos_byte += BM_tab[ch];
}
break; /* ran off the end */
hit2:
/* Found what might be a match. */
i = dirlen - direction;
while ((i -= direction) + direction != 0)
{
@ -2110,7 +2081,7 @@ boyer_moore (n, base_pat, len, len_byte, trt, inverse_trt,
/* Above loop has moved POS_BYTE part or all the way
back to the first pos (last pos if reverse).
Set it once again at the last (first if reverse) char. */
pos_byte += dirlen - i- direction;
pos_byte += dirlen - i - direction;
if (i + direction == 0)
{
int position, start, end;