1
0
mirror of https://git.FreeBSD.org/src.git synced 2024-12-05 09:14:03 +00:00

Merge local changes.

This commit is contained in:
Tim J. Robbins 2004-07-04 10:02:03 +00:00
parent d1e9179e8c
commit e5978bf334
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=131557
9 changed files with 2673 additions and 880 deletions

File diff suppressed because it is too large Load Diff

View File

@ -24,18 +24,24 @@
In addition to clobbering modularity, we eat up valuable
name space. */
# undef PARAMS
#if __STDC__
#ifdef __STDC__
# ifndef _PTR_T
# define _PTR_T
typedef void * ptr_t;
# endif
# define PARAMS(x) x
#else
# ifndef _PTR_T
# define _PTR_T
typedef char * ptr_t;
# endif
#endif
#ifdef PARAMS
# undef PARAMS
#endif
#if PROTOTYPES
# define PARAMS(x) x
#else
# define PARAMS(x) ()
#endif
@ -138,6 +144,21 @@ typedef enum
RPAREN, /* RPAREN never appears in the parse tree. */
CRANGE, /* CRANGE never appears in the parse tree.
It stands for a character range that can
match a string of one or more characters.
For example, [a-z] can match "ch" in
a Spanish locale. */
#ifdef MBS_SUPPORT
ANYCHAR, /* ANYCHAR is a terminal symbol that matches
any multibyte(or singlebyte) characters.
It is used only if MB_CUR_MAX > 1. */
MBCSET, /* MBCSET is similar to CSET, but for
multibyte characters. */
#endif /* MBS_SUPPORT */
CSET /* CSET and (and any value greater) is a
terminal symbol that matches any of a
class of characters. */
@ -225,6 +246,12 @@ typedef struct
char backref; /* True if this state matches a \<digit>. */
unsigned char constraint; /* Constraint for this state to accept. */
int first_end; /* Token value of the first END in elems. */
#ifdef MBS_SUPPORT
position_set mbps; /* Positions which can match multibyte
characters. e.g. period.
This field is used only if
MB_CUR_MAX > 1. */
#endif
} dfa_state;
/* Element of a list of strings, at least one of which is known to
@ -236,6 +263,26 @@ struct dfamust
struct dfamust *next;
};
#ifdef MBS_SUPPORT
/* A bracket operator, e.g. [a-c], [[:alpha:]], etc.
Each kind of item that can appear inside the brackets is held in its
own array, paired with an int count.  (NOTE(review): the counts are
presumably the number of valid entries in the matching array, and the
arrays presumably heap-allocated by the parser -- confirm in dfa.c.) */
struct mb_char_classes
{
int invert; /* Presumably nonzero for a complemented list ([^...]) -- confirm. */
wchar_t *chars; /* Normal characters. */
int nchars; /* Count for chars. */
wctype_t *ch_classes; /* Character classes. */
int nch_classes; /* Count for ch_classes. */
wchar_t *range_sts; /* Range characters (start of the range). */
wchar_t *range_ends; /* Range characters (end of the range). */
int nranges; /* Count shared by range_sts and range_ends. */
char **equivs; /* Equivalence classes. */
int nequivs; /* Count for equivs. */
char **coll_elems; /* Collating elements. */
int ncoll_elems; /* Count for coll_elems. */
};
#endif
/* A compiled regular expression. */
struct dfa
{
@ -254,6 +301,32 @@ struct dfa
int nleaves; /* Number of leaves on the parse tree. */
int nregexps; /* Count of parallel regexps being built
with dfaparse(). */
#ifdef MBS_SUPPORT
/* The following fields are used only if MB_CUR_MAX > 1, i.e. in multibyte environments. */
int nmultibyte_prop;
int *multibyte_prop;
/* The value of multibyte_prop[i] is defined by following rule.
if tokens[i] < NOTCHAR
bit 1 : tokens[i] is a singlebyte character, or the last-byte of
a multibyte character.
bit 0 : tokens[i] is a singlebyte character, or the 1st-byte of
a multibyte character.
if tokens[i] = MBCSET
("the index of the mbcsets entry corresponding to this operator" << 2) + 3
e.g.
tokens
= 'single_byte_a', 'multi_byte_A', 'single_byte_b'
= 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b'
multibyte_prop
= 3 , 1 , 0 , 2 , 3
*/
/* Array of the bracket expressions in the DFA. */
struct mb_char_classes *mbcsets;
int nmbcsets;
int mbcsets_alloc;
#endif
/* Stuff owned by the state builder. */
dfa_state *states; /* States of the dfa. */
@ -292,13 +365,6 @@ struct dfa
on a state that potentially could do so. */
int *success; /* Table of acceptance conditions used in
dfaexec and computed in build_state. */
int *newlines; /* Transitions on newlines. The entry for a
newline in any transition table is always
-1 so we can count lines without wasting
too many cycles. The transition for a
newline is stored separately and handled
as a special case. Newline is also used
as a sentinel at the end of the buffer. */
struct dfamust *musts; /* List of strings, at least one of which
is known to appear in any r.e. matching
the dfa. */
@ -325,26 +391,21 @@ struct dfa
/* dfasyntax() takes three arguments; the first sets the syntax bits described
earlier in this file, the second sets the case-folding flag, and the
third specifies the line terminator. */
extern void dfasyntax PARAMS ((reg_syntax_t, int, int));
extern void dfasyntax PARAMS ((reg_syntax_t, int, unsigned char));
/* Compile the given string of the given length into the given struct dfa.
Final argument is a flag specifying whether to build a searching or an
exact matcher. */
extern void dfacomp PARAMS ((char *, size_t, struct dfa *, int));
extern void dfacomp PARAMS ((char const *, size_t, struct dfa *, int));
/* Execute the given struct dfa on the buffer of characters. The
first char * points to the beginning, and the second points to the
first character after the end of the buffer, which must be a writable
place so a sentinel end-of-buffer marker can be stored there. The
second-to-last argument is a flag telling whether to allow newlines to
be part of a string matching the regexp. The next-to-last argument,
if non-NULL, points to a place to increment every time we see a
newline. The final argument, if non-NULL, points to a flag that will
last byte of the buffer must equal the end-of-line byte.
The final argument points to a flag that will
be set if further examination by a backtracking matcher is needed in
order to verify backreferencing; otherwise the flag will be cleared.
Returns NULL if no match is found, or a pointer to the first
Returns (size_t) -1 if no match is found, or the offset of the first
character after the first & shortest matching string in the buffer. */
extern char *dfaexec PARAMS ((struct dfa *, char *, char *, int, int *, int *));
extern size_t dfaexec PARAMS ((struct dfa *, char const *, size_t, int *));
/* Free the storage held by the components of a struct dfa. */
extern void dfafree PARAMS ((struct dfa *));
@ -355,7 +416,7 @@ extern void dfafree PARAMS ((struct dfa *));
extern void dfainit PARAMS ((struct dfa *));
/* Incrementally parse a string of given length into a struct dfa. */
extern void dfaparse PARAMS ((char *, size_t, struct dfa *));
extern void dfaparse PARAMS ((char const *, size_t, struct dfa *));
/* Analyze a parsed regexp; second argument tells whether to build a searching
or an exact matcher. */
@ -369,6 +430,5 @@ extern void dfastate PARAMS ((int, struct dfa *, int []));
/* dfaerror() is called by the regexp routines whenever an error occurs. It
takes a single argument, a NUL-terminated string describing the error.
The default dfaerror() prints the error message to stderr and exits.
The user can provide a different dfafree() if so desired. */
The user must supply a dfaerror. */
extern void dfaerror PARAMS ((const char *));

View File

@ -4,6 +4,11 @@
#ifndef HAVE_GETPAGESIZE
#if !defined getpagesize && defined __BEOS__
# include <OS.h>
# define getpagesize() B_PAGE_SIZE
#endif
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif

View File

@ -13,7 +13,7 @@
.de Id
.ds Dt \\$4
..
.Id $Id: grep.1,v 1.11 2000/02/26 03:18:40 alainm Exp $
.Id $Id: grep.1,v 1.23 2002/01/22 13:20:04 bero Exp $
.TH GREP 1 \*(Dt "GNU Project"
.SH NAME
grep, egrep, fgrep, zgrep, zegrep, zfgrep,
@ -72,6 +72,9 @@ is the same as
Print
.I NUM
lines of trailing context after matching lines.
Places a line containing
.B \-\^\-
between contiguous groups of matches.
.TP
.BR \-a ", " \-\^\-text
Process a binary file as if it were text; this is equivalent to the
@ -82,11 +85,17 @@ option.
Print
.I NUM
lines of leading context before matching lines.
Places a line containing
.B \-\^\-
between contiguous groups of matches.
.TP
\fB\-C\fP [\fINUM\fP], \fB\-\fP\fINUM\fP, \fB\-\^\-context\fP[\fB=\fP\fINUM\fP]
.BI \-C " NUM" "\fR,\fP \-\^\-context=" NUM
Print
.I NUM
lines (default 2) of output context.
lines of output context.
Places a line containing
.B \-\^\-
between contiguous groups of matches.
.TP
.BR \-b ", " \-\^\-byte-offset
Print the byte offset within the input file before
@ -127,6 +136,11 @@ might output binary garbage,
which can have nasty side effects if the output is a terminal and if the
terminal driver interprets some of it as commands.
.TP
.BI \-\^\-colour[=\fIWHEN\fR] ", " \-\^\-color[=\fIWHEN\fR]
Surround the matching string with the marker found in the
.B GREP_COLOR
environment variable. WHEN may be `never', `always', or `auto'.
.TP
.BR \-c ", " \-\^\-count
Suppress normal output; instead print a count of
matching lines for each input file.
@ -134,6 +148,20 @@ With the
.BR \-v ", " \-\^\-invert-match
option (see below), count non-matching lines.
.TP
.BI \-D " ACTION" "\fR,\fP \-\^\-devices=" ACTION
If an input file is a device, FIFO or socket, use
.I ACTION
to process it. By default,
.I ACTION
is
.BR read ,
which means that devices are read just as if they were ordinary files.
If
.I ACTION
is
.BR skip ,
devices are silently skipped.
.TP
.BI \-d " ACTION" "\fR,\fP \-\^\-directories=" ACTION
If an input file is a directory, use
.I ACTION
@ -173,6 +201,10 @@ Interpret
.I PATTERN
as a list of fixed strings, separated by newlines,
any of which is to be matched.
.TP
.BR \-P ", " \-\^\-perl-regexp
Interpret
.I PATTERN
as a Perl regular expression.
.TP
.BI \-f " FILE" "\fR,\fP \-\^\-file=" FILE
Obtain patterns from
@ -218,6 +250,39 @@ the name of each input file from which output
would normally have been printed. The scanning will
stop on the first match.
.TP
.BI \-m " NUM" "\fR,\fP \-\^\-max-count=" NUM
Stop reading a file after
.I NUM
matching lines. If the input is standard input from a regular file,
and
.I NUM
matching lines are output,
.B grep
ensures that the standard input is positioned to just after the last
matching line before exiting, regardless of the presence of trailing
context lines. This enables a calling process to resume a search.
When
.B grep
stops after
.I NUM
matching lines, it outputs any trailing context lines. When the
.B \-c
or
.B \-\^\-count
option is also used,
.B grep
does not output a count greater than
.IR NUM .
When the
.B \-v
or
.B \-\^\-invert-match
option is also used,
.B grep
stops after outputting
.I NUM
non-matching lines.
.TP
.B \-\^\-mmap
If possible, use the
.BR mmap (2)
@ -237,21 +302,43 @@ is operating, or if an I/O error occurs.
Prefix each line of output with the line number
within its input file.
.TP
.BR \-o ", " \-\^\-only-matching
Show only the part of a matching line that matches
.I PATTERN.
.TP
.BI \-\^\-label= LABEL
Displays input actually coming from standard input as input coming from file
.I LABEL.
This is especially useful for tools like zgrep, e.g.
.B "gzip -cd foo.gz |grep --label=foo something"
.TP
.BR \-\^\-line-buffering
Use line buffering; this can incur a performance penalty.
.TP
.BR \-q ", " \-\^\-quiet ", " \-\^\-silent
Quiet; suppress normal output. The scanning will stop
on the first match.
Quiet; do not write anything to standard output.
Exit immediately with zero status if any match is found,
even if an error was detected.
Also see the
.B \-s
or
.B \-\^\-no-messages
option below.
option.
.TP
.BR \-r ", " \-\^\-recursive
.BR \-R ", " \-r ", " \-\^\-recursive
Read all files under each directory, recursively;
this is equivalent to the
.B "\-d recurse"
option.
.TP
.BR "\fR \fP \-\^\-include=" PATTERN
Recurse in directories, searching only files matching
.I PATTERN.
.TP
.BR "\fR \fP \-\^\-exclude=" PATTERN
Recurse in directories, skipping files matching
.I PATTERN.
.TP
.BR \-s ", " \-\^\-no-messages
Suppress error messages about nonexistent or unreadable files.
Portability note: unlike \s-1GNU\s0
@ -378,11 +465,13 @@ a single character. Most characters, including all letters and digits,
are regular expressions that match themselves. Any metacharacter with
special meaning may be quoted by preceding it with a backslash.
.PP
A list of characters enclosed by
A
.I "bracket expression"
is a list of characters enclosed by
.B [
and
.B ]
matches any single
.BR ] .
It matches any single
character in that list; if the first character of the list
is the caret
.B ^
@ -391,10 +480,32 @@ then it matches any character
in the list.
For example, the regular expression
.B [0123456789]
matches any single digit. A range of characters
may be specified by giving the first and last characters, separated
by a hyphen.
Finally, certain named classes of characters are predefined.
matches any single digit.
.PP
Within a bracket expression, a
.I "range expression"
consists of two characters separated by a hyphen.
It matches any single character that sorts between the two characters,
inclusive, using the locale's collating sequence and character set.
For example, in the default C locale,
.B [a\-d]
is equivalent to
.BR [abcd] .
Many locales sort characters in dictionary order, and in these locales
.B [a\-d]
is typically not equivalent to
.BR [abcd] ;
it might be equivalent to
.BR [aBbCcDd] ,
for example.
To obtain the traditional interpretation of bracket expressions,
you can use the C locale by setting the
.B LC_ALL
environment variable to the value
.BR C .
.PP
Finally, certain named classes of characters are predefined within
bracket expressions, as follows.
Their names are self explanatory, and they are
.BR [:alnum:] ,
.BR [:alpha:] ,
@ -411,8 +522,8 @@ and
For example,
.B [[:alnum:]]
means
.BR [0-9A-Za-z] ,
except the latter form depends upon the \s-1POSIX\s0 locale and the
.BR [0\-9A\-Za\-z] ,
except the latter form depends upon the C locale and the
\s-1ASCII\s0 character encoding, whereas the former is independent
of locale and character set.
(Note that the brackets in these class names are part of the symbolic
@ -559,6 +670,29 @@ instead of reporting a syntax error in the regular expression.
\s-1POSIX.2\s0 allows this behavior as an extension, but portable scripts
should avoid it.
.SH "ENVIRONMENT VARIABLES"
Grep's behavior is affected by the following environment variables.
.PP
A locale
.BI LC_ foo
is specified by examining the three environment variables
.BR LC_ALL ,
.BR LC_\fIfoo\fP ,
.BR LANG ,
in that order.
The first of these variables that is set specifies the locale.
For example, if
.B LC_ALL
is not set, but
.B LC_MESSAGES
is set to
.BR pt_BR ,
then Brazilian Portuguese is used for the
.B LC_MESSAGES
locale.
The C locale is used if none of these environment variables are set,
or if the locale catalog is not installed, or if
.B grep
was not compiled with national language support (\s-1NLS\s0).
.TP
.B GREP_OPTIONS
This variable specifies default options to be placed in front of any
@ -576,28 +710,29 @@ Option specifications are separated by whitespace.
A backslash escapes the next character,
so it can be used to specify an option containing whitespace or a backslash.
.TP
\fBLC_ALL\fP, \fBLC_MESSAGES\fP, \fBLANG\fP
.B GREP_COLOR
Specifies the marker for highlighting.
.TP
\fBLC_ALL\fP, \fBLC_COLLATE\fP, \fBLANG\fP
These variables specify the
.B LC_MESSAGES
locale, which determines the language that
.B grep
uses for messages.
The locale is determined by the first of these variables that is set.
American English is used if none of these environment variables are set,
or if the message catalog is not installed, or if
.B grep
was not compiled with national language support (\s-1NLS\s0).
.B LC_COLLATE
locale, which determines the collating sequence used to interpret
range expressions like
.BR [a\-z] .
.TP
\fBLC_ALL\fP, \fBLC_CTYPE\fP, \fBLANG\fP
These variables specify the
.B LC_CTYPE
locale, which determines the type of characters, e.g., which
characters are whitespace.
The locale is determined by the first of these variables that is set.
The \s-1POSIX\s0 locale is used if none of these environment variables
are set, or if the locale catalog is not installed, or if
.TP
\fBLC_ALL\fP, \fBLC_MESSAGES\fP, \fBLANG\fP
These variables specify the
.B LC_MESSAGES
locale, which determines the language that
.B grep
was not compiled with national language support (\s-1NLS\s0).
uses for messages.
The default C locale uses American English messages.
.TP
.B POSIXLY_CORRECT
If set,
@ -612,13 +747,15 @@ Also, \s-1POSIX.2\s0 requires that unrecognized options be diagnosed as
\*(lqillegal\*(rq, but since they are not really against the law the default
is to diagnose them as \*(lqinvalid\*(rq.
.SH DIAGNOSTICS
Normally, exit status is 0 if matches were found,
and 1 if no matches were found. (The
.B \-v
option inverts the sense of the exit status.)
Exit status is 2 if there were syntax errors
in the pattern, inaccessible input files, or
other system errors.
.PP
Normally, exit status is 0 if selected lines are found and 1 otherwise.
But the exit status is 2 if an error occurred, unless the
.B \-q
or
.B \-\^\-quiet
or
.B \-\^\-silent
option is used and a selected line is found.
.SH BUGS
Email bug reports to
.BR bug-gnu-utils@gnu.org .
@ -626,7 +763,7 @@ Be sure to include the word \*(lqgrep\*(rq somewhere in the
\*(lqSubject:\*(rq field.
.PP
Large repetition counts in the
.BI { m , n }
.BI { n , m }
construct may cause grep to use lots of memory.
In addition,
certain other obscure regular expressions require exponential time

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
/* grep.h - interface to grep driver for searching subroutines.
Copyright (C) 1992, 1998 Free Software Foundation, Inc.
Copyright (C) 1992, 1998, 2001 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -22,20 +22,16 @@
# define __attribute__(x)
#endif
extern void fatal PARAMS ((const char *, int)) __attribute__((noreturn));
extern char *xmalloc PARAMS ((size_t size));
extern char *xrealloc PARAMS ((char *ptr, size_t size));
/* Grep.c expects the matchers vector to be terminated
by an entry with a NULL name, and to contain at least
by an entry with a NULL compile, and to contain at least
an entry named "default". */
extern struct matcher
{
char *name;
void (*compile) PARAMS ((char *, size_t));
char *(*execute) PARAMS ((char *, size_t, char **));
} matchers[];
char name[8];
void (*compile) PARAMS ((char const *, size_t));
size_t (*execute) PARAMS ((char const *, size_t, size_t *, int));
} const matchers[];
/* Exported from fgrepmat.c, egrepmat.c, grepmat.c. */
extern char const *matcher;

View File

@ -83,22 +83,13 @@ struct kwset
struct trie *next[NCHAR]; /* Table of children of the root. */
char *target; /* Target string if there's only one. */
int mind2; /* Used in Boyer-Moore search for one string. */
char *trans; /* Character translation table. */
char const *trans; /* Character translation table. */
};
/* prototypes */
static void enqueue PARAMS((struct tree *, struct trie **));
static void treefails PARAMS((register struct tree *, struct trie *, struct trie *));
static void treedelta PARAMS((register struct tree *,register unsigned int, unsigned char *));
static int hasevery PARAMS((register struct tree *, register struct tree *));
static void treenext PARAMS((struct tree *, struct trie **));
static char * bmexec PARAMS((kwset_t, char *, size_t));
static char * cwexec PARAMS((kwset_t, char *, size_t, struct kwsmatch *));
/* Allocate and initialize a keyword set object, returning an opaque
pointer to it. Return NULL if memory is not available. */
kwset_t
kwsalloc (char *trans)
kwsalloc (char const *trans)
{
struct kwset *kwset;
@ -133,7 +124,7 @@ kwsalloc (char *trans)
/* Add the given string to the contents of the keyword set. Return NULL
for success, an error message otherwise. */
char *
kwsincr (kwset_t kws, char *text, size_t len)
kwsincr (kwset_t kws, char const *text, size_t len)
{
struct kwset *kwset;
register struct trie *trie;
@ -303,7 +294,8 @@ enqueue (struct tree *tree, struct trie **last)
from the given tree, given the failure function for their parent as
well as a last resort failure node. */
static void
treefails (register struct tree *tree, struct trie *fail, struct trie *recourse)
treefails (register struct tree const *tree, struct trie const *fail,
struct trie *recourse)
{
register struct tree *link;
@ -337,7 +329,7 @@ treefails (register struct tree *tree, struct trie *fail, struct trie *recourse)
/* Set delta entries for the links of the given tree such that
the preexisting delta value is larger than the current depth. */
static void
treedelta (register struct tree *tree,
treedelta (register struct tree const *tree,
register unsigned int depth,
unsigned char delta[])
{
@ -351,7 +343,7 @@ treedelta (register struct tree *tree,
/* Return true if A has every label in B. */
static int
hasevery (register struct tree *a, register struct tree *b)
hasevery (register struct tree const *a, register struct tree const *b)
{
if (!b)
return 1;
@ -370,7 +362,7 @@ hasevery (register struct tree *a, register struct tree *b)
/* Compute a vector, indexed by character code, of the trie nodes
referenced from the given tree. */
static void
treenext (struct tree *tree, struct trie *next[])
treenext (struct tree const *tree, struct trie *next[])
{
if (!tree)
return;
@ -387,7 +379,7 @@ kwsprep (kwset_t kws)
register struct kwset *kwset;
register int i;
register struct trie *curr, *fail;
register char *trans;
register char const *trans;
unsigned char delta[NCHAR];
struct trie *last, *next[NCHAR];
@ -499,23 +491,26 @@ kwsprep (kwset_t kws)
#define U(C) ((unsigned char) (C))
/* Fast boyer-moore search. */
static char *
bmexec (kwset_t kws, char *text, size_t size)
static size_t
bmexec (kwset_t kws, char const *text, size_t size)
{
struct kwset *kwset;
register unsigned char *d1;
register char *ep, *sp, *tp;
struct kwset const *kwset;
register unsigned char const *d1;
register char const *ep, *sp, *tp;
register int d, gc, i, len, md2;
kwset = (struct kwset *) kws;
kwset = (struct kwset const *) kws;
len = kwset->mind;
if (len == 0)
return text;
if (len > size)
return 0;
if (len > size)
return -1;
if (len == 1)
return memchr(text, kwset->target[0], size);
{
tp = memchr (text, kwset->target[0], size);
return tp ? tp - text : -1;
}
d1 = kwset->delta;
sp = kwset->target + len;
@ -554,7 +549,7 @@ bmexec (kwset_t kws, char *text, size_t size)
for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i)
;
if (i > len)
return tp - len;
return tp - len - text;
}
tp += md2;
}
@ -573,26 +568,29 @@ bmexec (kwset_t kws, char *text, size_t size)
for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i)
;
if (i > len)
return tp - len;
return tp - len - text;
}
d = md2;
}
return 0;
return -1;
}
/* Hairy multiple string search. */
static char *
cwexec (kwset_t kws, char *text, size_t len, struct kwsmatch *kwsmatch)
static size_t
cwexec (kwset_t kws, char const *text, size_t len, struct kwsmatch *kwsmatch)
{
struct kwset *kwset;
struct trie **next, *trie, *accept;
char *beg, *lim, *mch, *lmch;
register unsigned char c, *delta;
struct kwset const *kwset;
struct trie * const *next;
struct trie const *trie;
struct trie const *accept;
char const *beg, *lim, *mch, *lmch;
register unsigned char c;
register unsigned char const *delta;
register int d;
register char *end, *qlim;
register struct tree *tree;
register char *trans;
register char const *end, *qlim;
register struct tree const *tree;
register char const *trans;
#ifdef lint
accept = NULL;
@ -601,7 +599,7 @@ cwexec (kwset_t kws, char *text, size_t len, struct kwsmatch *kwsmatch)
/* Initialize register copies and look for easy ways out. */
kwset = (struct kwset *) kws;
if (len < kwset->mind)
return 0;
return -1;
next = kwset->next;
delta = kwset->delta;
trans = kwset->trans;
@ -670,7 +668,7 @@ cwexec (kwset_t kws, char *text, size_t len, struct kwsmatch *kwsmatch)
if (mch)
goto match;
}
return 0;
return -1;
match:
/* Given a known match, find the longest possible match anchored
@ -730,10 +728,10 @@ cwexec (kwset_t kws, char *text, size_t len, struct kwsmatch *kwsmatch)
if (kwsmatch)
{
kwsmatch->index = accept->accepting / 2;
kwsmatch->beg[0] = mch;
kwsmatch->offset[0] = mch - text;
kwsmatch->size[0] = accept->depth;
}
return mch;
return mch - text;
}
/* Search through the given text for a match of any member of the
@ -743,20 +741,18 @@ cwexec (kwset_t kws, char *text, size_t len, struct kwsmatch *kwsmatch)
matching substring. Similarly, if FOUNDIDX is non-NULL, store
in the referenced location the index number of the particular
keyword matched. */
char *
kwsexec (kwset_t kws, char *text, size_t size, struct kwsmatch *kwsmatch)
size_t
kwsexec (kwset_t kws, char const *text, size_t size,
struct kwsmatch *kwsmatch)
{
struct kwset *kwset;
char *ret;
kwset = (struct kwset *) kws;
struct kwset const *kwset = (struct kwset *) kws;
if (kwset->words == 1 && kwset->trans == 0)
{
ret = bmexec(kws, text, size);
if (kwsmatch != 0 && ret != 0)
size_t ret = bmexec (kws, text, size);
if (kwsmatch != 0 && ret != (size_t) -1)
{
kwsmatch->index = 0;
kwsmatch->beg[0] = ret;
kwsmatch->offset[0] = ret;
kwsmatch->size[0] = kwset->mind;
}
return ret;

View File

@ -25,7 +25,7 @@
struct kwsmatch
{
int index; /* Index number of matching keyword. */
char *beg[1]; /* Begin pointer for each submatch. */
size_t offset[1]; /* Offset of each submatch. */
size_t size[1]; /* Length of each submatch. */
};
@ -35,12 +35,12 @@ typedef ptr_t kwset_t;
if enough memory cannot be obtained. The argument if non-NULL
specifies a table of character translations to be applied to all
pattern and search text. */
extern kwset_t kwsalloc PARAMS((char *));
extern kwset_t kwsalloc PARAMS((char const *));
/* Incrementally extend the keyword set to include the given string.
Return NULL for success, or an error message. Remember an index
number for each keyword included in the set. */
extern char *kwsincr PARAMS((kwset_t, char *, size_t));
extern char *kwsincr PARAMS((kwset_t, char const *, size_t));
/* When the keyword set has been completely built, prepare it for
use. Return NULL for success, or an error message. */
@ -52,7 +52,7 @@ extern char *kwsprep PARAMS((kwset_t));
the matching substring in the integer it points to. Similarly,
if foundindex is non-NULL, store the index of the particular
keyword found therein. */
extern char *kwsexec PARAMS((kwset_t, char *, size_t, struct kwsmatch *));
extern size_t kwsexec PARAMS((kwset_t, char const *, size_t, struct kwsmatch *));
/* Deallocate the given keyword set and all its associated storage. */
extern void kwsfree PARAMS((kwset_t));

View File

@ -24,54 +24,71 @@
# include <config.h>
#endif
#include <sys/types.h>
#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC
/* We can handle multibyte string. */
# define MBS_SUPPORT
# include <wchar.h>
# include <wctype.h>
#endif
#include "system.h"
#include "grep.h"
#include "regex.h"
#include "dfa.h"
#include "kwset.h"
#include "error.h"
#include "xalloc.h"
#ifdef HAVE_LIBPCRE
# include <pcre.h>
#endif
#define NCHAR (UCHAR_MAX + 1)
static void Gcompile PARAMS((char *, size_t));
static void Ecompile PARAMS((char *, size_t));
static char *EGexecute PARAMS((char *, size_t, char **));
static void Fcompile PARAMS((char *, size_t));
static char *Fexecute PARAMS((char *, size_t, char **));
static void kwsinit PARAMS((void));
/* Here is the matchers vector for the main program. */
struct matcher matchers[] = {
{ "default", Gcompile, EGexecute },
{ "grep", Gcompile, EGexecute },
{ "egrep", Ecompile, EGexecute },
{ "awk", Ecompile, EGexecute },
{ "fgrep", Fcompile, Fexecute },
{ 0, 0, 0 },
};
/* For -w, we also consider _ to be word constituent. */
#define WCHAR(C) (ISALNUM(C) || (C) == '_')
/* DFA compiled regexp. */
static struct dfa dfa;
/* Regex compiled regexp. */
static struct re_pattern_buffer regexbuf;
/* The Regex compiled patterns. */
static struct patterns
{
/* Regex compiled regexp. */
struct re_pattern_buffer regexbuf;
struct re_registers regs; /* This is here on account of a BRAIN-DEAD
Q@#%!# library interface in regex.c. */
} patterns0;
struct patterns *patterns;
size_t pcount;
/* KWset compiled pattern. For Ecompile and Gcompile, we compile
a list of strings, at least one of which is known to occur in
any string matching the regexp. */
static kwset_t kwset;
/* Last compiled fixed string known to exactly match the regexp.
If kwsexec() returns < lastexact, then we don't need to
/* Number of compiled fixed strings known to exactly match the regexp.
If kwsexec returns < kwset_exact_matches, then we don't need to
call the regexp matcher at all. */
static int lastexact;
static int kwset_exact_matches;
#if defined(MBS_SUPPORT)
static char* check_multibyte_string PARAMS ((char const *buf, size_t size));
#endif
static void kwsinit PARAMS ((void));
static void kwsmusts PARAMS ((void));
static void Gcompile PARAMS ((char const *, size_t));
static void Ecompile PARAMS ((char const *, size_t));
static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int ));
static void Fcompile PARAMS ((char const *, size_t));
static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int));
static void Pcompile PARAMS ((char const *, size_t ));
static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int));
void
dfaerror (char const *mesg)
{
fatal(mesg, 0);
error (2, 0, mesg);
}
static void
@ -82,10 +99,10 @@ kwsinit (void)
if (match_icase)
for (i = 0; i < NCHAR; ++i)
trans[i] = TOLOWER(i);
trans[i] = TOLOWER (i);
if (!(kwset = kwsalloc(match_icase ? trans : (char *) 0)))
fatal("memory exhausted", 0);
if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0)))
error (2, 0, _("memory exhausted"));
}
/* If the DFA turns out to have some set of fixed strings one of
@ -95,12 +112,12 @@ kwsinit (void)
static void
kwsmusts (void)
{
struct dfamust *dm;
char *err;
struct dfamust const *dm;
char const *err;
if (dfa.musts)
{
kwsinit();
kwsinit ();
/* First, we compile in the substrings known to be exact
matches. The kwset matcher will return the index
of the matching string that it chooses. */
@ -108,9 +125,9 @@ kwsmusts (void)
{
if (!dm->exact)
continue;
++lastexact;
if ((err = kwsincr(kwset, dm->must, strlen(dm->must))) != 0)
fatal(err, 0);
++kwset_exact_matches;
if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
error (2, 0, err);
}
/* Now, we compile the substrings that will require
the use of the regexp matcher. */
@ -118,24 +135,90 @@ kwsmusts (void)
{
if (dm->exact)
continue;
if ((err = kwsincr(kwset, dm->must, strlen(dm->must))) != 0)
fatal(err, 0);
if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0)
error (2, 0, err);
}
if ((err = kwsprep(kwset)) != 0)
fatal(err, 0);
if ((err = kwsprep (kwset)) != 0)
error (2, 0, err);
}
}
#ifdef MBS_SUPPORT
/* Allocate and return an array of SIZE bytes describing the multibyte
   structure of BUF[0 .. SIZE-1].  Element I holds the byte length of
   the character that starts at BUF[I], and is zero when BUF[I] is a
   trailing byte of a multibyte character.  An invalid or truncated
   sequence, and a NUL byte, are each treated as one single-byte
   character.

   Return NULL if the array cannot be allocated (the allocator may
   also return NULL when SIZE is zero); otherwise the caller must free
   the returned array.  */
static char*
check_multibyte_string(char const *buf, size_t size)
{
  /* calloc zero-fills, so trailing-byte positions need no explicit
     marking below.  */
  char *mb_properties = calloc (size, 1);
  mbstate_t cur_state;
  size_t i;	/* size_t, not int: SIZE may exceed INT_MAX.  */

  /* Do not write through a failed allocation.  */
  if (mb_properties == NULL)
    return NULL;

  memset (&cur_state, 0, sizeof cur_state);

  for (i = 0; i < size;)
    {
      size_t mbclen = mbrlen (buf + i, size - i, &cur_state);

      if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0)
	{
	  /* An invalid sequence, a truncated multibyte character, or
	     a NUL byte.  Treat it as a singlebyte character so the
	     scan always advances.  */
	  mbclen = 1;
	}
      mb_properties[i] = mbclen;
      i += mbclen;
    }

  return mb_properties;
}
#endif
/* Compile PATTERN (SIZE bytes) with the RE_SYNTAX_GREP syntax.

   The post-change code selects the syntax for both the regex library
   and the DFA, compiles each newline-separated piece of PATTERN into
   its own patterns[] entry, builds the DFA with dfacomp (from a
   wrapped copy of PATTERN when match_words or match_lines is set), and
   then calls kwsmusts.

   NOTE(review): this span is a rendered diff.  Changed statements
   appear in both their pre-change and post-change forms, and the
   `@ ... @' line marks elided context.  */
static void
Gcompile (char *pattern, size_t size)
Gcompile (char const *pattern, size_t size)
{
const char *err;
char const *sep;
size_t total = size;
char const *motif = pattern;
re_set_syntax(RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
dfasyntax(RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte);
if ((err = re_compile_pattern(pattern, size, &regexbuf)) != 0)
fatal(err, 0);
/* For GNU regex compiler we have to pass the patterns separately to detect
errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
GNU regex should have raise a syntax error. The same for backref, where
the backref should have been local to each pattern. */
do
{
size_t len;
sep = memchr (motif, '\n', total);
if (sep)
{
len = sep - motif;
sep++;
total -= (len + 1);
}
else
{
len = total;
total = 0;
}
/* Grow patterns[] by one slot; the new slot starts as a copy of
patterns0 before its regexbuf is compiled. */
patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
if (patterns == NULL)
error (2, errno, _("memory exhausted"));
patterns[pcount] = patterns0;
if ((err = re_compile_pattern (motif, len,
&(patterns[pcount].regexbuf))) != 0)
error (2, 0, err);
pcount++;
motif = sep;
} while (sep && total != 0);
/* In the match_words and match_lines cases, we use a different pattern
for the DFA matcher that will quickly throw out cases that won't work.
@ -144,49 +227,42 @@ Gcompile (char *pattern, size_t size)
if (match_words || match_lines)
{
/* In the whole-word case, we use the pattern:
(^|[^A-Za-z_])(userpattern)([^A-Za-z_]|$).
\(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\).
In the whole-line case, we use the pattern:
^(userpattern)$.
BUG: Using [A-Za-z_] is locale-dependent!
So will use [:alnum:] */
^\(userpattern\)$. */
char *n = malloc(size + 50);
int i = 0;
strcpy(n, "");
if (match_lines)
strcpy(n, "^\\(");
if (match_words)
strcpy(n, "\\(^\\|[^[:alnum:]_]\\)\\(");
i = strlen(n);
memcpy(n + i, pattern, size);
static char const line_beg[] = "^\\(";
static char const line_end[] = "\\)$";
static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\(";
static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)";
char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
size_t i;
strcpy (n, match_lines ? line_beg : word_beg);
i = strlen (n);
memcpy (n + i, pattern, size);
i += size;
if (match_words)
strcpy(n + i, "\\)\\([^[:alnum:]_]\\|$\\)");
if (match_lines)
strcpy(n + i, "\\)$");
i += strlen(n + i);
dfacomp(n, i, &dfa, 1);
strcpy (n + i, match_lines ? line_end : word_end);
i += strlen (n + i);
pattern = n;
size = i;
}
else
dfacomp(pattern, size, &dfa, 1);
kwsmusts();
dfacomp (pattern, size, &dfa, 1);
kwsmusts ();
}
/* Compile PATTERN (SIZE bytes) with an extended-regex syntax:
   RE_SYNTAX_AWK when the global `matcher' is "awk", otherwise the
   syntax chosen in the else branch (RE_SYNTAX_POSIX_EGREP in the
   visible dfasyntax call).

   The post-change code compiles each newline-separated piece of
   PATTERN into its own patterns[] entry, builds the DFA with dfacomp
   (from a wrapped copy of PATTERN when match_words or match_lines is
   set), and then calls kwsmusts.

   NOTE(review): this span is a rendered diff.  Changed statements
   appear in both their pre-change and post-change forms, and the
   `@ ... @' lines mark elided context.  */
static void
Ecompile (char *pattern, size_t size)
Ecompile (char const *pattern, size_t size)
{
const char *err;
const char *sep;
size_t total = size;
char const *motif = pattern;
if (strcmp(matcher, "awk") == 0)
if (strcmp (matcher, "awk") == 0)
{
re_set_syntax(RE_SYNTAX_AWK);
dfasyntax(RE_SYNTAX_AWK, match_icase, eolbyte);
re_set_syntax (RE_SYNTAX_AWK);
dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte);
}
else
{
@ -194,8 +270,38 @@ Ecompile (char *pattern, size_t size)
dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte);
}
if ((err = re_compile_pattern(pattern, size, &regexbuf)) != 0)
fatal(err, 0);
/* For GNU regex compiler we have to pass the patterns separately to detect
errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
GNU regex should have raise a syntax error. The same for backref, where
the backref should have been local to each pattern. */
do
{
size_t len;
sep = memchr (motif, '\n', total);
if (sep)
{
len = sep - motif;
sep++;
total -= (len + 1);
}
else
{
len = total;
total = 0;
}
/* Grow patterns[] by one slot; the new slot starts as a copy of
patterns0 before its regexbuf is compiled. */
patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns));
if (patterns == NULL)
error (2, errno, _("memory exhausted"));
patterns[pcount] = patterns0;
if ((err = re_compile_pattern (motif, len,
&(patterns[pcount].regexbuf))) != 0)
error (2, 0, err);
pcount++;
motif = sep;
} while (sep && total != 0);
/* In the match_words and match_lines cases, we use a different pattern
for the DFA matcher that will quickly throw out cases that won't work.
@ -204,186 +310,236 @@ Ecompile (char *pattern, size_t size)
if (match_words || match_lines)
{
/* In the whole-word case, we use the pattern:
(^|[^A-Za-z_])(userpattern)([^A-Za-z_]|$).
(^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$).
In the whole-line case, we use the pattern:
^(userpattern)$.
BUG: Using [A-Za-z_] is locale-dependent!
so will use the char class */
char *n = malloc(size + 50);
int i = 0;
strcpy(n, "");
if (match_lines)
strcpy(n, "^(");
if (match_words)
strcpy(n, "(^|[^[:alnum:]_])(");
^(userpattern)$. */
static char const line_beg[] = "^(";
static char const line_end[] = ")$";
static char const word_beg[] = "(^|[^[:alnum:]_])(";
static char const word_end[] = ")([^[:alnum:]_]|$)";
char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end);
size_t i;
strcpy (n, match_lines ? line_beg : word_beg);
i = strlen(n);
memcpy(n + i, pattern, size);
memcpy (n + i, pattern, size);
i += size;
if (match_words)
strcpy(n + i, ")([^[:alnum:]_]|$)");
if (match_lines)
strcpy(n + i, ")$");
i += strlen(n + i);
dfacomp(n, i, &dfa, 1);
strcpy (n + i, match_lines ? line_end : word_end);
i += strlen (n + i);
pattern = n;
size = i;
}
else
dfacomp(pattern, size, &dfa, 1);
kwsmusts();
dfacomp (pattern, size, &dfa, 1);
kwsmusts ();
}
/* Search BUF (SIZE bytes) using the patterns compiled by Gcompile or
   Ecompile.

   Post-change contract, as visible below: on success the function
   returns the offset of the match from BUF (beg - buf) and stores its
   length in *MATCH_SIZE; it returns (size_t) -1 when nothing matches.
   When EXACT is zero, candidates are first narrowed to a line by the
   kwset matcher (if one exists) and/or the DFA, and only lines that
   need it are handed to re_search for each patterns[] entry.  When
   EXACT is nonzero those prefilters are skipped, and the `start' and
   `len' reported by re_search are returned directly.

   The pre-change form returned a pointer to the matching line and
   stored the end of that line through ENDP.

   NOTE(review): this span is a rendered diff.  Pre-change and
   post-change lines are interleaved, so the text is not compilable as
   it stands.  */
static char *
EGexecute (char *buf, size_t size, char **endp)
static size_t
EGexecute (char const *buf, size_t size, size_t *match_size, int exact)
{
register char *buflim, *beg, *end, save;
register char const *buflim, *beg, *end;
char eol = eolbyte;
int backref, start, len;
struct kwsmatch kwsm;
static struct re_registers regs; /* This is static on account of a BRAIN-DEAD
Q@#%!# library interface in regex.c. */
size_t i;
#ifdef MBS_SUPPORT
char *mb_properties = NULL;
#endif /* MBS_SUPPORT */
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1 && kwset)
mb_properties = check_multibyte_string(buf, size);
#endif /* MBS_SUPPORT */
buflim = buf + size;
for (beg = end = buf; end < buflim; beg = end + 1)
for (beg = end = buf; end < buflim; beg = end)
{
if (kwset)
if (!exact)
{
/* Find a possible match using the KWset matcher. */
beg = kwsexec(kwset, beg, buflim - beg, &kwsm);
if (!beg)
goto failure;
/* Narrow down to the line containing the candidate, and
run it through DFA. */
end = memchr(beg, eol, buflim - beg);
if (!end)
end = buflim;
while (beg > buf && beg[-1] != eol)
--beg;
save = *end;
if (kwsm.index < lastexact)
goto success;
if (!dfaexec(&dfa, beg, end, 0, (int *) 0, &backref))
if (kwset)
{
*end = save;
continue;
/* Find a possible match using the KWset matcher. */
size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm);
if (offset == (size_t) -1)
{
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1)
free(mb_properties);
#endif
return (size_t)-1;
}
beg += offset;
/* Narrow down to the line containing the candidate, and
run it through DFA. */
/* NOTE(review): the post-change code increments the memchr result
without a NULL check (the pre-change code fell back to buflim);
presumably the caller guarantees an EOL byte before buflim --
confirm. */
end = memchr(beg, eol, buflim - beg);
end++;
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0)
continue;
#endif
while (beg > buf && beg[-1] != eol)
--beg;
if (kwsm.index < kwset_exact_matches)
goto success;
if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1)
continue;
}
else
{
/* No good fixed strings; start with DFA. */
size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref);
if (offset == (size_t) -1)
break;
/* Narrow down to the line we've found. */
beg += offset;
end = memchr (beg, eol, buflim - beg);
end++;
while (beg > buf && beg[-1] != eol)
--beg;
}
*end = save;
/* Successful, no backreferences encountered. */
if (!backref)
goto success;
}
else
{
/* No good fixed strings; start with DFA. */
save = *buflim;
beg = dfaexec(&dfa, beg, buflim, 0, (int *) 0, &backref);
*buflim = save;
if (!beg)
goto failure;
/* Narrow down to the line we've found. */
end = memchr(beg, eol, buflim - beg);
if (!end)
end = buflim;
while (beg > buf && beg[-1] != eol)
--beg;
/* Successful, no backreferences encountered! */
if (!backref)
goto success;
}
else
end = beg + size;
/* If we've made it to this point, this means DFA has seen
a probable match, and we need to run it through Regex. */
regexbuf.not_eol = 0;
if ((start = re_search(&regexbuf, beg, end - beg, 0, end - beg, &regs)) >= 0)
for (i = 0; i < pcount; i++)
{
len = regs.end[0] - start;
if ((!match_lines && !match_words)
|| (match_lines && len == end - beg))
goto success;
/* If -w, check if the match aligns with word boundaries.
We do this iteratively because:
(a) the line may contain more than one occurence of the pattern, and
(b) Several alternatives in the pattern might be valid at a given
point, and we may need to consider a shorter one to find a word
boundary. */
if (match_words)
while (start >= 0)
{
if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
&& (len == end - beg
|| !WCHAR ((unsigned char) beg[start + len])))
goto success;
if (len > 0)
patterns[i].regexbuf.not_eol = 0;
if (0 <= (start = re_search (&(patterns[i].regexbuf), beg,
end - beg - 1, 0,
end - beg - 1, &(patterns[i].regs))))
{
len = patterns[i].regs.end[0] - start;
if (exact)
{
*match_size = len;
return start;
}
if ((!match_lines && !match_words)
|| (match_lines && len == end - beg - 1))
goto success;
/* If -w, check if the match aligns with word boundaries.
We do this iteratively because:
(a) the line may contain more than one occurence of the
pattern, and
(b) Several alternatives in the pattern might be valid at a
given point, and we may need to consider a shorter one to
find a word boundary. */
if (match_words)
while (start >= 0)
{
/* Try a shorter length anchored at the same place. */
--len;
regexbuf.not_eol = 1;
len = re_match(&regexbuf, beg, start + len, start, &regs);
if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1]))
&& (len == end - beg - 1
|| !WCHAR ((unsigned char) beg[start + len])))
goto success;
if (len > 0)
{
/* Try a shorter length anchored at the same place. */
--len;
patterns[i].regexbuf.not_eol = 1;
len = re_match (&(patterns[i].regexbuf), beg,
start + len, start,
&(patterns[i].regs));
}
if (len <= 0)
{
/* Try looking further on. */
if (start == end - beg - 1)
break;
++start;
patterns[i].regexbuf.not_eol = 0;
start = re_search (&(patterns[i].regexbuf), beg,
end - beg - 1,
start, end - beg - 1 - start,
&(patterns[i].regs));
len = patterns[i].regs.end[0] - start;
}
}
if (len <= 0)
{
/* Try looking further on. */
if (start == end - beg)
break;
++start;
regexbuf.not_eol = 0;
start = re_search(&regexbuf, beg, end - beg,
start, end - beg - start, &regs);
len = regs.end[0] - start;
}
}
}
}
failure:
return 0;
}
} /* for Regex patterns. */
} /* for (beg = end ..) */
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1 && mb_properties)
free (mb_properties);
#endif /* MBS_SUPPORT */
return (size_t) -1;
success:
*endp = end < buflim ? end + 1 : end;
return beg;
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1 && mb_properties)
free (mb_properties);
#endif /* MBS_SUPPORT */
*match_size = end - beg;
return beg - buf;
}
/* Compile PATTERN (SIZE bytes) as a list of fixed strings.  Each
   newline-separated piece of PATTERN is added to the kwset with
   kwsincr, and kwsprep is then called on the set.  A non-null error
   string from either call is handed to fatal () / error (2, ...).

   NOTE(review): this span is a rendered diff.  Changed statements
   appear in both their pre-change and post-change forms.  */
static void
Fcompile (char *pattern, size_t size)
Fcompile (char const *pattern, size_t size)
{
char *beg, *lim, *err;
char const *beg, *lim, *err;
kwsinit();
kwsinit ();
beg = pattern;
do
{
/* Advance lim to the next newline or to the end of PATTERN. */
for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim)
;
if ((err = kwsincr(kwset, beg, lim - beg)) != 0)
fatal(err, 0);
if ((err = kwsincr (kwset, beg, lim - beg)) != 0)
error (2, 0, err);
/* Step over the newline separator, if there was one. */
if (lim < pattern + size)
++lim;
beg = lim;
}
while (beg < pattern + size);
if ((err = kwsprep(kwset)) != 0)
fatal(err, 0);
if ((err = kwsprep (kwset)) != 0)
error (2, 0, err);
}
/* Search BUF (SIZE bytes) for any of the fixed strings loaded into the
   kwset by Fcompile.

   Post-change contract, as visible below: on success the function
   returns the offset from BUF of the start of the matching line and
   stores that line's length in *MATCH_SIZE; it returns (size_t) -1
   when kwsexec finds nothing.  When EXACT is nonzero it instead returns
   the offset of the matched string itself, with *MATCH_SIZE set to
   kwsmatch.size[0].  When MBS_SUPPORT is defined and MB_CUR_MAX > 1, a
   map from check_multibyte_string is used to skip hits that begin
   inside a multibyte character, and is freed on every return path.

   The pre-change form returned a pointer to the matching line and
   stored the end of that line through ENDP.

   NOTE(review): this span is a rendered diff.  Pre-change and
   post-change lines are interleaved, and the `@ ... @' lines mark
   elided context (part of the match_lines and match_words handling).  */
static char *
Fexecute (char *buf, size_t size, char **endp)
static size_t
Fexecute (char const *buf, size_t size, size_t *match_size, int exact)
{
register char *beg, *try, *end;
register char const *beg, *try, *end;
register size_t len;
char eol = eolbyte;
struct kwsmatch kwsmatch;
#ifdef MBS_SUPPORT
char *mb_properties;
if (MB_CUR_MAX > 1)
mb_properties = check_multibyte_string (buf, size);
#endif /* MBS_SUPPORT */
for (beg = buf; beg <= buf + size; ++beg)
{
if (!(beg = kwsexec(kwset, beg, buf + size - beg, &kwsmatch)))
return 0;
size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch);
if (offset == (size_t) -1)
{
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1)
free(mb_properties);
#endif /* MBS_SUPPORT */
return offset;
}
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0)
continue; /* It is a part of multibyte character. */
#endif /* MBS_SUPPORT */
beg += offset;
len = kwsmatch.size[0];
if (exact)
{
*match_size = len;
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1)
free (mb_properties);
#endif /* MBS_SUPPORT */
return beg - buf;
}
if (match_lines)
{
if (beg > buf && beg[-1] != eol)
@ -393,13 +549,22 @@ Fexecute (char *buf, size_t size, char **endp)
goto success;
}
else if (match_words)
for (try = beg; len && try;)
for (try = beg; len; )
{
if (try > buf && WCHAR((unsigned char) try[-1]))
break;
if (try + len < buf + size && WCHAR((unsigned char) try[len]))
{
try = kwsexec(kwset, beg, --len, &kwsmatch);
offset = kwsexec (kwset, beg, --len, &kwsmatch);
if (offset == (size_t) -1)
{
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1)
free (mb_properties);
#endif /* MBS_SUPPORT */
return offset;
}
try = beg + offset;
len = kwsmatch.size[0];
}
else
@ -409,15 +574,153 @@ Fexecute (char *buf, size_t size, char **endp)
goto success;
}
return 0;
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1)
free (mb_properties);
#endif /* MBS_SUPPORT */
return -1;
success:
if ((end = memchr(beg + len, eol, (buf + size) - (beg + len))) != 0)
++end;
else
end = buf + size;
*endp = end;
while (beg > buf && beg[-1] != '\n')
/* NOTE(review): the post-change code increments the memchr result
without a NULL check (the pre-change code fell back to buf + size);
presumably the caller guarantees an EOL byte at the end of BUF --
confirm. */
end = memchr (beg + len, eol, (buf + size) - (beg + len));
end++;
while (buf < beg && beg[-1] != eol)
--beg;
return beg;
*match_size = end - beg;
#ifdef MBS_SUPPORT
if (MB_CUR_MAX > 1)
free (mb_properties);
#endif /* MBS_SUPPORT */
return beg - buf;
}
#if HAVE_LIBPCRE
/* Compiled internal form of a Perl regular expression.  Assigned from
   pcre_compile in Pcompile and passed to pcre_exec in Pexecute. */
static pcre *cre;
/* Additional information about the pattern.  Assigned from pcre_study
   in Pcompile and passed to pcre_exec in Pexecute. */
static pcre_extra *extra;
#endif
/* Compile PATTERN (SIZE bytes, possibly containing NUL bytes) as a
   Perl-compatible regular expression and store the result in the
   file-scope variables `cre' and `extra' for Pexecute.

   When match_lines is set the pattern is wrapped as ^(...)$; otherwise,
   when match_words is set, it is wrapped as \b(...)\b.  match_lines
   takes precedence when both are set, matching the choice made by
   Gcompile and Ecompile.

   Any failure is reported through error (2, ...).  Without
   HAVE_LIBPCRE the function only reports that -P is unsupported.  */
static void
Pcompile (char const *pattern, size_t size)
{
#if !HAVE_LIBPCRE
  error (2, 0, _("The -P option is not supported"));
#else
  int e;
  char const *ep;
  /* Worst case: every pattern byte is a NUL that expands to four
     bytes.  The extra 7 covers the longest prefix (3 bytes), the
     longest suffix (3 bytes) and the terminating NUL.  */
  char *re = xmalloc (4 * size + 7);
  int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0);
  char const *patlim = pattern + size;
  char *n = re;
  char const *p;
  char const *pnul;

  /* FIXME: Remove this restriction.  */
  if (eolbyte != '\n')
    error (2, 0, _("The -P and -z options cannot be combined"));

  /* Opening wrapper.  Use `else if' so that prefix and suffix always
     come from the same mode; previously both flags together produced
     a word-style prefix with a line-style suffix.  */
  *n = '\0';
  if (match_lines)
    strcpy (n, "^(");
  else if (match_words)
    strcpy (n, "\\b(");
  n += strlen (n);

  /* The PCRE interface doesn't allow NUL bytes in the pattern, so
     replace each NUL byte in the pattern with the four characters
     "\000", removing a preceding backslash if there are an odd
     number of backslashes before the NUL.

     FIXME: This method does not work with some multibyte character
     encodings, notably Shift-JIS, where a multibyte character can end
     in a backslash byte.  */
  for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1)
    {
      memcpy (n, p, pnul - p);
      n += pnul - p;
      /* Count the backslashes immediately before the NUL.  */
      for (p = pnul; pattern < p && p[-1] == '\\'; p--)
        continue;
      /* An odd count means the last backslash escaped the NUL itself;
         drop it, since "\000" supplies its own backslash.  */
      n -= (pnul - p) & 1;
      strcpy (n, "\\000");
      n += 4;
    }

  /* Copy whatever follows the last NUL (or the whole pattern).  */
  memcpy (n, p, patlim - p);
  n += patlim - p;

  /* Closing wrapper, mirroring the prefix selection above.  */
  *n = '\0';
  if (match_lines)
    strcpy (n, ")$");
  else if (match_words)
    strcpy (n, ")\\b");

  cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ());
  if (!cre)
    /* Pass the library's message as data, never as the format.  */
    error (2, 0, "%s", ep);

  extra = pcre_study (cre, 0, &ep);
  if (ep)
    error (2, 0, "%s", ep);

  free (re);
#endif
}
/* Search BUF (SIZE bytes) with the regular expression compiled by
   Pcompile.

   Returns the offset from BUF of the match and stores its length in
   *MATCH_SIZE, or returns (size_t) -1 when there is no match.  When
   EXACT is zero the match is widened to the whole line(s) containing
   it, up to and including the terminating eolbyte (or to the end of
   BUF when no eolbyte follows).  When EXACT is nonzero the offset and
   length of the matched text itself are reported.

   Without HAVE_LIBPCRE this function must never be reached and
   aborts.  */
static size_t
Pexecute (char const *buf, size_t size, size_t *match_size, int exact)
{
#if !HAVE_LIBPCRE
  abort ();
  return -1;
#else
  /* This array must have at least two elements; everything after that
     is just for performance improvement in pcre_exec.  */
  int sub[300];

  int e = pcre_exec (cre, extra, buf, size, 0, 0,
                     sub, sizeof sub / sizeof *sub);

  /* Only negative values are errors.  Zero means the pattern matched
     but SUB was too small to record every captured substring; sub[0]
     and sub[1] are still valid, and they are all that is used here.  */
  if (e < 0)
    {
      switch (e)
        {
        case PCRE_ERROR_NOMATCH:
          return -1;

        case PCRE_ERROR_NOMEMORY:
          /* error () with a nonzero status is expected not to return;
             the abort () below is a backstop in case it does.  */
          error (2, 0, _("Memory exhausted"));
          /* fallthrough */

        default:
          abort ();
        }
    }
  else
    {
      /* Narrow down to the line we've found.  */
      char const *beg = buf + sub[0];
      char const *end = buf + sub[1];
      char const *buflim = buf + size;
      char eol = eolbyte;

      if (!exact)
        {
          /* Extend END past the line terminator.  If the buffer does
             not end with one, stop at the end of the buffer instead of
             incrementing a null pointer.  */
          char const *eolp = memchr (end, eol, buflim - end);
          end = eolp ? eolp + 1 : buflim;

          /* Move BEG back to the start of its line.  */
          while (buf < beg && beg[-1] != eol)
            --beg;
        }

      *match_size = end - beg;
      return beg - buf;
    }
#endif
}
/* Table of the available matchers.  Each entry pairs a matcher name
   with the compile function and the execute function used for it.
   "default" and "grep" share Gcompile; "egrep" and "awk" share
   Ecompile; all four use EGexecute.
   NOTE(review): the final entry (empty name, null functions) looks
   like an end-of-table sentinel -- confirm against the code that
   walks this array. */
struct matcher const matchers[] = {
{ "default", Gcompile, EGexecute },
{ "grep", Gcompile, EGexecute },
{ "egrep", Ecompile, EGexecute },
{ "awk", Ecompile, EGexecute },
{ "fgrep", Fcompile, Fexecute },
{ "perl", Pcompile, Pexecute },
{ "", 0, 0 },
};