mirror of
https://git.FreeBSD.org/ports.git
synced 2024-11-23 00:43:28 +00:00
Update to 0.8b
This commit is contained in:
parent
678ba8bc8f
commit
b9b33f4d05
Notes:
svn2git
2021-03-31 03:12:20 +00:00
svn path=/head/; revision=79817
@ -6,16 +6,16 @@
|
||||
#
|
||||
|
||||
PORTNAME= spamprobe
|
||||
PORTVERSION= 0.7c
|
||||
PORTVERSION= 0.8b
|
||||
CATEGORIES= mail
|
||||
MASTER_SITES= ${MASTER_SITE_SOURCEFORGE}
|
||||
MASTER_SITE_SUBDIR=${PORTNAME}
|
||||
|
||||
LIB_DEPENDS= db3.3:${PORTSDIR}/databases/db3
|
||||
|
||||
MAINTAINER= mdodd@freebsd.org
|
||||
COMMENT= Spam detector using Bayesian analysis of word counts
|
||||
|
||||
LIB_DEPENDS= db4:${PORTSDIR}/databases/db4
|
||||
|
||||
MAKEFILE= ${FILESDIR}/Makefile
|
||||
MAKE_ENV+= FILESDIR="${FILESDIR}"
|
||||
|
||||
@ -25,12 +25,9 @@ post-build:
|
||||
@cd ${WRKSRC} && ${MAKE_ENV} ${MAKE} -f \
|
||||
${FILESDIR}/Makefile.export0_6 clean all
|
||||
|
||||
post-extract:
|
||||
@${RM} -rf ${WRKSRC}/thirdparty
|
||||
|
||||
post-install:
|
||||
@cd ${WRKSRC} && ${MAKE_ENV} ${MAKE} -f \
|
||||
${FILESDIR}/Makefile.export0_6 install
|
||||
@${CAT} ${FILESDIR}/post-install-notes
|
||||
@${CAT} ${PKGMESSAGE}
|
||||
|
||||
.include <bsd.port.post.mk>
|
||||
|
@ -1 +1 @@
|
||||
MD5 (spamprobe-0.7c.tar.gz) = 51e568a3bd908ca629537bb0f9acde8c
|
||||
MD5 (spamprobe-0.8b.tar.gz) = a5ddc25dd2d116f3e6f346b027ae034f
|
||||
|
@ -1,10 +1,10 @@
|
||||
# $FreeBSD: /tmp/pcvs/ports/mail/spamprobe/files/Attic/Makefile.export0_6,v 1.2 2002-10-08 23:48:39 mi Exp $
|
||||
# $FreeBSD: /tmp/pcvs/ports/mail/spamprobe/files/Attic/Makefile.export0_6,v 1.3 2003-04-29 00:18:11 mdodd Exp $
|
||||
#
|
||||
PREFIX?= /usr/local
|
||||
BINDIR= ${PREFIX}/bin
|
||||
NOMAN=
|
||||
PROG_CXX= spamprobe-export_0.6
|
||||
CXXFLAGS+= -Wall -DUSE_DBM
|
||||
CXXFLAGS+= -Wall -DUSE_DBM -DNDEBUG
|
||||
SRCS= File.cc export0_6.cc
|
||||
|
||||
.include <bsd.prog.mk>
|
||||
|
34
mail/spamprobe/files/patch-MessageFactory.cc
Normal file
34
mail/spamprobe/files/patch-MessageFactory.cc
Normal file
@ -0,0 +1,34 @@
|
||||
--- MessageFactory.cc.orig Tue Mar 11 07:38:41 2003
|
||||
+++ MessageFactory.cc Tue Mar 11 07:51:38 2003
|
||||
@@ -28,7 +28,7 @@
|
||||
// http://www.cooldevtools.com/qpl.html
|
||||
//
|
||||
|
||||
-#include <strstream>
|
||||
+#include <sstream>
|
||||
#include "Tokenizer.h"
|
||||
#include "MessageFactory.h"
|
||||
#include "RegularExpression.h"
|
||||
@@ -50,11 +50,11 @@
|
||||
MessageFactory::MessageFactory()
|
||||
: m_minWordLength(2),
|
||||
m_maxWordLength(90),
|
||||
+ m_phraser(new PhraseBuilder(2)),
|
||||
m_replaceNonAsciiChars(true),
|
||||
m_nonAsciiChar('z'),
|
||||
m_removeHTML(true),
|
||||
- m_headersToInclude(NORMAL_HEADERS),
|
||||
- m_phraser(new PhraseBuilder(2))
|
||||
+ m_headersToInclude(NORMAL_HEADERS)
|
||||
{
|
||||
}
|
||||
|
||||
@@ -299,7 +299,7 @@
|
||||
text += ' ';
|
||||
} else if (entity[0] == '#') {
|
||||
int code = 0;
|
||||
- istrstream in(entity.c_str() + 1);
|
||||
+ istringstream in(entity.c_str() + 1);
|
||||
in >> code;
|
||||
text += (char)code;
|
||||
} else {
|
@ -1,76 +0,0 @@
|
||||
--- MimeMessageReader.h Thu Sep 19 12:15:38 2002
|
||||
+++ MimeMessageReader.h Wed Sep 25 09:19:55 2002
|
||||
@@ -34,4 +34,7 @@
|
||||
#include "MimeHeader.h"
|
||||
+#include <sys/types.h>
|
||||
+#include <md5.h>
|
||||
+#define MD5_DIGEST_LENGTH 16
|
||||
|
||||
-class md5_state_s;
|
||||
+typedef unsigned char md5_digest_t[MD5_DIGEST_LENGTH*2 + 1];
|
||||
|
||||
@@ -64,3 +65,3 @@
|
||||
|
||||
- const string &getMD5Digest();
|
||||
+ const md5_digest_t &getMD5Digest();
|
||||
|
||||
@@ -105,4 +106,4 @@
|
||||
vector<MimeHeader> m_headers;
|
||||
- string m_md5digest;
|
||||
- NewPtr<md5_state_s> m_md5state;
|
||||
+ md5_digest_t m_md5digest;
|
||||
+ NewPtr<MD5_CTX> m_md5state;
|
||||
};
|
||||
--- MimeMessageReader.cc Thu Sep 19 12:15:38 2002
|
||||
+++ MimeMessageReader.cc Wed Sep 25 22:56:17 2002
|
||||
@@ -30,4 +30,5 @@
|
||||
|
||||
-#include <cstdio>
|
||||
-#include "md5.h"
|
||||
+#include <sys/types.h>
|
||||
+#include <md5.h>
|
||||
+#define MD5_DIGEST_LENGTH 16
|
||||
#include "util.h"
|
||||
@@ -93,4 +92,4 @@
|
||||
|
||||
- m_md5state.set(new md5_state_s);
|
||||
- md5_init(m_md5state.get());
|
||||
+ m_md5state.set(new MD5_CTX);
|
||||
+ MD5Init(m_md5state.get());
|
||||
|
||||
@@ -140,3 +139,3 @@
|
||||
}
|
||||
- md5_append(m_md5state.get(), (md5_byte_t *)value.data(), value.length());
|
||||
+ MD5Update(m_md5state.get(), (const unsigned char *)value.data(), value.length());
|
||||
}
|
||||
@@ -228,3 +227,3 @@
|
||||
|
||||
-const string &MimeMessageReader::getMD5Digest()
|
||||
+const md5_digest_t &MimeMessageReader::getMD5Digest()
|
||||
{
|
||||
@@ -236,11 +235,10 @@
|
||||
|
||||
- m_md5digest.erase();
|
||||
-
|
||||
- md5_byte_t raw_digest[32];
|
||||
- char hexcode[8];
|
||||
- md5_finish(m_md5state.get(), raw_digest);
|
||||
- for (int i = 0; i < 16; ++i) {
|
||||
- sprintf(hexcode, "%02x", (unsigned)raw_digest[i]);
|
||||
- m_md5digest += hexcode;
|
||||
+ MD5Final(m_md5digest + MD5_DIGEST_LENGTH + 1, m_md5state.get());
|
||||
+ for (int i = 0; i < MD5_DIGEST_LENGTH; i++) {
|
||||
+ char hexdigits[] = "0123456789abcdef";
|
||||
+ m_md5digest[i*2] = hexdigits[m_md5digest[i + MD5_DIGEST_LENGTH + 1] >> 4];
|
||||
+ m_md5digest[i*2 + 1] =
|
||||
+ hexdigits[m_md5digest[i + MD5_DIGEST_LENGTH + 1] & 0x0f];
|
||||
}
|
||||
+ m_md5digest[MD5_DIGEST_LENGTH*2 + 1] = '\0';
|
||||
m_md5state.clear();
|
||||
--- MessageFactory.cc Tue Sep 17 17:39:36 2002
|
||||
+++ MessageFactory.cc Tue Oct 8 18:59:07 2002
|
||||
@@ -127,3 +127,3 @@
|
||||
|
||||
- msg.setDigest(reader.getMD5Digest());
|
||||
+ msg.setDigest((char *)reader.getMD5Digest());
|
||||
|
@ -1,321 +0,0 @@
|
||||
.\"
|
||||
.\" $Id$
|
||||
.\"
|
||||
.\" Note: The date here should be updated whenever a non-trivial
|
||||
.\" change is made to the manual page.
|
||||
.Dd September 5, 2002
|
||||
.Dt SPAMPROBE 1
|
||||
.Os
|
||||
.Sh NAME
|
||||
.Nm spamprobe
|
||||
.Nd "Spam detector using Bayesian analysis of word counts."
|
||||
.Sh SYNOPSIS
|
||||
.Nm
|
||||
.Op Fl a Ar char
|
||||
.Op Fl c
|
||||
.Op Fl d Ar directory
|
||||
.Op Fl h
|
||||
.Op Fl H Ar option
|
||||
.Op Fl m
|
||||
.Op Fl n Ar number
|
||||
.Op Fl r Ar number
|
||||
.Op Fl s Ar number
|
||||
.Op Fl v
|
||||
.Op Fl V
|
||||
.Op Fl Y
|
||||
.Op Fl 7
|
||||
.Op Fl 8
|
||||
.Ar command Op ...
|
||||
.Nm
|
||||
.Ar receive Op filename ...
|
||||
.Nm
|
||||
.Ar score Op filename ...
|
||||
.Nm
|
||||
.Ar find-spam Op filename ...
|
||||
.Nm
|
||||
.Ar find-good Op filename ...
|
||||
.Nm
|
||||
.Ar good Op filename ...
|
||||
.Nm
|
||||
.Ar spam Op filename ...
|
||||
.Nm
|
||||
.Ar remove Op filename ...
|
||||
.Nm
|
||||
.Ar dump
|
||||
.Nm
|
||||
.Ar export
|
||||
.Nm
|
||||
.Ar import Op filename ...
|
||||
.Sh DESCRIPTION
|
||||
Welcome to
|
||||
.Nm SpamProbe !
|
||||
Are you tired of the constant bombardment of your inbox by unwanted
|
||||
email pushing everything from porn to get rich quick schemes? Have you
|
||||
tried other spam filters but become disenchanted with them when you
|
||||
realized that their manually generated rule sets weren't updated fast
|
||||
enough to keep up with spammers' wording changes? Or that they generated
|
||||
unwanted false positive scores?
|
||||
.Pp
|
||||
.Nm SpamProbe
|
||||
operates on a different basis entirely. Instead of using pattern matching
|
||||
and a set of human generated rules
|
||||
.Nm SpamProbe
|
||||
relies on a Bayesian analysis
|
||||
of the frequency of words used in spam and non-spam emails received by an
|
||||
individual person. The process is completely automatic and tailors itself
|
||||
to the kinds of emails that each person receives.
|
||||
.Ss FEATURES
|
||||
.Bl -bullet -offset indent -compact
|
||||
.It
|
||||
Spam detection using Bayesian analysis of terms contained in each email.
|
||||
Words used often in spams but not in good email tend to indicate that a
|
||||
message is spam.
|
||||
.It
|
||||
Written in C++ for good performance. Database access using GDBM for quick
|
||||
startup and fast term count retrieval.
|
||||
.It
|
||||
Recognition and decoding of MIME attachments in quoted-printable and
|
||||
base64 encoding. Automatically skips non-text attachments.
|
||||
.It
|
||||
Counts two word phrases as well as single words for higher precision.
|
||||
.It
|
||||
Ignores HTML tags in emails for scoring purposes unless the -h command
|
||||
line option is used. Many spams use HTML and few humans do so HTML tends
|
||||
to become a powerful recognizer of spams. However in the author's opinion
|
||||
this also substantially increases the likelihood of false positives if
|
||||
someone does send a non-spam email containing HTML tags.
|
||||
.Nm SpamProbe
|
||||
does pull urls from inside of html tags however since those tend to be
|
||||
spammer specific.
|
||||
.It
|
||||
Locks mboxes and databases using fcntl file locking to avoid problems when
|
||||
multiple emails arrive simultaneously.
|
||||
.It
|
||||
Scores only the Received, Subject, To, From, and Cc headers. All other
|
||||
headers are ignored to make it hard for spammers to hide non-spammy words
|
||||
in X- headers to fool the filter. The
|
||||
.Fl H
|
||||
command line option can be used to override this.
|
||||
.El
|
||||
.Ss OPTIONS
|
||||
.Bl -tag -width ".Fl d Ar directory"
|
||||
.It Fl a Ar char
|
||||
By default
|
||||
.Nm
|
||||
converts non-ascii characters (characters with the most significant bit
|
||||
set to 1) into the letter 'z'. This is useful for lumping all Asian
|
||||
characters into a single word for easy recognition. The
|
||||
.Fl a
|
||||
option allows you to change the character to something else if you don't
|
||||
like the letter 'z' for some reason.
|
||||
.It Fl c
|
||||
Create the database directory if it does not already exist. Normally
|
||||
.Nm
|
||||
exits with a usage error if the database directory does not already exist.
|
||||
.It Fl d Ar directory
|
||||
By default
|
||||
.Nm
|
||||
stores its database in a directory named .spamprobe under your home
|
||||
directory. The
|
||||
.Fl d
|
||||
option allows you to specify a different directory to use. This is
|
||||
necessary if your home directory is NFS mounted for example.
|
||||
.It Fl h
|
||||
By default
|
||||
.Nm
|
||||
removes HTML markup from the text in emails to help avoid false positives.
|
||||
The
|
||||
.Fl h
|
||||
option allows you to override this behavior and force
|
||||
.Nm
|
||||
to include words from within HTML tags in its word counts. Note that
|
||||
.Nm
|
||||
always counts any URLs in hrefs within tags whether
|
||||
.Fl h
|
||||
is used or not. Use of this option is discouraged. It can increase the
|
||||
rate of spam detection slightly but unless the user receives a significant
|
||||
amount of HTML emails it also tends to increase the number of false
|
||||
positives.
|
||||
.It Fl H Ar option
|
||||
By default
|
||||
.Nm
|
||||
only scans a meaningful subset of headers from the email message when
|
||||
searching for words to score. The
|
||||
.Fl H
|
||||
option allows the user to specify additional headers to scan. Legal values
|
||||
are "all", "nox", or "normal". "all" scans all headers, "nox" scans all
|
||||
headers except those starting with X-, and "normal" scans the normal set
|
||||
of headers.
|
||||
.It Fl m
|
||||
Use mbox format for reading emails in receive mode. Normally
|
||||
.Nm
|
||||
assumes that the input to receive mode contains a single message so it
|
||||
doesn't look for message breaks.
|
||||
.It Fl n Ar number
|
||||
Changes the number of most significant words/phrases used by
|
||||
.Nm
|
||||
to calculate the score for each message. Generally this is changed only
|
||||
for optimization purposes.
|
||||
.It Fl r Ar number
|
||||
Changes the number of times that a single word/phrase can occur in the
|
||||
top words array used to calculate the score for each message. Allowing
|
||||
repeats reduces the number of words overall (since a single word occupies
|
||||
more than one slot) but allows words which occur frequently in the message
|
||||
to have a higher weight. Generally this is changed only for optimization
|
||||
purposes.
|
||||
.It Fl s Ar number
|
||||
.Nm
|
||||
maintains an in memory cache of the words it has seen in previous messages
|
||||
to reduce disk i/o and improve performance. By default the cache is
|
||||
flushed and cleared every 250 messages. This number can be changed using
|
||||
the
|
||||
.Fl s
|
||||
option. A value of zero causes
|
||||
.Nm
|
||||
to use 100,000 as the limit which effectively means that the cache will
|
||||
only be flushed at program exit (unless you have really enormous mailbox
|
||||
files). The cache doesn't affect receive, dump, or export but has a
|
||||
significant impact on the others.
|
||||
.It Fl v
|
||||
Write debugging information to stderr. This can be useful for debugging
|
||||
or for seeing which terms
|
||||
.Nm
|
||||
used to score each email.
|
||||
.It Fl V
|
||||
Prints version and copyright information and then exits.
|
||||
.It Fl Y
|
||||
Assume traditional Berkeley mailbox format, ignoring any Content-Length:
|
||||
fields.
|
||||
.It Fl 7
|
||||
Ignore any characters with the most significant bit set to 1 instead of
|
||||
mapping them to the letter 'z'.
|
||||
.It Fl 8
|
||||
Store all characters even if their most significant bit is set to 1.
|
||||
.El
|
||||
.Pp
|
||||
.Ss COMMANDS
|
||||
.Bl -tag -width ".Ar find-spam Op filename ..."
|
||||
.It Ar receive Op filename ...
|
||||
Tells
|
||||
.Nm
|
||||
to read its standard input (or a file specified after the receive command)
|
||||
and score it using the current databases. Once the message has been
|
||||
scored the message is classified as either spam or non-spam and its word
|
||||
counts are written to the appropriate database. The message's score is
|
||||
written to stdout along with a single word. For example:
|
||||
.Pp
|
||||
.Dl "SPAM 0.99"
|
||||
.Pp
|
||||
or
|
||||
.Pp
|
||||
.Dl "GOOD 0.02"
|
||||
.It Ar score Op filename ...
|
||||
Similar to receive except that the databases are not modified in any way
|
||||
and only the score is printed to stdout.
|
||||
.It Ar find-spam Op filename ...
|
||||
Similar to score except that it prints a short summary and score for each
|
||||
message that is determined to be spam. This can be useful when testing.
|
||||
.It Ar find-good Op filename ...
|
||||
Similar to score except that it prints a short summary and score for each
|
||||
message that is determined to be good. This can be useful when testing.
|
||||
.It Ar good Op filename ...
|
||||
Scans each file (or stdin if no file is specified) and reclassifies every
|
||||
email in the file as non-spam. The databases are updated appropriately.
|
||||
Previously processed messages (recognized using their message ids) are
|
||||
ignored.
|
||||
.It Ar spam Op filename ...
|
||||
Scans each file (or stdin if no file is specified) and reclassifies every
|
||||
email in the file as spam. The databases are updated appropriately.
|
||||
Previously processed messages (recognized using their message ids) are
|
||||
ignored.
|
||||
.It Ar remove Op filename ...
|
||||
Scans each file (or stdin if no file is specified) and removes its term
|
||||
counts from the database. Messages which are not in the database
|
||||
(recognized using their message ids) are ignored.
|
||||
.It Ar dump
|
||||
Prints the contents of the word counts database one word per line in human
|
||||
readable format with good count, spam count, and word in columns separated
|
||||
by whitespace. Note that when using GDBM for the database the words are
|
||||
printed in the order they are hashed so the results will need to be sorted
|
||||
to be most useful. The standard unix sort command can do this. For
|
||||
example to list all words from "most good" to "least good" use this
|
||||
command:
|
||||
.Pp
|
||||
.Dl "spamprobe dump | sort -k 1 -n -r"
|
||||
.Pp
|
||||
To list all words from "most spammy" to "least spammy" use this command:
|
||||
.Pp
|
||||
.Dl "spamprobe dump | sort -k 2 -n -r"
|
||||
.It Ar export
|
||||
Similar to the dump command but prints the counts and words in a comma
|
||||
separated format with the words surrounded by double quotes. This can be
|
||||
more useful for importing into some databases.
|
||||
.It Ar import Op filename ...
|
||||
Reads the specified files which must contain export data written by the
|
||||
export command. The terms and counts from this file are added to the
|
||||
database. This can be used to convert a database from a prior version.
|
||||
.El
|
||||
.Sh ENVIRONMENT
|
||||
The
|
||||
.Nm
|
||||
command looks for the database directory in the user's home directory
|
||||
specified by the
|
||||
.Ev HOME
|
||||
environment variable. Use the
|
||||
.Fl d
|
||||
flag to specify a different database directory.
|
||||
.Sh FILES
|
||||
.Bl -tag -width ".Pa $HOME/. Ns Nm" -compact
|
||||
.It Pa $HOME/. Ns Nm
|
||||
The default database directory.
|
||||
.El
|
||||
.Sh EXAMPLES
|
||||
Typically one would use
|
||||
.Nm
|
||||
with
|
||||
.Nm procmail
|
||||
and
|
||||
.Nm formail
|
||||
to flag and filter incoming email.
|
||||
.Pp
|
||||
.Dl "# SpamProbe rule."
|
||||
.Dl ":0"
|
||||
.Dl "{"
|
||||
.Dl " # Generate a score for the message."
|
||||
.Dl " SCORE=`spamprobe receive`"
|
||||
.Dl " # Add a X-SpamProbe header to the message."
|
||||
.Dl " :0 fhW"
|
||||
.Dl " | formail -I ""X-SpamProbe: $SCORE"""
|
||||
.Dl "}"
|
||||
.Pp
|
||||
.Dl "# Filter matching messages to their own mailbox."
|
||||
.Dl ":0:"
|
||||
.Dl "*^X-SpamProbe: SPAM"
|
||||
.Dl "spamprobe"
|
||||
.Sh DIAGNOSTICS
|
||||
Exit status is 0 on success, and 1 if
|
||||
.Nm
|
||||
encounters an invalid command.
|
||||
.Sh COMPATIBILITY
|
||||
Versions of
|
||||
.Nm
|
||||
previous to 0.7 use a different database format. To convert your existing
|
||||
database to the new format use the following command.
|
||||
.Pp
|
||||
.Dl "spamprobe-export_0.6 | spamprobe import"
|
||||
.Sh SEE ALSO
|
||||
.Xr formail 1 ,
|
||||
.Xr procmail 1 ,
|
||||
.Rs
|
||||
.%A "Paul Graham"
|
||||
.%T "A Plan for Spam"
|
||||
.%O http://www.paulgraham.com/spam.html
|
||||
.%D "August 2002"
|
||||
.Re
|
||||
.Sh AUTHORS
|
||||
This
|
||||
manual page was written by
|
||||
.An Matthew N. Dodd Aq mdodd@FreeBSD.org .
|
||||
.Nm
|
||||
was written by
|
||||
.An Brian Burton Aq bburton@users.sourceforge.net
|
Loading…
Reference in New Issue
Block a user