Update to 0.8b

svn path=/head/; revision=79817
2024-11-23 00:43:28 +00:00 · 2003-04-29 00:18:11 +00:00 · 2003-04-29 00:18:11 +00:00 · b9b33f4d05 · 2021-03-31 03:12:20 +00:00
commit b9b33f4d05
parent 678ba8bc8f
7 changed files with 41 additions and 407 deletions
--- a/mail/spamprobe/Makefile
+++ b/mail/spamprobe/Makefile
@@ -6,16 +6,16 @@
 #

 PORTNAME=	spamprobe
-PORTVERSION=	0.7c
+PORTVERSION=	0.8b
 CATEGORIES=	mail
 MASTER_SITES=	${MASTER_SITE_SOURCEFORGE}
 MASTER_SITE_SUBDIR=${PORTNAME}

-LIB_DEPENDS=	db3.3:${PORTSDIR}/databases/db3
-
 MAINTAINER=	mdodd@freebsd.org
 COMMENT=	Spam detector using Bayesian analysis of word counts

+LIB_DEPENDS=	db4:${PORTSDIR}/databases/db4
+
 MAKEFILE=	${FILESDIR}/Makefile
 MAKE_ENV+=	FILESDIR="${FILESDIR}"

@@ -25,12 +25,9 @@ post-build:
 	@cd ${WRKSRC} && ${MAKE_ENV} ${MAKE} -f \
 		${FILESDIR}/Makefile.export0_6 clean all

-post-extract:
-	@${RM} -rf ${WRKSRC}/thirdparty
-
 post-install:
 	@cd ${WRKSRC} && ${MAKE_ENV} ${MAKE} -f \
 		${FILESDIR}/Makefile.export0_6 install
-	@${CAT} ${FILESDIR}/post-install-notes
+	@${CAT} ${PKGMESSAGE}

 .include <bsd.port.post.mk>
--- a/mail/spamprobe/distinfo
+++ b/mail/spamprobe/distinfo
@@ -1 +1 @@
-MD5 (spamprobe-0.7c.tar.gz) = 51e568a3bd908ca629537bb0f9acde8c
+MD5 (spamprobe-0.8b.tar.gz) = a5ddc25dd2d116f3e6f346b027ae034f
--- a/mail/spamprobe/files/Makefile.export0_6
+++ b/mail/spamprobe/files/Makefile.export0_6
@@ -1,10 +1,10 @@
-# $FreeBSD: /tmp/pcvs/ports/mail/spamprobe/files/Attic/Makefile.export0_6,v 1.2 2002-10-08 23:48:39 mi Exp $
+# $FreeBSD: /tmp/pcvs/ports/mail/spamprobe/files/Attic/Makefile.export0_6,v 1.3 2003-04-29 00:18:11 mdodd Exp $
 #
 PREFIX?=	/usr/local
 BINDIR=		${PREFIX}/bin
 NOMAN=
 PROG_CXX=	spamprobe-export_0.6
-CXXFLAGS+=	-Wall -DUSE_DBM
+CXXFLAGS+=	-Wall -DUSE_DBM -DNDEBUG
 SRCS=		File.cc export0_6.cc

 .include <bsd.prog.mk>
--- a/mail/spamprobe/files/patch-MessageFactory.cc
+++ b/mail/spamprobe/files/patch-MessageFactory.cc
@@ -0,0 +1,34 @@
+--- MessageFactory.cc.orig	Tue Mar 11 07:38:41 2003
++++ MessageFactory.cc	Tue Mar 11 07:51:38 2003
+@@ -28,7 +28,7 @@
+ //    http://www.cooldevtools.com/qpl.html
+ //
+ 
+-#include <strstream>
++#include <sstream>
+ #include "Tokenizer.h"
+ #include "MessageFactory.h"
+ #include "RegularExpression.h"
+@@ -50,11 +50,11 @@
+ MessageFactory::MessageFactory()
+   : m_minWordLength(2),
+     m_maxWordLength(90),
++   m_phraser(new PhraseBuilder(2)),
+     m_replaceNonAsciiChars(true),
+     m_nonAsciiChar('z'),
+     m_removeHTML(true),
+-    m_headersToInclude(NORMAL_HEADERS),
+-    m_phraser(new PhraseBuilder(2))
++    m_headersToInclude(NORMAL_HEADERS)
+ {
+ }
+ 
+@@ -299,7 +299,7 @@
+     text += ' ';
+   } else if (entity[0] == '#') {
+     int code = 0;
+-    istrstream in(entity.c_str() + 1);
++    istringstream in(entity.c_str() + 1);
+     in >> code;
+     text += (char)code;
+   } else {
--- a/mail/spamprobe/files/patch-md5
+++ b/mail/spamprobe/files/patch-md5
@@ -1,76 +0,0 @@
--- MimeMessageReader.h	Thu Sep 19 12:15:38 2002
-+++ MimeMessageReader.h	Wed Sep 25 09:19:55 2002
-@@ -34,4 +34,7 @@
- #include "MimeHeader.h"
-+#include <sys/types.h>
-+#include <md5.h>
-+#define MD5_DIGEST_LENGTH 16
- 
-class md5_state_s;
-+typedef	unsigned char md5_digest_t[MD5_DIGEST_LENGTH*2 + 1];
- 
-@@ -64,3 +65,3 @@
- 
-  const string &getMD5Digest();
-+  const md5_digest_t &getMD5Digest();
- 
-@@ -105,4 +106,4 @@
-   vector<MimeHeader> m_headers;
-  string m_md5digest;
-  NewPtr<md5_state_s> m_md5state;
-+  md5_digest_t m_md5digest;
-+  NewPtr<MD5_CTX> m_md5state;
- };
--- MimeMessageReader.cc	Thu Sep 19 12:15:38 2002
-+++ MimeMessageReader.cc	Wed Sep 25 22:56:17 2002
-@@ -30,4 +30,5 @@
- 
-#include <cstdio>
-#include "md5.h"
-+#include <sys/types.h>
-+#include <md5.h>
-+#define MD5_DIGEST_LENGTH 16
- #include "util.h"
-@@ -93,4 +92,4 @@
- 
-  m_md5state.set(new md5_state_s);
-  md5_init(m_md5state.get());
-+  m_md5state.set(new MD5_CTX);
-+  MD5Init(m_md5state.get());
- 
-@@ -140,3 +139,3 @@
-     }
-    md5_append(m_md5state.get(), (md5_byte_t *)value.data(), value.length());
-+    MD5Update(m_md5state.get(), (const unsigned char *)value.data(), value.length());
-   }
-@@ -228,3 +227,3 @@
- 
-const string &MimeMessageReader::getMD5Digest()
-+const md5_digest_t &MimeMessageReader::getMD5Digest()
- {
-@@ -236,11 +235,10 @@
- 
-    m_md5digest.erase();
-
-    md5_byte_t raw_digest[32];
-    char hexcode[8];
-    md5_finish(m_md5state.get(), raw_digest);
-    for (int i = 0; i < 16; ++i) {
-      sprintf(hexcode, "%02x", (unsigned)raw_digest[i]);
-      m_md5digest += hexcode;
-+    MD5Final(m_md5digest + MD5_DIGEST_LENGTH + 1, m_md5state.get());
-+    for (int i = 0; i < MD5_DIGEST_LENGTH; i++) {
-+	char hexdigits[] = "0123456789abcdef";
-+	m_md5digest[i*2] = hexdigits[m_md5digest[i + MD5_DIGEST_LENGTH + 1] >> 4];
-+	m_md5digest[i*2 + 1] =
-+		hexdigits[m_md5digest[i + MD5_DIGEST_LENGTH + 1] & 0x0f];
-     }
-+    m_md5digest[MD5_DIGEST_LENGTH*2 + 1] = '\0';
-     m_md5state.clear();
--- MessageFactory.cc	Tue Sep 17 17:39:36 2002
-+++ MessageFactory.cc	Tue Oct  8 18:59:07 2002
-@@ -127,3 +127,3 @@
- 
-  msg.setDigest(reader.getMD5Digest());
-+  msg.setDigest((char *)reader.getMD5Digest());
- 
--- a/mail/spamprobe/files/spamprobe.1
+++ b/mail/spamprobe/files/spamprobe.1
@@ -1,321 +0,0 @@
-.\"
-.\" $Id$
-.\"
-.\" Note: The date here should be updated whenever a non-trivial
-.\" change is made to the manual page.
-.Dd September 5, 2002
-.Dt SPAMPROBE 1
-.Os
-.Sh NAME
-.Nm spamprobe
-.Nd "Spam detector using Bayesian analysis of word counts."
-.Sh SYNOPSIS
-.Nm
-.Op Fl a Ar char
-.Op Fl c
-.Op Fl d Ar directory
-.Op Fl h
-.Op Fl H Ar option
-.Op Fl m
-.Op Fl n Ar number
-.Op Fl r Ar number
-.Op Fl s Ar number
-.Op Fl v
-.Op Fl V
-.Op Fl Y
-.Op Fl 7
-.Op Fl 8
-.Ar command Op ...
-.Nm
-.Ar receive Op filename ...
-.Nm
-.Ar score Op filename ...
-.Nm
-.Ar find-spam Op filename ...
-.Nm
-.Ar find-good Op filename ...
-.Nm
-.Ar good Op filename ...
-.Nm
-.Ar spam Op filename ...
-.Nm
-.Ar remove Op filename ...
-.Nm
-.Ar dump
-.Nm
-.Ar export
-.Nm
-.Ar import Op filename ...
-.Sh DESCRIPTION
-Welcome to
-.Nm SpamProbe ! 
-Are you tired of the constant bombardment of your inbox by unwanted
-email pushing everything from porn to get rich quick schemes?  Have you
-tried other spam filters but become disenchanted with them when you
-realized that their manually generated rule sets weren't updated fast
-enough to keep up with spammers' wording changes?  Or that they generated
-unwanted false positive scores?
-.Pp
-.Nm SpamProbe
-operates on a different basis entirely.  Instead of using pattern matching
-and a set of human generated rules
-.Nm SpamProbe
-relies on a Bayesian analysis
-of the frequency of words used in spam and non-spam emails received by an
-individual person.  The process is completely automatic and tailors itself
-to the kinds of emails that each person receives.
-.Ss FEATURES
-.Bl -bullet -offset indent -compact
-.It
-Spam detection using Bayesian analysis of terms contained in each email.  
-Words used often in spams but not in good email tend to indicate that a
-message is spam.
-.It
-Written in C++ for good performance.  Database access using GDBM for quick
-startup and fast term count retrieval.
-.It
-Recognition and decoding of MIME attachments in quoted-printable and
-base64 encoding.  Automatically skips non-text attachments.
-.It
-Counts two word phrases as well as single words for higher precision.
-.It
-Ignores HTML tags in emails for scoring purposes unless the -h command
-line option is used.  Many spams use HTML and few humans do so HTML tends
-to become a powerful recognizer of spams.  However in the author's opinion
-this also substantially increases the likelihood of false positives if
-someone does send a non-spam email containing HTML tags.
-.Nm SpamProbe
-does pull urls from inside of html tags however since those tend to be
-spammer specific.
-.It
-Locks mboxes and databases using fcntl file locking to avoid problems when
-multiple emails arrive simultaneously.
-.It
-Scores only the Received, Subject, To, From, and Cc headers.  All other
-headers are ignored to make it hard for spammers to hide non-spammy words
-in X- headers to fool the filter.  The
-.Fl H
-command line option can be used to override this. 
-.El
-.Ss OPTIONS
-.Bl -tag -width ".Fl d Ar directory"
-.It Fl a Ar char
-By default
-.Nm
-converts non-ascii characters (characters with the most significant bit
-set to 1) into the letter 'z'.  This is useful for lumping all Asian
-characters into a single word for easy recognition.  The
-.Fl a
-option allows you to change the character to something else if you don't
-like the letter 'z' for some reason.
-.It Fl c
-Create the database directory if it does not already exist.  Normally
-.Nm
-exits with a usage error if the database directory does not already exist.
-.It Fl d Ar directory
-By default
-.Nm
-stores its database in a directory named .spamprobe under your home
-directory.  The
-.Fl d
-option allows you to specify a different directory to use.  This is
-necessary if your home directory is NFS mounted for example.
-.It Fl h
-By default
-.Nm
-removes HTML markup from the text in emails to help avoid false positives.  
-The
-.Fl h
-option allows you to override this behavior and force
-.Nm
-to include words from within HTML tags in its word counts.  Note that
-.Nm
-always counts any URLs in hrefs within tags whether
-.Fl h
-is used or not.  Use of this option is discouraged.  It can increase the
-rate of spam detection slightly but unless the user receives a significant
-amount of HTML emails it also tends to increase the number of false
-positives.
-.It Fl H Ar option
-By default
-.Nm
-only scans a meaningful subset of headers from the email message when
-searching for words to score.  The
-.Fl H
-option allows the user to specify additional headers to scan. Legal values
-are "all", "nox", or "normal".  "all" scans all headers, "nox" scans all
-headers except those starting with X-, and "normal" scans the normal set
-of headers.
-.It Fl m
-Use mbox format for reading emails in receive mode.  Normally
-.Nm
-assumes that the input to receive mode contains a single message so it
-doesn't look for message breaks.
-.It Fl n Ar number
-Changes the number of most significant words/phrases used by
-.Nm
-to calculate the score for each message.  Generally this is changed only
-for optimization purposes.
-.It Fl r Ar number
-Changes the number of times that a single word/phrase can occur in the
-top words array used to calculate the score for each message.  Allowing
-repeats reduces the number of words overall (since a single word occupies
-more than one slot) but allows words which occur frequently in the message
-to have a higher weight. Generally this is changed only for optimization
-purposes.
-.It Fl s Ar number
-.Nm
-maintains an in memory cache of the words it has seen in previous messages
-to reduce disk i/o and improve performance.  By default the cache is
-flushed and cleared every 250 messages.  This number can be changed using
-the
-.Fl s
-option.  A value of zero causes
-.Nm
-to use 100,000 as the limit which effectively means that the cache will
-only be flushed at program exit (unless you have really enormous mailbox
-files).  The cache doesn't affect receive, dump, or export but has a
-significant impact on the others.
-.It Fl v
-Write debugging information to stderr.  This can be useful for debugging
-or for seeing which terms
-.Nm
-used to score each email.
-.It Fl V
-Prints version and copyright information and then exits.
-.It Fl Y
-Assume traditional Berkeley mailbox format, ignoring any Content-Length:
-fields.
-.It Fl 7
-Ignore any characters with the most significant bit set to 1 instead of
-mapping them to the letter 'z'.
-.It Fl 8
-Store all characters even if their most significant bit is set to 1.
-.El
-.Pp
-.Ss COMMANDS
-.Bl -tag -width ".Ar find-spam Op filename ..."
-.It Ar receive Op filename ...
-Tells
-.Nm
-to read its standard input (or a file specified after the receive command)
-and score it using the current databases.  Once the message has been
-scored the message is classified as either spam or non-spam and its word
-counts are written to the appropriate database.  The message's score is
-written to stdout along with a single word.  For example:
-.Pp
-.Dl "SPAM 0.99"
-.Pp
-or
-.Pp
-.Dl "GOOD 0.02"
-.It Ar score Op filename ...
-Similar to receive except that the databases are not modified in any way
-and only the score is printed to stdout.
-.It Ar find-spam Op filename ...
-Similar to score except that it prints a short summary and score for each
-message that is determined to be spam.  This can be useful when testing.
-.It Ar find-good Op filename ...
-Similar to score except that it prints a short summary and score for each
-message that is determined to be good.  This can be useful when testing.
-.It Ar good Op filename ...
-Scans each file (or stdin if no file is specified) and reclassifies every
-email in the file as non-spam.  The databases are updated appropriately.  
-Previously processed messages (recognized using their message ids) are
-ignored.
-.It Ar spam Op filename ...
-Scans each file (or stdin if no file is specified) and reclassifies every
-email in the file as spam.  The databases are updated appropriately.  
-Previously processed messages (recognized using their message ids) are
-ignored.
-.It Ar remove Op filename ...
-Scans each file (or stdin if no file is specified) and removes its term
-counts from the database.  Messages which are not in the database
-(recognized using their message ids) are ignored.
-.It Ar dump
-Prints the contents of the word counts database one word per line in human
-readable format with good count, spam count, and word in columns separated
-by whitespace.  Note that when using GDBM for the database the words are
-printed in the order they are hashed so the results will need to be sorted
-to be most useful.  The standard unix sort command can do this.  For
-example to list all words from "most good" to "least good" use this
-command:
-.Pp
-.Dl "spamprobe dump | sort -k 1 -n -r"
-.Pp
-To list all words from "most spammy" to "least spammy" use this command:
-.Pp
-.Dl "spamprobe dump | sort -k 2 -n -r"
-.It Ar export
-Similar to the dump command but prints the counts and words in a comma
-separated format with the words surrounded by double quotes. This can be
-more useful for importing into some databases.
-.It Ar import Op filename ...
-Reads the specified files which must contain export data written by the
-export command.  The terms and counts from this file are added to the
-database.  This can be used to convert a database from a prior version.
-.El
-.Sh ENVIRONMENT
-The
-.Nm
-command looks for the database directory in the user's home directory
-specified by the
-.Ev HOME
-environment variable.  Use the
-.Fl d
-flag to specify a different database directory.
-.Sh FILES
-.Bl -tag -width ".Pa $HOME/. Ns Nm" -compact
-.It Pa $HOME/. Ns Nm
-The default database directory.
-.El
-.Sh EXAMPLES
-Typically one would use
-.Nm
-with
-.Nm procmail
-and
-.Nm formail
-to flag and filter incoming email.
-.Pp
-.Dl "# SpamProbe rule."
-.Dl ":0"
-.Dl "{"
-.Dl "    # Generate a score for the message."
-.Dl "    SCORE=`spamprobe receive`"
-.Dl "    # Add a X-SpamProbe header to the message."
-.Dl "    :0 fhW"
-.Dl "    | formail -I ""X-SpamProbe: $SCORE"""
-.Dl "}"
-.Pp
-.Dl "# Filter matching messages to their own mailbox."
-.Dl ":0:"
-.Dl "*^X-SpamProbe: SPAM"
-.Dl "spamprobe"
-.Sh DIAGNOSTICS
-Exit status is 0 on success, and 1 if 
-.Nm
-encounters an invalid command.
-.Sh COMPATIBILITY
-Versions of
-.Nm
-previous to 0.7 use a different database format.  To convert your existing
-database to the new format use the following command.
-.Pp
-.Dl "spamprobe-export_0.6 | spamprobe import"
-.Sh SEE ALSO
-.Xr formail 1 ,
-.Xr procmail 1 ,
-.Rs
-.%A "Paul Graham"
-.%T "A Plan for Spam"
-.%O http://www.paulgraham.com/spam.html
-.%D "August 2002"
-.Re
-.Sh AUTHORS
-This
-manual page was written by
-.An Matthew N. Dodd Aq mdodd@FreeBSD.org .
-.Nm
-was written by
-.An Brian Burton Aq bburton@users.sourceforge.net
--- a/mail/spamprobe/files/post-install-notes
+++ b/mail/spamprobe/files/post-install-notes