Sync to HEAD
[bpt/emacs.git] / lisp / mh-e / mh-junk.el
CommitLineData
924df208
BW
1;;; mh-junk.el --- Interface to anti-spam measures
2
3;; Copyright (C) 2003 Free Software Foundation, Inc.
4
5;; Author: Satyaki Das <satyaki@theforce.stanford.edu>,
6;; Bill Wohler <wohler@newt.com>
7;; Maintainer: Bill Wohler <wohler@newt.com>
8;; Keywords: mail, spam
9
10;; This file is part of GNU Emacs.
11
12;; GNU Emacs is free software; you can redistribute it and/or modify
13;; it under the terms of the GNU General Public License as published by
14;; the Free Software Foundation; either version 2, or (at your option)
15;; any later version.
16
17;; GNU Emacs is distributed in the hope that it will be useful,
18;; but WITHOUT ANY WARRANTY; without even the implied warranty of
19;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20;; GNU General Public License for more details.
21
22;; You should have received a copy of the GNU General Public License
23;; along with GNU Emacs; see the file COPYING. If not, write to the
24;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
25;; Boston, MA 02111-1307, USA.
26
27;;; Commentary:
28
29;; Spam handling in MH-E.
30
31;;; Change Log:
32
33;;; Code:
34
35(require 'mh-e)
36
37;; Interactive functions callable from the folder buffer
38;;;###mh-autoload
(defun mh-junk-blacklist (msg-or-seq)
  "Blacklist MSG-OR-SEQ as spam.
The displayed message is used by default.
With a prefix argument, prompt for the message sequence instead.
If variable `transient-mark-mode' is non-nil and the mark is active, the
messages in the selected region are blacklisted.
In a program, MSG-OR-SEQ can be a message number, a list of message numbers, a
region in a cons cell, or a sequence.

The blacklisting function that corresponds to the value of `mh-junk-choice' is
called on each message first. Afterwards, if `mh-junk-mail-folder' is a string,
the message is refiled to that folder. If it is nil, the message is deleted.

To change the spam program being used, customize `mh-junk-program'. Directly
setting `mh-junk-choice' is not recommended.

The documentation for the following functions describes what setup is needed
for the different spam fighting programs:

  - `mh-bogofilter-blacklist'
  - `mh-spamprobe-blacklist'
  - `mh-spamassassin-blacklist'"
  (interactive (list (mh-interactive-msg-or-seq "Blacklist")))
  (let* ((entry (assoc mh-junk-choice mh-junk-function-alist))
         ;; The second element of the alist entry is the blacklist function.
         (handler (car (cdr entry)))
         (folder mh-junk-mail-folder)
         target)
    (or handler
        (error "Customize `mh-junk-program' appropriately"))
    ;; Work out the destination folder name; TARGET stays nil when the
    ;; messages are to be deleted rather than refiled.
    (if folder
        (setq target
              (if (equal folder "")
                  "+"
                (let ((lead (aref folder 0)))
                  (cond ((eq lead ?+) folder)
                        ;; A leading @ names a folder relative to the
                        ;; current one.
                        ((eq lead ?@)
                         (concat mh-current-folder "/" (substring folder 1)))
                        (t (concat "+" folder)))))))
    (mh-iterate-on-msg-or-seq msg msg-or-seq
      (funcall (symbol-function handler) msg)
      (if (null target)
          (mh-delete-a-msg nil)
        (mh-refile-a-msg nil (intern target))))
    (mh-next-msg)))
79
80;;;###mh-autoload
(defun mh-junk-whitelist (msg-or-seq)
  "Whitelist MSG-OR-SEQ that was incorrectly classified as spam.
The displayed message is used by default.
With a prefix argument, prompt for the message sequence instead.
If variable `transient-mark-mode' is non-nil and the mark is active, the
messages in the selected region are whitelisted.
In a program, MSG-OR-SEQ can be a message number, a list of message numbers, a
region in a cons cell, or a sequence.

The whitelisting function that corresponds to the value of `mh-junk-choice' is
called on each message first. The message is then refiled to `mh-inbox'.

To change the spam program being used, customize `mh-junk-program'. Directly
setting `mh-junk-choice' is not recommended."
  (interactive (list (mh-interactive-msg-or-seq "Whitelist")))
  (let* ((entry (assoc mh-junk-choice mh-junk-function-alist))
         ;; The third element of the alist entry is the whitelist function.
         (handler (car (cdr (cdr entry)))))
    (or handler
        (error "Customize `mh-junk-program' appropriately"))
    (mh-iterate-on-msg-or-seq msg msg-or-seq
      (funcall (symbol-function handler) msg)
      (mh-refile-a-msg nil (intern mh-inbox)))
    (mh-next-msg)))
103
104\f
105
106;; Bogofilter Interface
107
(defvar mh-bogofilter-executable (executable-find "bogofilter")
  "File name of the bogofilter program, or nil if it could not be found.
The value is computed with `executable-find' when this file is loaded.")
109
(defun mh-bogofilter-blacklist (msg)
  "Classify MSG as spam.
Tell bogofilter that the message is spam.

Bogofilter is a Bayesian spam filtering program. Get it from your local
distribution or from:
    http://bogofilter.sourceforge.net/

You first need to teach bogofilter. This is done by running

    bogofilter -n < good-message

on every good message, and

    bogofilter -s < spam-message

on every spam message. Most Bayesian filters need 1000 to 5000 of each to
start doing a good job.

To use bogofilter, add the following .procmailrc recipes which you can also
find in the bogofilter man page:

    # Bogofilter
    :0fw
    | bogofilter -u -e -p

    :0
    * ^X-Bogosity: Yes, tests=bogofilter
    $SPAM

Bogofilter continues to feed the messages it classifies back into its
database. Occasionally it misses, and those messages need to be reclassified.
MH-E can do this for you. Use \\[mh-junk-blacklist] to reclassify messages in
your +inbox as spam, and \\[mh-junk-whitelist] to reclassify messages in your
spambox as good messages."
  (or mh-bogofilter-executable
      (error "Couldn't find the bogofilter executable"))
  ;; The message file is bogofilter's standard input. A destination of 0
  ;; discards the output and does not wait for the program to finish.
  (call-process mh-bogofilter-executable
                (mh-msg-filename msg mh-current-folder)
                0 nil "-Ns"))
149
(defun mh-bogofilter-whitelist (msg)
  "Reinstate MSG, which was incorrectly filtered as spam.
Train bogofilter to treat the message as non-spam."
  (or mh-bogofilter-executable
      (error "Couldn't find the bogofilter executable"))
  ;; The message file is bogofilter's standard input. A destination of 0
  ;; discards the output and does not wait for the program to finish.
  (call-process mh-bogofilter-executable
                (mh-msg-filename msg mh-current-folder)
                0 nil "-Sn"))
157
158\f
159
160;; Spamprobe Interface
161
(defvar mh-spamprobe-executable (executable-find "spamprobe")
  "File name of the spamprobe program, or nil if it could not be found.
The value is computed with `executable-find' when this file is loaded.")
163
(defun mh-spamprobe-blacklist (msg)
  "Classify MSG as spam.
Tell spamprobe that the message is spam.

Spamprobe is a Bayesian spam filtering program. More info about the program can
be found at:
    http://spamprobe.sourceforge.net

Here is a procmail recipe that stores incoming spam mail in the folder +spam
and good mail in /home/user/Mail/mdrop/mbox. This recipe is provided as an
example in the spamprobe man page.

    PATH=/bin:/usr/bin:/usr/local/bin
    DEFAULT=/home/user/Mail/mdrop/mbox
    SPAM=/home/user/Mail/spam/.

    # Spamprobe filtering
    :0
    SCORE=| spamprobe receive
    :0 wf
    | formail -I \"X-SpamProbe: $SCORE\"
    :0 a:
    *^X-SpamProbe: SPAM
    $SPAM

Occasionally some good mail gets misclassified as spam. You can use
\\[mh-junk-whitelist] to reclassify that as good mail."
  (or mh-spamprobe-executable
      (error "Couldn't find the spamprobe executable"))
  ;; The message file is spamprobe's standard input. A destination of 0
  ;; discards the output and does not wait for the program to finish.
  (call-process mh-spamprobe-executable
                (mh-msg-filename msg mh-current-folder)
                0 nil "spam"))
195
(defun mh-spamprobe-whitelist (msg)
  "Reinstate MSG, which was incorrectly filtered as spam.
Train spamprobe to treat the message as non-spam."
  (or mh-spamprobe-executable
      (error "Couldn't find the spamprobe executable"))
  ;; The message file is spamprobe's standard input. A destination of 0
  ;; discards the output and does not wait for the program to finish.
  (call-process mh-spamprobe-executable
                (mh-msg-filename msg mh-current-folder)
                0 nil "good"))
203
204\f
205
206;; Spamassassin Interface
207
(defvar mh-spamassassin-executable (executable-find "spamassassin")
  "File name of the spamassassin program, or nil if it could not be found.
The value is computed with `executable-find' when this file is loaded.")

(defvar mh-sa-learn-executable (executable-find "sa-learn")
  "File name of the sa-learn program, or nil if it could not be found.
The value is computed with `executable-find' when this file is loaded.
When it is nil, the spamassassin functions in this file skip the sa-learn
step.")
210
(defun mh-spamassassin-blacklist (msg)
  "Blacklist MSG.
This is done by sending the message to Razor and by appending the sender to
~/.spamassassin/user_prefs in a blacklist_from rule. If sa-learn is available,
the message is also recategorized as spam.

Spamassassin is an excellent spam filter. For more information, see:
    http://spamassassin.org/.

I ran \"spamassassin -t\" on every mail message in my archive and ran an
analysis in Gnumeric to find that the standard deviation of good mail
scored under 5 (coincidentally, the spamassassin default for \"spam\").

Furthermore, I observed that there weren't any messages with a score of 8
or more that were interesting, so I added a couple of points to be
conservative and send any message with a score of 10 or more down the
drain. You might want to use a score of 12 or 13 to be really conservative.
I have found that this really decreases the amount of junk to review.

Messages with a score of 5-9 are set aside for later review. The major
weakness of rules-based filters is a plethora of false positives\; I catch one
or two legitimate messages in here a week, so it is worthwhile to check.

You might choose to do this analysis yourself to pick a good score for
deleting spam sight unseen, or you might pick a score out of a hat, or you
might choose to be very conservative and not delete any messages at all.

Based upon this discussion, here is what the associated ~/.procmailrc
entries look like. These rules appear before my list filters so that spam
sent to mailing lists gets pruned too.

    #
    # Spam
    #
    :0fw
    | spamc

    # Anything with a spam level of 10 or more is junked immediately.
    :0:
    * ^X-Spam-Level: ..........
    /dev/null

    :0
    * ^X-Spam-Status: Yes
    $SPAM

If you don't use \"spamc\", use \"spamassassin -P -a\".

A handful of spam does find its way into +inbox. In this case, use
\\[mh-junk-blacklist] to add a \"blacklist_from\" line to
~/.spamassassin/user_prefs, delete the message, and send the message to the
Razor, so that others might not see this spam.

Over time, you see some patterns in the blacklisted addresses and can
replace several lines with wildcards. For example, it is clear that High
Speed Media is the biggest bunch of jerks on the Net. Here are some of the
entries I have for them, and the list continues to grow.

    blacklist_from *@*-hsm-*.com
    blacklist_from *@*182*643*.com
    blacklist_from *@*antarhsm*.com
    blacklist_from *@*h*speed*
    blacklist_from *@*hsm*182*.com
    blacklist_from *@*hsm*643*.com
    blacklist_from *@*hsmridi2983cslt227.com
    blacklist_from *@*list*hsm*.com
    blacklist_from *@h*s*media*
    blacklist_from *@hsmdrct.com
    blacklist_from *@hsmridi2983csltsite.com

The function `mh-spamassassin-identify-spammers' is provided that shows the
frequency counts of the host and domain names in your blacklist_from
entries. This can be helpful when editing the blacklist_from entries.

In versions of spamassassin (2.50 and on) that support a Bayesian classifier,
\\[mh-junk-blacklist] uses the sa-learn program to recategorize the message as
spam. Neither MH-E, nor spamassassin, rebuilds the database after adding
words, so you will need to run \"sa-learn --rebuild\" periodically. This can
be done by adding the following to your crontab:

    0 * * * * sa-learn --rebuild > /dev/null 2>&1"
  (unless mh-spamassassin-executable
    (error "Couldn't find the spamassassin executable"))
  (let ((current-folder mh-current-folder)
        (msg-file (mh-msg-filename msg mh-current-folder))
        sender)
    (save-excursion
      (message "Giving this message the Razor...")
      (mh-truncate-log-buffer)
      ;; The message file is the standard input of the external programs;
      ;; their output is collected in the MH-E log buffer.
      (call-process mh-spamassassin-executable msg-file mh-log-buffer nil
                    "--report" "--remove-from-whitelist")
      (when mh-sa-learn-executable
        (message "Recategorizing this message as spam...")
        ;; Each option must be its own argument: `call-process' does not
        ;; split strings on whitespace, so "--local --no-rebuild" would
        ;; reach sa-learn as a single, unrecognized option.
        (call-process mh-sa-learn-executable msg-file mh-log-buffer nil
                      "--single" "--spam" "--local" "--no-rebuild"))
      (message "Blacklisting address...")
      (set-buffer (get-buffer-create mh-temp-buffer))
      (erase-buffer)
      ;; Ask scan for the sender's address. The format prints nothing when
      ;; the message is from the user's own mailbox.
      (call-process (expand-file-name mh-scan-prog mh-progs) nil t nil
                    (format "%s" msg) current-folder
                    "-format" "%<(mymbox{from})%|%(addr{from})%>")
      (goto-char (point-min))
      (if (search-forward-regexp "^\\(.+\\)$" nil t)
          (progn
            (setq sender (match-string 0))
            (mh-spamassassin-add-rule "blacklist_from" sender)
            (message "Blacklisting address...done"))
        (message "Blacklisting address...not done (from my address)")))))
319
(defun mh-spamassassin-whitelist (msg)
  "Whitelist MSG.
Remove the spamassassin markup from the message file and add a whitelist_from
rule for the sender to the ~/.spamassassin/user_prefs file. If sa-learn is
available, then the message is recategorized as ham.

Signal an error if the spamassassin executable was not found, or if
spamassassin produces no output when asked to remove the markup. In the latter
case the message file is left untouched. No rule is added when the sender's
address cannot be determined."
  (unless mh-spamassassin-executable
    (error "Couldn't find the spamassassin executable"))
  (let ((msg-file (mh-msg-filename msg mh-current-folder))
        (show-buffer (get-buffer mh-show-buffer))
        from)
    (save-excursion
      (set-buffer (get-buffer-create mh-temp-buffer))
      (erase-buffer)
      (message "Removing spamassassin markup from message...")
      (call-process mh-spamassassin-executable msg-file mh-temp-buffer nil
                    "--remove-markup")
      ;; The cleaned-up text is about to replace the message file. Refuse to
      ;; continue with an empty buffer, which would wipe out the message.
      (when (= (buffer-size) 0)
        (error "Spamassassin produced no output, message left unchanged"))
      (if show-buffer
          (kill-buffer show-buffer))
      (write-file msg-file)
      (when mh-sa-learn-executable
        (message "Recategorizing this message as ham...")
        ;; Each option must be its own argument: `call-process' does not
        ;; split strings on whitespace, so "--local --no-rebuild" would
        ;; reach sa-learn as a single, unrecognized option.
        (call-process mh-sa-learn-executable msg-file mh-temp-buffer nil
                      "--single" "--ham" "--local" "--no-rebuild"))
      (message "Whitelisting address...")
      (setq from (car (ietf-drums-parse-address (mh-get-header-field "From:"))))
      ;; The temporary buffer now visits the message file; discard it.
      (kill-buffer nil)
      ;; FROM is nil when no address could be parsed. Without the nil check
      ;; a bogus \"whitelist_from nil\" rule would be written.
      (unless (or (null from) (equal from ""))
        (mh-spamassassin-add-rule "whitelist_from" from))
      (message "Whitelisting address...done"))))
348
(defun mh-spamassassin-add-rule (rule body)
  "Add a new rule to ~/.spamassassin/user_prefs.
The name of the rule is RULE and its body is BODY.

The rule is written as a line of the form \"RULE<TAB>BODY\". If such a line is
already present anywhere in the file (ignoring case), nothing is added.
Otherwise the line is appended to the end of the file and the file is saved.
If the file was not being visited before the call, its buffer is killed
afterwards."
  (save-window-excursion
    (let* ((line (format "%s\t%s\n" rule body))
           (case-fold-search t)
           (file (expand-file-name "~/.spamassassin/user_prefs"))
           (buffer-exists (find-buffer-visiting file)))
      (find-file file)
      ;; Preserve point in case the user already had this file open.
      (save-excursion
        ;; Search the whole file, not just the text after point, and anchor
        ;; at the beginning of a line so a rule on the first line is found
        ;; as well.
        (goto-char (point-min))
        (unless (re-search-forward (concat "^" (regexp-quote line)) nil t)
          (goto-char (point-max))
          ;; Make sure the new rule starts on a line of its own.
          (insert (if (bolp) "" "\n") line)
          (save-buffer)))
      (if (not buffer-exists)
          (kill-buffer nil)))))
365
(defun mh-spamassassin-identify-spammers ()
  "Identify spammers who are repeat offenders.

For each blacklist_from entry from the last blank line of
~/.spamassassin/user_prefs to the end of the file, a list of host and domain
names along with their frequency counts is displayed. If the file contains no
blank line, all of its entries are considered. Only names that occur more than
twice are listed. This information can be used to replace multiple
blacklist_from entries with a single wildcard entry such as:

    blacklist_from *@*amazingoffersdirect2u.com"
  (interactive)
  (let* ((file (expand-file-name "~/.spamassassin/user_prefs"))
         (domains (make-hash-table :test 'equal)))
    (find-file file)
    ;; Only consider entries between last blank line and end of file.
    ;; Start one character before the end so that the empty line after a
    ;; final newline is not taken for the last blank line.
    (goto-char (1- (point-max)))
    ;; Fall back to the beginning of the file instead of signaling a search
    ;; error when there is no blank line at all.
    (unless (search-backward-regexp "^$" nil t)
      (goto-char (point-min)))
    ;; Perform frequency count.
    (save-excursion
      (while (search-forward-regexp "^blacklist_from\\s-*\\(.*\\)@\\(.*\\)$"
                                    nil t)
        ;; Split the host name into its parts and drop the last one (the
        ;; top-level domain), then count every remaining part.
        (dolist (part (cdr (reverse (split-string (match-string 2) "\\."))))
          (puthash part (1+ (gethash part domains 0)) domains))))

    ;; Output
    (delete-other-windows)
    (pop-to-buffer (get-buffer-create "*MH-E Spammer Frequencies*"))
    (erase-buffer)
    (maphash (lambda (key value)
               (if (> value 2)
                   (insert (format "%s %s\n" key value))))
             domains)
    ;; Sort by count, then flip the lines so the largest count comes first.
    (sort-numeric-fields 2 (point-min) (point-max))
    (reverse-region (point-min) (point-max))
    (goto-char (point-min))))
408
409(provide 'mh-junk)
410
411;;; Local Variables:
412;;; indent-tabs-mode: nil
413;;; sentence-end-double-space: nil
414;;; End:
415
6b61353c 416;;; arch-tag: 603335f1-77ff-4306-8828-5d3dad51abe1
924df208 417;;; mh-junk.el ends here