Add arch taglines
[bpt/emacs.git] / lisp / mh-e / mh-junk.el
1 ;;; mh-junk.el --- Interface to anti-spam measures
2
3 ;; Copyright (C) 2003 Free Software Foundation, Inc.
4
5 ;; Author: Satyaki Das <satyaki@theforce.stanford.edu>,
6 ;; Bill Wohler <wohler@newt.com>
7 ;; Maintainer: Bill Wohler <wohler@newt.com>
8 ;; Keywords: mail, spam
9
10 ;; This file is part of GNU Emacs.
11
12 ;; GNU Emacs is free software; you can redistribute it and/or modify
13 ;; it under the terms of the GNU General Public License as published by
14 ;; the Free Software Foundation; either version 2, or (at your option)
15 ;; any later version.
16
17 ;; GNU Emacs is distributed in the hope that it will be useful,
18 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;; GNU General Public License for more details.
21
22 ;; You should have received a copy of the GNU General Public License
23 ;; along with GNU Emacs; see the file COPYING. If not, write to the
24 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
25 ;; Boston, MA 02111-1307, USA.
26
27 ;;; Commentary:
28
29 ;; Spam handling in MH-E.
30
31 ;;; Change Log:
32
33 ;;; Code:
34
35 (require 'mh-e)
36
37 ;; Interactive functions callable from the folder buffer
38 ;;;###mh-autoload
(defun mh-junk-blacklist (msg-or-seq)
  "Blacklist MSG-OR-SEQ as spam.
The displayed message is used by default.
With a prefix argument, prompt for the message sequence instead.
If variable `transient-mark-mode' is non-nil and the mark is active, the
messages in the selected region are blacklisted.
When called from a program, MSG-OR-SEQ may be a message number, a list of
message numbers, a region in a cons cell, or a sequence.

Each message is first handed to the blacklist function selected by
`mh-junk-choice'. Afterwards, if `mh-junk-mail-folder' is a string the message
is refiled to that folder; if it is nil the message is deleted.

To change the spam program being used, customize `mh-junk-program'. Directly
setting `mh-junk-choice' is not recommended.

The documentation for the following functions describes what setup is needed
for the different spam fighting programs:

  - `mh-bogofilter-blacklist'
  - `mh-spamprobe-blacklist'
  - `mh-spamassassin-blacklist'"
  (interactive (list (mh-interactive-msg-or-seq "Blacklist")))
  (let ((spam-handler (nth 1 (assoc mh-junk-choice mh-junk-function-alist)))
        junk-folder)
    (if (null spam-handler)
        (error "Customize `mh-junk-program' appropriately"))
    ;; Normalize `mh-junk-mail-folder' into a folder name:
    ;;   nil     -> nil (delete instead of refile)
    ;;   \"\"      -> \"+\"
    ;;   +name   -> used as is
    ;;   @name   -> name appended to the current folder
    ;;   name    -> +name
    (setq junk-folder
          (cond ((null mh-junk-mail-folder) nil)
                ((equal mh-junk-mail-folder "") "+")
                (t
                 (let ((prefix (aref mh-junk-mail-folder 0)))
                   (cond ((eq prefix ?+)
                          mh-junk-mail-folder)
                         ((eq prefix ?@)
                          (concat mh-current-folder "/"
                                  (substring mh-junk-mail-folder 1)))
                         (t
                          (concat "+" mh-junk-mail-folder)))))))
    (mh-iterate-on-msg-or-seq msg msg-or-seq
      (funcall (symbol-function spam-handler) msg)
      (if junk-folder
          (mh-refile-a-msg nil (intern junk-folder))
        (mh-delete-a-msg nil)))
    (mh-next-msg)))
79
80 ;;;###mh-autoload
(defun mh-junk-whitelist (msg-or-seq)
  "Whitelist MSG-OR-SEQ incorrectly classified as spam.
The displayed message is used by default.
With a prefix argument, prompt for the message sequence instead.
If variable `transient-mark-mode' is non-nil and the mark is active, the
messages in the selected region are whitelisted.
When called from a program, MSG-OR-SEQ may be a message number, a list of
message numbers, a region in a cons cell, or a sequence.

Each message is first handed to the whitelist function selected by
`mh-junk-choice' and is then refiled to `mh-inbox'.

To change the spam program being used, customize `mh-junk-program'. Directly
setting `mh-junk-choice' is not recommended."
  (interactive (list (mh-interactive-msg-or-seq "Whitelist")))
  (let ((ham-handler (nth 2 (assoc mh-junk-choice mh-junk-function-alist))))
    (if (null ham-handler)
        (error "Customize `mh-junk-program' appropriately"))
    (mh-iterate-on-msg-or-seq msg msg-or-seq
      (funcall (symbol-function ham-handler) msg)
      (mh-refile-a-msg nil (intern mh-inbox)))
    (mh-next-msg)))
103
104 \f
105
106 ;; Bogofilter Interface
107
(defvar mh-bogofilter-executable
  (executable-find "bogofilter")
  "Location of the bogofilter program as found by `executable-find'.
This is nil if the program could not be found when this file was loaded.")
109
(defun mh-bogofilter-blacklist (msg)
  "Classify MSG as spam.
Tell bogofilter that the message is spam.

Bogofilter is a Bayesian spam filtering program. Get it from your local
distribution or from:
   http://bogofilter.sourceforge.net/

You first need to teach bogofilter. This is done by running

   bogofilter -n < good-message

on every good message, and

   bogofilter -s < spam-message

on every spam message. Most Bayesian filters need 1000 to 5000 of each to
start doing a good job.

To use bogofilter, add the following .procmailrc recipes which you can also
find in the bogofilter man page:

   # Bogofilter
   :0fw
   | bogofilter -u -e -p

   :0
   * ^X-Bogosity: Yes, tests=bogofilter
   $SPAM

Bogofilter continues to feed the messages it classifies back into its
database. Occasionally it misses, and those messages need to be reclassified.
MH-E can do this for you. Use \\[mh-junk-blacklist] to reclassify messages in
your +inbox as spam, and \\[mh-junk-whitelist] to reclassify messages in your
spambox as good messages."
  (if (null mh-bogofilter-executable)
      (error "Couldn't find the bogofilter executable"))
  ;; The message file is fed to bogofilter on standard input.  A destination
  ;; of 0 makes `call-process' discard the output and return without waiting
  ;; for the program to finish.
  (call-process mh-bogofilter-executable
                (mh-msg-filename msg mh-current-folder)
                0 nil "-Ns"))
149
(defun mh-bogofilter-whitelist (msg)
  "Reinstate incorrectly filtered MSG.
Train bogofilter to treat the message as non-spam."
  (if (null mh-bogofilter-executable)
      (error "Couldn't find the bogofilter executable"))
  ;; The message file is fed to bogofilter on standard input.  A destination
  ;; of 0 makes `call-process' discard the output and return without waiting
  ;; for the program to finish.
  (call-process mh-bogofilter-executable
                (mh-msg-filename msg mh-current-folder)
                0 nil "-Sn"))
157
158 \f
159
160 ;; Spamprobe Interface
161
(defvar mh-spamprobe-executable
  (executable-find "spamprobe")
  "Location of the spamprobe program as found by `executable-find'.
This is nil if the program could not be found when this file was loaded.")
163
(defun mh-spamprobe-blacklist (msg)
  "Classify MSG as spam.
Tell spamprobe that the message is spam.

Spamprobe is a Bayesian spam filtering program. More info about the program can
be found at:
   http://spamprobe.sourceforge.net

Here is a procmail recipe that stores incoming spam mail in the folder +spam
and good mail in /home/user/Mail/mdrop/mbox. This recipe is provided as an
example in the spamprobe man page.

   PATH=/bin:/usr/bin:/usr/local/bin
   DEFAULT=/home/user/Mail/mdrop/mbox
   SPAM=/home/user/Mail/spam/.

   # Spamprobe filtering
   :0
   SCORE=| spamprobe receive
   :0 wf
   | formail -I \"X-SpamProbe: $SCORE\"
   :0 a:
   *^X-SpamProbe: SPAM
   $SPAM

Occasionally some good mail gets misclassified as spam. You can use
\\[mh-junk-whitelist] to reclassify that as good mail."
  (if (null mh-spamprobe-executable)
      (error "Couldn't find the spamprobe executable"))
  ;; The message file is fed to spamprobe on standard input.  A destination
  ;; of 0 makes `call-process' discard the output and return without waiting
  ;; for the program to finish.
  (call-process mh-spamprobe-executable
                (mh-msg-filename msg mh-current-folder)
                0 nil "spam"))
195
(defun mh-spamprobe-whitelist (msg)
  "Reinstate incorrectly filtered MSG.
Train spamprobe to treat the message as non-spam."
  (if (null mh-spamprobe-executable)
      (error "Couldn't find the spamprobe executable"))
  ;; The message file is fed to spamprobe on standard input.  A destination
  ;; of 0 makes `call-process' discard the output and return without waiting
  ;; for the program to finish.
  (call-process mh-spamprobe-executable
                (mh-msg-filename msg mh-current-folder)
                0 nil "good"))
203
204 \f
205
206 ;; Spamassassin Interface
207
(defvar mh-spamassassin-executable
  (executable-find "spamassassin")
  "Location of the spamassassin program as found by `executable-find'.
This is nil if the program could not be found when this file was loaded.")

(defvar mh-sa-learn-executable
  (executable-find "sa-learn")
  "Location of the sa-learn program as found by `executable-find'.
This is nil if the program could not be found when this file was loaded, in
which case the sa-learn training steps are skipped.")
210
(defun mh-spamassassin-blacklist (msg)
  "Blacklist MSG.
This is done by sending the message to Razor and by appending the sender to
~/.spamassassin/user_prefs in a blacklist_from rule. If sa-learn is available,
the message is also recategorized as spam.

Spamassassin is an excellent spam filter. For more information, see:
   http://spamassassin.org/.

I ran \"spamassassin -t\" on every mail message in my archive and ran an
analysis in Gnumeric to find that the standard deviation of good mail
scored under 5 (coincidentally, the spamassassin default for \"spam\").

Furthermore, I observed that there weren't any messages with a score of 8
or more that were interesting, so I added a couple of points to be
conservative and send any message with a score of 10 or more down the
drain. You might want to use a score of 12 or 13 to be really conservative.
I have found that this really decreases the amount of junk to review.

Messages with a score of 5-9 are set aside for later review. The major
weakness of rules-based filters is a plethora of false positives\; I catch one
or two legitimate messages in here a week, so it is worthwhile to check.

You might choose to do this analysis yourself to pick a good score for
deleting spam sight unseen, or you might pick a score out of a hat, or you
might choose to be very conservative and not delete any messages at all.

Based upon this discussion, here is what the associated ~/.procmailrc
entries look like. These rules appear before my list filters so that spam
sent to mailing lists gets pruned too.

   #
   # Spam
   #
   :0fw
   | spamc

   # Anything with a spam level of 10 or more is junked immediately.
   :0:
   * ^X-Spam-Level: ..........
   /dev/null

   :0
   * ^X-Spam-Status: Yes
   $SPAM

If you don't use \"spamc\", use \"spamassassin -P -a\".

A handful of spam does find its way into +inbox. In this case, use
\\[mh-junk-blacklist] to add a \"blacklist_from\" line to
~/.spamassassin/user_prefs, delete the message, and send the message to the
Razor, so that others might not see this spam.

Over time, you see some patterns in the blacklisted addresses and can
replace several lines with wildcards. For example, it is clear that High
Speed Media is the biggest bunch of jerks on the Net. Here are some of the
entries I have for them, and the list continues to grow.

   blacklist_from *@*-hsm-*.com
   blacklist_from *@*182*643*.com
   blacklist_from *@*antarhsm*.com
   blacklist_from *@*h*speed*
   blacklist_from *@*hsm*182*.com
   blacklist_from *@*hsm*643*.com
   blacklist_from *@*hsmridi2983cslt227.com
   blacklist_from *@*list*hsm*.com
   blacklist_from *@h*s*media*
   blacklist_from *@hsmdrct.com
   blacklist_from *@hsmridi2983csltsite.com

The function `mh-spamassassin-identify-spammers' is provided that shows the
frequency counts of the host and domain names in your blacklist_from
entries. This can be helpful when editing the blacklist_from entries.

In versions of spamassassin (2.50 and on) that support a Bayesian classifier,
\\[mh-junk-blacklist] uses the sa-learn program to recategorize the message as
spam. Neither MH-E, nor spamassassin, rebuilds the database after adding
words, so you will need to run \"sa-learn --rebuild\" periodically. This can
be done by adding the following to your crontab:

   0 * * * * sa-learn --rebuild > /dev/null 2>&1"
  (unless mh-spamassassin-executable
    (error "Couldn't find the spamassassin executable"))
  ;; The folder name and message file are computed up front, before the
  ;; current buffer is switched to the temporary buffer below.
  (let ((current-folder mh-current-folder)
        (msg-file (mh-msg-filename msg mh-current-folder))
        sender)
    (save-excursion
      ;; Step 1: report the message; program output goes to the log buffer.
      (message "Giving this message the Razor...")
      (mh-truncate-log-buffer)
      (call-process mh-spamassassin-executable msg-file mh-log-buffer nil
                    "--report" "--remove-from-whitelist")
      ;; Step 2: train the Bayesian classifier when sa-learn is installed.
      (when mh-sa-learn-executable
        (message "Recategorizing this message as spam...")
        ;; `call-process' does not go through a shell, so every option has to
        ;; be passed as a separate argument; \"--local --no-rebuild\" as a
        ;; single string would reach sa-learn as one unknown option.
        (call-process mh-sa-learn-executable msg-file mh-log-buffer nil
                      "--single" "--spam" "--local" "--no-rebuild"))
      ;; Step 3: extract the sender with scan and record a blacklist rule.
      (message "Blacklisting address...")
      (set-buffer (get-buffer-create mh-temp-buffer))
      (erase-buffer)
      ;; The scan format prints nothing when the message is from the user's
      ;; own mailbox and the sender's address otherwise.
      (call-process (expand-file-name mh-scan-prog mh-progs) nil t nil
                    (format "%s" msg) current-folder
                    "-format" "%<(mymbox{from})%|%(addr{from})%>")
      (goto-char (point-min))
      (if (search-forward-regexp "^\\(.+\\)$" nil t)
          (progn
            (setq sender (match-string 0))
            (mh-spamassassin-add-rule "blacklist_from" sender)
            (message "Blacklisting address...done"))
        (message "Blacklisting address...not done (from my address)")))))
319
(defun mh-spamassassin-whitelist (msg)
  "Whitelist MSG.
Remove the spamassassin markup from the message file and add a whitelist_from
rule for its sender to the ~/.spamassassin/user_prefs file. If sa-learn is
available, then the message is recategorized as ham.

No rule is added when the sender's address cannot be determined from the
\"From:\" header field."
  (unless mh-spamassassin-executable
    (error "Couldn't find the spamassassin executable"))
  (let ((msg-file (mh-msg-filename msg mh-current-folder))
        (show-buffer (get-buffer mh-show-buffer))
        from)
    (save-excursion
      (set-buffer (get-buffer-create mh-temp-buffer))
      (erase-buffer)
      ;; Collect the cleaned-up message in the temporary buffer and write it
      ;; back over the original message file.
      (message "Removing spamassassin markup from message...")
      (call-process mh-spamassassin-executable msg-file mh-temp-buffer nil
                    "--remove-markup")
      ;; The show buffer, if any, is killed before the file is rewritten.
      (if show-buffer
          (kill-buffer show-buffer))
      (write-file msg-file)
      (when mh-sa-learn-executable
        (message "Recategorizing this message as ham...")
        ;; `call-process' does not go through a shell, so every option has to
        ;; be passed as a separate argument; \"--local --no-rebuild\" as a
        ;; single string would reach sa-learn as one unknown option.
        (call-process mh-sa-learn-executable msg-file mh-temp-buffer nil
                      "--single" "--ham" "--local" "--no-rebuild"))
      (message "Whitelisting address...")
      (setq from (car (ietf-drums-parse-address (mh-get-header-field "From:"))))
      ;; Done with the buffer holding the rewritten message.
      (kill-buffer nil)
      ;; Guard against both a missing (nil) and an empty address; without the
      ;; nil check a literal \"whitelist_from nil\" rule would be written.
      (unless (or (null from) (equal from ""))
        (mh-spamassassin-add-rule "whitelist_from" from))
      (message "Whitelisting address...done"))))
348
(defun mh-spamassassin-add-rule (rule body)
  "Add a new rule to ~/.spamassassin/user_prefs.
The name of the rule is RULE and its body is BODY.

The rule is written as a line of the form \"RULE<TAB>BODY\". If such a line is
already present anywhere in the file, nothing is added. Otherwise the line is
appended to the end of the file and the file is saved. If the file was not
being visited before this call, its buffer is killed afterwards."
  (save-window-excursion
    (let* ((entry (format "%s\t%s" rule body))
           (line (concat entry "\n"))
           (case-fold-search t)
           (file (expand-file-name "~/.spamassassin/user_prefs"))
           (buffer-exists (find-buffer-visiting file)))
      (find-file file)
      ;; When the file is already being visited, point can be anywhere in the
      ;; buffer, so always look for an existing rule from the beginning.
      ;; Anchoring with ^ and $ also finds a rule on the first line of the
      ;; file or on a last line that lacks a trailing newline.
      (save-excursion
        (goto-char (point-min))
        (unless (re-search-forward (concat "^" (regexp-quote entry) "$")
                                   nil t)
          (goto-char (point-max))
          (insert (if (bolp) "" "\n") line)
          (save-buffer)))
      (if (not buffer-exists)
          (kill-buffer nil)))))
365
(defun mh-spamassassin-identify-spammers ()
  "Identify spammers who are repeat offenders.

For each blacklist_from entry from the last blank line of
~/.spamassassin/user_prefs to the end of the file, a list of host and domain
names along with their frequency counts is displayed. If the file contains no
blank line, the whole file is considered. Only names that occur more than
twice are listed, most frequent first. This information can be used to replace
multiple blacklist_from entries with a single wildcard entry such as:

   blacklist_from *@*amazingoffersdirect2u.com"
  (interactive)
  (let* ((file (expand-file-name "~/.spamassassin/user_prefs"))
         (domains (make-hash-table :test 'equal)))
    (find-file file)
    ;; Only consider entries between last blank line and end of file.  Fall
    ;; back to the beginning of the file instead of signaling an error when
    ;; there is no blank line at all.
    (goto-char (1- (point-max)))
    (unless (search-backward-regexp "^$" nil t)
      (goto-char (point-min)))
    ;; Perform frequency count.
    (save-excursion
      (while (search-forward-regexp "^blacklist_from\\s-*\\(.*\\)@\\(.*\\)$"
                                    nil t)
        ;; Split the part after the @ on dots, dropping the last component
        ;; (the top-level domain).
        (let ((host (cdr (reverse (split-string (match-string 2) "\\.")))))
          ;; Add counts for each host and domain part.
          (while host
            (puthash (car host)
                     (1+ (gethash (car host) domains 0))
                     domains)
            (setq host (cdr host))))))

    ;; Output
    (delete-other-windows)
    (pop-to-buffer (get-buffer-create "*MH-E Spammer Frequencies*"))
    (erase-buffer)
    (maphash (lambda (key value)
               (if (> value 2)
                   (insert (format "%s %s\n" key value))))
             domains)
    ;; Sort on the count column, then flip the lines so that the largest
    ;; counts come first.
    (sort-numeric-fields 2 (point-min) (point-max))
    (reverse-region (point-min) (point-max))
    (goto-char (point-min))))
408
409 (provide 'mh-junk)
410
411 ;;; Local Variables:
412 ;;; indent-tabs-mode: nil
413 ;;; sentence-end-double-space: nil
414 ;;; End:
415
416 ;;; arch-tag: 603335f1-77ff-4306-8828-5d3dad51abe1
417 ;;; mh-junk.el ends here