Commit | Line | Data |
---|---|---|
23f87bed MB |
1 | ;;; spam-stat.el --- detecting spam based on statistics |
2 | ||
ba318903 | 3 | ;; Copyright (C) 2002-2014 Free Software Foundation, Inc. |
23f87bed MB |
4 | |
5 | ;; Author: Alex Schroeder <alex@gnu.org> | |
6 | ;; Keywords: network | |
7 | ;; URL: http://www.emacswiki.org/cgi-bin/wiki.pl?SpamStat | |
8 | ||
9 | ;; This file is part of GNU Emacs. | |
10 | ||
5e809f55 GM |
11 | ;; GNU Emacs is free software: you can redistribute it and/or modify |
12 | ;; it under the terms of the GNU General Public License as published by | |
13 | ;; the Free Software Foundation, either version 3 of the License, or | |
14 | ;; (at your option) any later version. | |
23f87bed | 15 | |
5e809f55 GM |
16 | ;; GNU Emacs is distributed in the hope that it will be useful, |
17 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
18 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
19 | ;; GNU General Public License for more details. | |
23f87bed MB |
20 | |
21 | ;; You should have received a copy of the GNU General Public License | |
5e809f55 | 22 | ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. |
23f87bed MB |
23 | |
24 | ;;; Commentary: | |
25 | ||
26 | ;; This implements spam analysis according to Paul Graham in "A Plan | |
27 | ;; for Spam". The basis for all this is a statistical distribution of | |
28 | ;; words for your spam and non-spam mails. We need this information | |
29 | ;; in a hash-table so that the analysis can use the information when | |
30 | ;; looking at your mails. Therefore, before you begin, you need tons | |
31 | ;; of mails (Graham uses 4000 non-spam and 4000 spam mails for his | |
32 | ;; experiments). | |
33 | ;; | |
34 | ;; The main interface to spam-stat consists of the following functions: | |
35 | ;; | |
36 | ;; `spam-stat-buffer-is-spam' -- called in a buffer, that buffer is | |
37 | ;; considered to be a new spam mail; use this for new mail that has | |
38 | ;; not been processed before | |
39 | ;; | |
40 | ;; `spam-stat-buffer-is-non-spam' -- called in a buffer, that buffer | |
41 | ;; is considered to be a new non-spam mail; use this for new mail that | |
42 | ;; has not been processed before | |
43 | ;; | |
44 | ;; `spam-stat-buffer-change-to-spam' -- called in a buffer, that | |
45 | ;; buffer is no longer considered to be normal mail but spam; use this | |
46 | ;; to change the status of a mail that has already been processed as | |
47 | ;; non-spam | |
48 | ;; | |
49 | ;; `spam-stat-buffer-change-to-non-spam' -- called in a buffer, that | |
50 | ;; buffer is no longer considered to be spam but normal mail; use this | |
51 | ;; to change the status of a mail that has already been processed as | |
52 | ;; spam | |
53 | ;; | |
54 | ;; `spam-stat-save' -- save the hash table to the file; the filename | |
55 | ;; used is stored in the variable `spam-stat-file' | |
56 | ;; | |
57 | ;; `spam-stat-load' -- load the hash table from a file; the filename | |
58 | ;; used is stored in the variable `spam-stat-file' | |
59 | ;; | |
60 | ;; `spam-stat-score-word' -- return the spam score for a word | |
61 | ;; | |
62 | ;; `spam-stat-score-buffer' -- return the spam score for a buffer | |
63 | ;; | |
64 | ;; `spam-stat-split-fancy' -- for fancy mail splitting; add | |
65 | ;; the rule (: spam-stat-split-fancy) to `nnmail-split-fancy' | |
66 | ;; | |
67 | ;; This requires the following in your ~/.gnus file: | |
68 | ;; | |
69 | ;; (require 'spam-stat) | |
70 | ;; (spam-stat-load) | |
71 | ||
72 | ;;; Testing: | |
73 | ||
74 | ;; A typical test will involve calls to the following functions: | |
75 | ;; | |
76 | ;; Reset: (spam-stat-reset) | |
77 | ;; Learn spam: (spam-stat-process-spam-directory "~/Mail/mail/spam") | |
78 | ;; Learn non-spam: (spam-stat-process-non-spam-directory "~/Mail/mail/misc") | |
79 | ;; Save table: (spam-stat-save) | |
80 | ;; File size: (nth 7 (file-attributes spam-stat-file)) | |
81 | ;; Number of words: (hash-table-count spam-stat) | |
82 | ;; Test spam: (spam-stat-test-directory "~/Mail/mail/spam") | |
83 | ;; Test non-spam: (spam-stat-test-directory "~/Mail/mail/misc") | |
84 | ;; Reduce table size: (spam-stat-reduce-size) | |
85 | ;; Save table: (spam-stat-save) | |
86 | ;; File size: (nth 7 (file-attributes spam-stat-file)) | |
87 | ;; Number of words: (hash-table-count spam-stat) | |
88 | ;; Test spam: (spam-stat-test-directory "~/Mail/mail/spam") | |
89 | ;; Test non-spam: (spam-stat-test-directory "~/Mail/mail/misc") | |
90 | ||
91 | ;;; Dictionary Creation: | |
92 | ||
93 | ;; Typically, you will filter away mailing lists etc. using specific | |
94 | ;; rules in `nnmail-split-fancy'. Somewhere among these rules, you | |
95 | ;; will filter spam. Here is how you would create your dictionary: | |
96 | ||
97 | ;; Reset: (spam-stat-reset) | |
98 | ;; Learn spam: (spam-stat-process-spam-directory "~/Mail/mail/spam") | |
99 | ;; Learn non-spam: (spam-stat-process-non-spam-directory "~/Mail/mail/misc") | |
100 | ;; Repeat for any other non-spam group you need... | |
101 | ;; Reduce table size: (spam-stat-reduce-size) | |
102 | ;; Save table: (spam-stat-save) | |
103 | ||
104 | ;;; Todo: | |
105 | ||
106 | ;; Speed it up. Integrate with Gnus such that it uses spam and expiry | |
107 | ;; marks to call the appropriate functions when leaving the summary | |
108 | ;; buffer and saves the hash table when leaving Gnus. More testing: | |
109 | ;; More mails, disabling SpamAssassin, double checking algorithm, find | |
110 | ;; improved algorithm. | |
111 | ||
112 | ;;; Thanks: | |
113 | ||
114 | ;; Ted Zlatanov <tzz@lifelogs.com> | |
115 | ;; Jesper Harder <harder@myrealbox.com> | |
116 | ;; Dan Schmidt <dfan@dfan.org> | |
117 | ;; Lasse Rasinen <lrasinen@iki.fi> | |
118 | ;; Milan Zamazal <pdm@zamazal.org> | |
119 | ||
120 | \f | |
121 | ||
122 | ;;; Code: | |
01c52d31 | 123 | (require 'mail-parse) |
23f87bed | 124 | |
5998e245 JB |
125 | (defvar gnus-original-article-buffer) |
126 | ||
23f87bed MB |
;; Customization group; every user option of this file is registered
;; under it.
(defgroup spam-stat nil
  "Statistical spam detection for Emacs.
Build a dictionary recording how often each word occurs in spam and
in non-spam mails, then use it to decide whether a buffer contains
spam or not."
  :group 'gnus
  :version "22.1")

(defcustom spam-stat-file "~/.spam-stat.el"
  "Name of the file the dictionary is saved to and loaded from.
The format of the file is described in `spam-stat-to-hash-table'."
  :group 'spam-stat
  :type 'file)

(defcustom spam-stat-unknown-word-score 0.2
  "Score given to words that are not in the dictionary.
Also used for words that don't appear often enough."
  :group 'spam-stat
  :type 'number)

(defcustom spam-stat-max-word-length 15
  "Length limit for words; only shorter words are considered."
  :group 'spam-stat
  :type 'integer)

(defcustom spam-stat-max-buffer-length 10240
  "Number of characters at the beginning of a buffer that are analyzed.
The rest of the buffer is ignored."
  :group 'spam-stat
  :type 'integer)
157 | ||
(defcustom spam-stat-split-fancy-spam-group "mail.spam"
  "Name of the group where spam should be stored.
Only used when `spam-stat-split-fancy' appears in the fancy splitting
rules; it has no effect when spam-stat is invoked through spam.el."
  :type 'string
  :group 'spam-stat)
164 | ||
01c52d31 MB |
(defcustom spam-stat-split-fancy-spam-threshold 0.9
  "Score above which `spam-stat-split-fancy' treats a mail as spam."
  :group 'spam-stat
  :type 'number)

(defcustom spam-stat-washing-hook nil
  "Hook run on each message before its words are analyzed."
  :group 'spam-stat
  :type 'hook)
174 | ||
(defcustom spam-stat-score-buffer-user-functions nil
  "List of additional scoring functions.
Called one by one on the buffer.

If all of these functions return non-nil answers, these numerical
answers are added to the computed spam stat score on the buffer.  If
you defun such functions, make sure they don't return the buffer in a
narrowed state or such: use, for example, `save-excursion'.  Each of
your functions is also passed the initial spam-stat score which might
aid in your scoring.

Also be careful when defining such functions.  If they take a long
time, they will slow down your mail splitting.  Thus, if the buffer is
large, don't forget to use smaller regions, by wrapping your work in,
say, `with-spam-stat-max-buffer-size'."
  ;; Every element is called as a function, so say so in the
  ;; Customize type instead of accepting arbitrary sexps.
  :type '(repeat function)
  :group 'spam-stat)
192 | ||
(defcustom spam-stat-process-directory-age 90
  "Maximum age, in days, of files processed in a directory.
`spam-stat-process-spam-directory' and
`spam-stat-process-non-spam-directory' only consider files that have
been touched within this many days.  Without this filter,
re-training spam-stat with several thousand messages will start to
take a very long time."
  :group 'spam-stat
  :type 'number)
202 | ||
(defvar spam-stat-last-saved-at nil
  "Modification time of `spam-stat-file' as of the last save or load.
It is nil until the file has been saved or loaded in this session.")

(defvar spam-stat-syntax-table
  (let ((table (copy-syntax-table text-mode-syntax-table)))
    ;; Punctuation that should count as part of a word.
    (dolist (char '(?- ?_ ?. ?! ?? ?+))
      (modify-syntax-entry char "w" table))
    table)
  "Syntax table in effect while mails are split into words.
What matters is the set of characters that are word constituents.")

(defvar spam-stat-dirty nil
  "Non-nil if the spam-stat database has unsaved changes.")

(defvar spam-stat-buffer nil
  "Buffer whose contents are scored while splitting mail.
It is set by the functions spam-stat hooks into Gnus.")

(defvar spam-stat-buffer-name " *spam stat buffer*"
  "Name of the buffer stored in `spam-stat-buffer'.")
227 | ||
01c52d31 MB |
(defvar spam-stat-coding-system
  ;; Prefer `emacs-mule' where that coding system exists.
  (cond ((mm-coding-system-p 'emacs-mule) 'emacs-mule)
        (t 'raw-text))
  "Coding system used to read and write `spam-stat-file'.")
23f87bed MB |
231 | |
232 | ;; Hooking into Gnus | |
233 | ||
(defun spam-stat-store-current-buffer ()
  "Copy the contents of the current buffer into `spam-stat-buffer'.
The copy lives in the buffer named by `spam-stat-buffer-name', which
is created on demand and emptied before the text is inserted."
  (let ((source (current-buffer))
        (copy (get-buffer-create spam-stat-buffer-name)))
    (with-current-buffer copy
      (erase-buffer)
      (insert-buffer-substring source)
      (setq spam-stat-buffer (current-buffer)))))
241 | ||
(defun spam-stat-store-gnus-article-buffer ()
  "Store a copy of the current article in `spam-stat-buffer'.
The text is taken from `gnus-original-article-buffer'."
  (with-current-buffer gnus-original-article-buffer
    (spam-stat-store-current-buffer)))
247 | ||
248 | ;; Data -- not using defstruct in order to save space and time | |
249 | ||
(defvar spam-stat (make-hash-table :test 'equal)
  "Hash table used to store the statistics.
Use `spam-stat-load' to load the file.
Every word is used as a key in this table.  The value is a vector
holding the word's good count, its bad count and its score.  Use
`spam-stat-good', `spam-stat-bad', and `spam-stat-score' to read the
vector, and `spam-stat-set-good', `spam-stat-set-bad', and
`spam-stat-set-score' to modify it.")

(defvar spam-stat-ngood 0
  "The number of good mails in the dictionary.")

(defvar spam-stat-nbad 0
  "The number of bad mails in the dictionary.")

(defvar spam-stat-error-holder nil
  "A holder for condition-case errors while scoring buffers.")
265 | ||
23f87bed MB |
(defsubst spam-stat-good (entry)
  "Return the good-mail occurrence count stored in ENTRY."
  ;; Slot 0 of the entry vector.
  (aref entry 0))

(defsubst spam-stat-bad (entry)
  "Return the bad-mail occurrence count stored in ENTRY."
  ;; Slot 1 of the entry vector.
  (aref entry 1))
273 | ||
(defsubst spam-stat-score (entry)
  "Return the score stored in ENTRY.
If ENTRY is nil, return `spam-stat-unknown-word-score' instead."
  (if entry
      (aref entry 2)
    spam-stat-unknown-word-score))
279 | ||
(defsubst spam-stat-set-good (entry value)
  "Store VALUE as the good-mail occurrence count of ENTRY."
  ;; Slot 0 of the entry vector.
  (aset entry 0 value))

(defsubst spam-stat-set-bad (entry value)
  "Store VALUE as the bad-mail occurrence count of ENTRY."
  ;; Slot 1 of the entry vector.
  (aset entry 1 value))

(defsubst spam-stat-set-score (entry value)
  "Store VALUE as the score of ENTRY."
  ;; Slot 2 of the entry vector.
  (aset entry 2 value))
291 | ||
(defsubst spam-stat-make-entry (good bad)
  "Return a new entry vector for a word seen GOOD and BAD times.
The score slot is filled in with `spam-stat-compute-score'."
  (let ((fresh (vector good bad nil)))
    (spam-stat-set-score fresh (spam-stat-compute-score fresh))
    fresh))
297 | ||
298 | ;; Computing | |
299 | ||
(defun spam-stat-compute-score (entry)
  "Compute the score of the word described by ENTRY.  1.0 means spam.
Words that don't appear often enough (twice the good count plus the
bad count is below 5) get `spam-stat-unknown-word-score'.  If the
dictionary has no good mails the score is .99, if it has no bad
mails the score is .01.  Otherwise the score is the word's share of
bad mails relative to its doubled share of good mails, clamped to
the range .01 to .99."
  ;; Promote all numbers to floats for the divisions.
  (let ((g (* 2.0 (spam-stat-good entry)))
        (b (float (spam-stat-bad entry))))
    (cond ((< (+ g b) 5)
           ;; Too rare to be meaningful: score it like an unknown word,
           ;; as the option's documentation promises.
           spam-stat-unknown-word-score)
          ((= 0 spam-stat-ngood)
           .99)
          ((= 0 spam-stat-nbad)
           .01)
          (t
           (let ((bad-share (/ b spam-stat-nbad))
                 (good-share (/ g spam-stat-ngood)))
             (max .01
                  (min .99 (/ bad-share
                              (+ good-share bad-share)))))))))
316 | ||
317 | ;; Parsing | |
318 | ||
(defmacro with-spam-stat-max-buffer-size (&rest body)
  "Evaluate BODY with the buffer narrowed to its beginning.
If the accessible portion of the buffer is longer than
`spam-stat-max-buffer-length' characters, it is narrowed to its first
`spam-stat-max-buffer-length' characters before BODY runs.  The
previous restriction is restored afterwards.  Return the value of
the last form in BODY."
  `(save-restriction
     (when (> (- (point-max)
                 (point-min))
              spam-stat-max-buffer-length)
       (narrow-to-region (point-min)
                         (+ (point-min) spam-stat-max-buffer-length)))
     ,@body))
328 | ||
(defun spam-stat-buffer-words ()
  "Return a hash table mapping words in the buffer to their frequency.
`spam-stat-washing-hook' is run first.  Only the part of the buffer
selected by `with-spam-stat-max-buffer-size' is scanned, words are
delimited according to `spam-stat-syntax-table', and only words
shorter than `spam-stat-max-word-length' are recorded."
  (run-hooks 'spam-stat-washing-hook)
  (with-spam-stat-max-buffer-size
   (with-syntax-table spam-stat-syntax-table
     (goto-char (point-min))
     (let ((frequencies (make-hash-table :test 'equal))
           token)
       (while (re-search-forward "\\w+" nil t)
         (setq token (match-string-no-properties 0))
         ;; Overlong tokens are skipped entirely.
         (when (< (length token) spam-stat-max-word-length)
           (puthash token
                    (1+ (gethash token frequencies 0))
                    frequencies)))
       frequencies))))
343 | ||
(defun spam-stat-buffer-is-spam ()
  "Record the current buffer as a new spam mail.
Increment `spam-stat-nbad', add the buffer's word counts to the bad
counts in `spam-stat' (creating entries for new words), recompute
the affected scores and mark the dictionary as dirty."
  (setq spam-stat-nbad (1+ spam-stat-nbad))
  (let ((buffer-words (spam-stat-buffer-words)))
    (maphash
     (lambda (word count)
       (let ((entry (gethash word spam-stat)))
         (cond (entry
                (spam-stat-set-bad entry (+ count (spam-stat-bad entry))))
               (t
                (setq entry (spam-stat-make-entry 0 count))))
         (spam-stat-set-score entry (spam-stat-compute-score entry))
         (puthash word entry spam-stat)))
     buffer-words))
  (setq spam-stat-dirty t))
357 | ||
(defun spam-stat-buffer-is-non-spam ()
  "Record the current buffer as a new non-spam mail.
Increment `spam-stat-ngood', add the buffer's word counts to the good
counts in `spam-stat' (creating entries for new words), recompute
the affected scores and mark the dictionary as dirty."
  (setq spam-stat-ngood (1+ spam-stat-ngood))
  (let ((buffer-words (spam-stat-buffer-words)))
    (maphash
     (lambda (word count)
       (let ((entry (gethash word spam-stat)))
         (cond (entry
                (spam-stat-set-good entry (+ count (spam-stat-good entry))))
               (t
                (setq entry (spam-stat-make-entry count 0))))
         (spam-stat-set-score entry (spam-stat-compute-score entry))
         (puthash word entry spam-stat)))
     buffer-words))
  (setq spam-stat-dirty t))
371 | ||
64c7b653 GM |
372 | (autoload 'gnus-message "gnus-util") |
373 | ||
23f87bed MB |
(defun spam-stat-buffer-change-to-spam ()
  "Reclassify the current buffer from normal mail to spam.
Move one mail from `spam-stat-ngood' to `spam-stat-nbad' and, for
every word of the buffer found in `spam-stat', move its count from
the good to the bad column and recompute its score.  Words missing
from the dictionary are only reported with `gnus-message'."
  (setq spam-stat-nbad (1+ spam-stat-nbad)
        spam-stat-ngood (1- spam-stat-ngood))
  (let ((buffer-words (spam-stat-buffer-words)))
    (maphash
     (lambda (word count)
       (let ((entry (gethash word spam-stat)))
         (if entry
             (progn
               (spam-stat-set-good entry (- (spam-stat-good entry) count))
               (spam-stat-set-bad entry (+ (spam-stat-bad entry) count))
               (spam-stat-set-score entry (spam-stat-compute-score entry))
               (puthash word entry spam-stat))
           (gnus-message 8 "This buffer has unknown words in it"))))
     buffer-words))
  (setq spam-stat-dirty t))
389 | ||
(defun spam-stat-buffer-change-to-non-spam ()
  "Reclassify the current buffer from spam to normal mail.
Move one mail from `spam-stat-nbad' to `spam-stat-ngood' and, for
every word of the buffer found in `spam-stat', move its count from
the bad to the good column and recompute its score.  Words missing
from the dictionary are only reported with `gnus-message'."
  (setq spam-stat-nbad (1- spam-stat-nbad)
        spam-stat-ngood (1+ spam-stat-ngood))
  (let ((buffer-words (spam-stat-buffer-words)))
    (maphash
     (lambda (word count)
       (let ((entry (gethash word spam-stat)))
         (if entry
             (progn
               (spam-stat-set-good entry (+ (spam-stat-good entry) count))
               (spam-stat-set-bad entry (- (spam-stat-bad entry) count))
               (spam-stat-set-score entry (spam-stat-compute-score entry))
               (puthash word entry spam-stat))
           (gnus-message 8 "This buffer has unknown words in it"))))
     buffer-words))
  (setq spam-stat-dirty t))
405 | ||
406 | ;; Saving and Loading | |
407 | ||
(defun spam-stat-save (&optional force)
  "Write the `spam-stat' hash table to `spam-stat-file' as Lisp code.
Nothing is written unless `spam-stat-dirty' is non-nil; with a
prefix argument (FORCE non-nil), save unconditionally."
  (interactive "P")
  (when (or force spam-stat-dirty)
    (let ((coding-system-for-write spam-stat-coding-system))
      (with-temp-file spam-stat-file
        ;; `prin1' below prints into the temporary buffer.
        (let ((standard-output (current-buffer)))
          (insert (format ";-*- coding: %s; -*-\n" spam-stat-coding-system))
          (insert (format "(setq spam-stat-ngood %d spam-stat-nbad %d
spam-stat (spam-stat-to-hash-table '(" spam-stat-ngood spam-stat-nbad))
          ;; One (WORD GOOD BAD) list per dictionary entry; scores are
          ;; not written out.
          (maphash (lambda (word entry)
                     (let ((good (spam-stat-good entry))
                           (bad (spam-stat-bad entry)))
                       (prin1 (list word good bad))))
                   spam-stat)
          (insert ")))"))))
    (message "Saved %s." spam-stat-file)
    (setq spam-stat-dirty nil)
    (setq spam-stat-last-saved-at
          (nth 5 (file-attributes spam-stat-file)))))
23f87bed MB |
428 | |
(defun spam-stat-load ()
  "Read the `spam-stat' hash table from `spam-stat-file'.
The file is not loaded when the dictionary has unsaved changes, nor
when its modification time equals `spam-stat-last-saved-at'."
  ;; TODO: maybe we should warn the user if spam-stat-dirty is t?
  (let ((coding-system-for-read spam-stat-coding-system))
    (cond (spam-stat-dirty
           (message "Spam stat not loaded: spam-stat-dirty t"))
          ((not (and (boundp 'spam-stat-last-saved-at)
                     spam-stat-last-saved-at
                     (equal spam-stat-last-saved-at
                            (nth 5 (file-attributes spam-stat-file)))))
           ;; Never loaded, or the file changed on disk since then.
           (load-file spam-stat-file)
           (setq spam-stat-dirty nil
                 spam-stat-last-saved-at
                 (nth 5 (file-attributes spam-stat-file))))
          (t
           (message "Spam stat file not loaded: no change in disk.")))))
23f87bed MB |
444 | |
(defun spam-stat-to-hash-table (entries)
  "Turn list ENTRIES into a hash table and return it.
Every element in ENTRIES has the form \(WORD GOOD BAD) where WORD is
the word string, GOOD is the number of times it appeared in good
mails, and BAD is the number of times it has appeared in bad mails.
The returned table maps each WORD to an entry made by
`spam-stat-make-entry'.  The file written by `spam-stat-save' assigns
the result to `spam-stat'."
  (let ((table (make-hash-table :size (length entries)
                                :test 'equal)))
    (dolist (item entries)
      (puthash (car item)
               (spam-stat-make-entry (nth 1 item) (nth 2 item))
               table))
    table))
460 | ||
(defun spam-stat-reset ()
  "Discard all statistics.
`spam-stat' becomes an empty hash table, the mail counters
`spam-stat-ngood' and `spam-stat-nbad' are zeroed, and the
dictionary is marked as dirty."
  (interactive)
  (setq spam-stat (make-hash-table :test 'equal))
  (setq spam-stat-ngood 0)
  (setq spam-stat-nbad 0)
  (setq spam-stat-dirty t))
469 | ||
470 | ;; Scoring buffers | |
471 | ||
(defvar spam-stat-score-data nil
  "Raw data from the most recent call to `spam-stat-score-buffer'.")

(defsubst spam-stat-score-word (word)
  "Look WORD up in `spam-stat' and return its score.
Words missing from the dictionary get the default stored in
`spam-stat-unknown-word-score'."
  (let ((entry (gethash word spam-stat)))
    (spam-stat-score entry)))
480 | ||
(defun spam-stat-buffer-words-with-scores ()
  "Process current buffer, return the 15 most conspicuous words.
These are the words whose spam-stat differs the most from 0.5.
The list returned contains elements of the form \(WORD SCORE DIFF),
where DIFF is the difference between SCORE and 0.5.
If the buffer contains fewer than 15 distinct words, all of them are
returned; the result is nil when there are none."
  (let (result score)
    (maphash (lambda (word _count)
               (setq score (spam-stat-score-word word)
                     result (cons (list word score (abs (- score 0.5)))
                                  result)))
             (spam-stat-buffer-words))
    ;; Most conspicuous (largest DIFF) first.
    (setq result (sort result (lambda (a b) (< (nth 2 b) (nth 2 a)))))
    ;; Cut the list after its 15th element.  With fewer elements
    ;; `nthcdr' yields nil, which must not be handed to `setcdr'.
    (let ((last-kept (nthcdr 14 result)))
      (when last-kept
        (setcdr last-kept nil)))
    result))
23f87bed | 495 | |
da946239 KY |
(eval-when-compile
  ;; Compatibility shim, defined inside `eval-when-compile'.  The
  ;; `condition-case' below runs when the macro is expanded; the
  ;; backquoted/quoted forms are what the macro expands to.
  (defmacro spam-stat-called-interactively-p (kind)
    (condition-case nil
        (progn
          ;; Probe: signals an error unless `called-interactively-p'
          ;; is defined and accepts one argument.
          (eval '(called-interactively-p 'any))
          ;; Emacs >=23.2
          `(called-interactively-p ,kind))
      ;; Emacs <23.2
      (wrong-number-of-arguments '(called-interactively-p))
      ;; XEmacs
      (void-function '(interactive-p)))))
507 | ||
(defun spam-stat-score-buffer ()
  "Return a score describing the spam-probability for this buffer.
The per-word scores from `spam-stat-buffer-words-with-scores' are
saved in `spam-stat-score-data' and combined into one score.  If
`spam-stat-score-buffer-user' returns a number, it is added to that
score; an error signaled while it runs is ignored."
  (interactive)                         ; helps in debugging.
  (setq spam-stat-score-data (spam-stat-buffer-words-with-scores))
  (let* ((probs (mapcar #'cadr spam-stat-score-data))
         (spam-product (apply #'* probs))
         (ham-product (apply #'* (mapcar (lambda (p) (- 1 p)) probs)))
         (base-score (/ spam-product (+ spam-product ham-product)))
         (user-score
          (condition-case spam-stat-error-holder
              (spam-stat-score-buffer-user base-score)
            (error nil)))
         (final-score
          (if user-score
              (+ base-score user-score)
            base-score)))
    (when (spam-stat-called-interactively-p 'any)
      (message "%S" final-score))
    final-score))
528 | ||
(defun spam-stat-score-buffer-user (&rest args)
  "Apply every user scoring function to ARGS and sum the results.
The functions come from `spam-stat-score-buffer-user-functions' and
are all called, in order.  Return nil if any of them returned nil,
otherwise the sum of their return values."
  (let ((answers (mapcar (lambda (scorer) (apply scorer args))
                         spam-stat-score-buffer-user-functions)))
    (unless (memq nil answers)
      (apply #'+ answers))))
23f87bed MB |
537 | |
(defun spam-stat-split-fancy ()
  "Return the name of the spam group if the current mail is spam.
Use this function on `nnmail-split-fancy'.  The mail scored is the
one in `spam-stat-buffer'; it counts as spam when its score exceeds
`spam-stat-split-fancy-spam-threshold'.  Return nil otherwise, and
also when an error occurs, in which case a message is shown.  The
raw data used for the last run of `spam-stat-score-buffer' is
available in the variable `spam-stat-score-data'."
  (condition-case spam-stat-error-holder
      (progn
        (set-buffer spam-stat-buffer)
        (goto-char (point-min))
        (and (> (spam-stat-score-buffer) spam-stat-split-fancy-spam-threshold)
             (progn
               ;; Expose the deciding words in the split trace.
               (when (boundp 'nnmail-split-trace)
                 (dolist (entry spam-stat-score-data)
                   (push entry nnmail-split-trace)))
               spam-stat-split-fancy-spam-group)))
    (error
     (message "Error in spam-stat-split-fancy: %S" spam-stat-error-holder)
     nil)))
555 | ||
556 | ;; Testing | |
557 | ||
(defun spam-stat-strip-xref ()
  "Delete the first Xref header line from the message head.
The buffer restriction in effect before the call is restored."
  (save-restriction
    (mail-narrow-to-head)
    (if (re-search-forward "^Xref:.*\n" nil t)
        (let ((start (match-beginning 0))
              (end (match-end 0)))
          (delete-region start end)))))
564 | ||
aa8f8277 GM |
565 | (autoload 'time-to-number-of-days "time-date") |
566 | ||
23f87bed MB |
(defun spam-stat-process-directory (dir func)
  "Call FUNC on every suitable regular file in directory DIR.
Files whose name starts with a dot are skipped.  A file is suitable
if it is readable, regular, not empty, and was modified less than
`spam-stat-process-directory-age' days ago.  Each file is inserted
literally into a temporary buffer, its Xref header is removed with
`spam-stat-strip-xref', and FUNC is called with no arguments in that
buffer."
  (let* ((candidates (directory-files dir t "^[^.]"))
         ;; Dividing a file counter by this yields a percentage.
         (percent-unit (/ (length candidates) 100.0))
         (processed 0))
    (with-temp-buffer
      (dolist (file candidates)
        (when (and (file-readable-p file)
                   (file-regular-p file)
                   (> (nth 7 (file-attributes file)) 0)
                   (< (time-to-number-of-days
                       (time-since (nth 5 (file-attributes file))))
                      spam-stat-process-directory-age))
          (setq processed (1+ processed))
          (message "Reading %s: %.2f%%" dir (/ processed percent-unit))
          (insert-file-contents-literally file)
          (spam-stat-strip-xref)
          (funcall func)
          (erase-buffer))))))
585 | ||
(defun spam-stat-process-spam-directory (dir)
  "Learn every regular file in directory DIR as spam.
The work is done by `spam-stat-process-directory', which calls
`spam-stat-buffer-is-spam' for each file."
  (interactive "D")
  (spam-stat-process-directory dir #'spam-stat-buffer-is-spam))

(defun spam-stat-process-non-spam-directory (dir)
  "Learn every regular file in directory DIR as non-spam.
The work is done by `spam-stat-process-directory', which calls
`spam-stat-buffer-is-non-spam' for each file."
  (interactive "D")
  (spam-stat-process-directory dir #'spam-stat-buffer-is-non-spam))
595 | ||
(defun spam-stat-count ()
  "Return the number of words stored in `spam-stat'."
  (interactive)
  (let ((words (hash-table-count spam-stat)))
    words))
600 | ||
(defun spam-stat-test-directory (dir &optional verbose)
  "Test all the regular files in directory DIR for spam.
If the result is 1.0, then all files are considered spam.
If the result is 0.0, none of the files is considered spam.
You can use this to determine error rates.

A file counts as spam when `spam-stat-score-buffer' gives it a score
above 0.9.  The result is reported in the echo area as the number of
spam files divided by the number of files found in DIR; for an empty
directory the ratio is reported as 0.0.

If VERBOSE is non-nil display names of files detected as spam or
non-spam in a temporary buffer.  If it is the symbol `ham',
display non-spam files; otherwise display spam files."
  (interactive "DDirectory: ")
  (let* ((files (directory-files dir t "^[^.]"))
         (threshold 0.9)                ; score above which a file is spam
         display-files
         buffer-score
         (total (length files))
         (score 0.0)                    ; float
         (max (/ total 100.0))          ; float
         (count 0))
    (with-temp-buffer
      (dolist (f files)
        (when (and (file-readable-p f)
                   (file-regular-p f)
                   (> (nth 7 (file-attributes f)) 0))
          (setq count (1+ count))
          (message "Reading %.2f%%, score %.2f"
                   (/ count max) (/ score count))
          (insert-file-contents-literally f)
          (setq buffer-score (spam-stat-score-buffer))
          (when (> buffer-score threshold)
            (setq score (1+ score)))
          (when verbose
            (if (> buffer-score threshold)
                (unless (eq verbose 'ham) (push f display-files))
              (when (eq verbose 'ham) (push f display-files))))
          (erase-buffer))))
    (when display-files
      (with-output-to-temp-buffer "*spam-stat results*"
        (dolist (file display-files)
          (princ file)
          (terpri))))
    ;; Dividing the float SCORE by a zero TOTAL would print a NaN.
    (message "Final score: %d / %d = %f" score total
             (if (zerop total) 0.0 (/ score total)))))
641 | ||
642 | ;; Shrinking the dictionary | |
643 | ||
(defun spam-stat-reduce-size (&optional count)
  "Shrink `spam-stat' by dropping rare words.
Every word whose good and bad counts add up to less than COUNT is
removed from the dictionary, which is then marked as dirty.
COUNT defaults to 5."
  (interactive)
  (unless count
    (setq count 5))
  (maphash (lambda (word entry)
             (let ((occurrences (+ (spam-stat-good entry)
                                   (spam-stat-bad entry))))
               (when (< occurrences count)
                 (remhash word spam-stat))))
           spam-stat)
  (setq spam-stat-dirty t))
657 | ||
(defun spam-stat-install-hooks-function ()
  "Add the spam-stat functions to the Gnus hooks they rely on.
`spam-stat-store-current-buffer' goes on
`nnmail-prepare-incoming-message-hook' and
`spam-stat-store-gnus-article-buffer' on `gnus-select-article-hook'."
  (interactive)
  (add-hook 'nnmail-prepare-incoming-message-hook
            #'spam-stat-store-current-buffer)
  (add-hook 'gnus-select-article-hook
            #'spam-stat-store-gnus-article-buffer))
665 | ||
(defun spam-stat-unload-hook ()
  "Remove the spam-stat functions from the Gnus hooks.
This undoes `spam-stat-install-hooks-function'."
  (interactive)
  (remove-hook 'nnmail-prepare-incoming-message-hook
               #'spam-stat-store-current-buffer)
  (remove-hook 'gnus-select-article-hook
               #'spam-stat-store-gnus-article-buffer))

;; Put the function above on the hook variable of the same name.
(add-hook 'spam-stat-unload-hook #'spam-stat-unload-hook)
675 | ||
23f87bed MB |
676 | (provide 'spam-stat) |
677 | ||
23f87bed | 678 | ;;; spam-stat.el ends here |