| 1 | ;;; spam-stat.el --- detecting spam based on statistics |
| 2 | |
| 3 | ;; Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, |
| 4 | ;; 2010 Free Software Foundation, Inc. |
| 5 | |
| 6 | ;; Author: Alex Schroeder <alex@gnu.org> |
| 7 | ;; Keywords: network |
| 8 | ;; URL: http://www.emacswiki.org/cgi-bin/wiki.pl?SpamStat |
| 9 | |
| 10 | ;; This file is part of GNU Emacs. |
| 11 | |
| 12 | ;; GNU Emacs is free software: you can redistribute it and/or modify |
| 13 | ;; it under the terms of the GNU General Public License as published by |
| 14 | ;; the Free Software Foundation, either version 3 of the License, or |
| 15 | ;; (at your option) any later version. |
| 16 | |
| 17 | ;; GNU Emacs is distributed in the hope that it will be useful, |
| 18 | ;; but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 19 | ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 20 | ;; GNU General Public License for more details. |
| 21 | |
| 22 | ;; You should have received a copy of the GNU General Public License |
| 23 | ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. |
| 24 | |
| 25 | ;;; Commentary: |
| 26 | |
| 27 | ;; This implements spam analysis according to Paul Graham in "A Plan |
| 28 | ;; for Spam". The basis for all this is a statistical distribution of |
| 29 | ;; words for your spam and non-spam mails. We need this information |
| 30 | ;; in a hash-table so that the analysis can use the information when |
| 31 | ;; looking at your mails. Therefore, before you begin, you need tons |
| 32 | ;; of mails (Graham uses 4000 non-spam and 4000 spam mails for his |
| 33 | ;; experiments). |
| 34 | ;; |
;; The main interface to spam-stat consists of the following functions:
| 36 | ;; |
| 37 | ;; `spam-stat-buffer-is-spam' -- called in a buffer, that buffer is |
| 38 | ;; considered to be a new spam mail; use this for new mail that has |
| 39 | ;; not been processed before |
| 40 | ;; |
| 41 | ;; `spam-stat-buffer-is-non-spam' -- called in a buffer, that buffer |
| 42 | ;; is considered to be a new non-spam mail; use this for new mail that |
| 43 | ;; has not been processed before |
| 44 | ;; |
| 45 | ;; `spam-stat-buffer-change-to-spam' -- called in a buffer, that |
| 46 | ;; buffer is no longer considered to be normal mail but spam; use this |
| 47 | ;; to change the status of a mail that has already been processed as |
| 48 | ;; non-spam |
| 49 | ;; |
| 50 | ;; `spam-stat-buffer-change-to-non-spam' -- called in a buffer, that |
| 51 | ;; buffer is no longer considered to be spam but normal mail; use this |
| 52 | ;; to change the status of a mail that has already been processed as |
| 53 | ;; spam |
| 54 | ;; |
| 55 | ;; `spam-stat-save' -- save the hash table to the file; the filename |
| 56 | ;; used is stored in the variable `spam-stat-file' |
| 57 | ;; |
| 58 | ;; `spam-stat-load' -- load the hash table from a file; the filename |
| 59 | ;; used is stored in the variable `spam-stat-file' |
| 60 | ;; |
| 61 | ;; `spam-stat-score-word' -- return the spam score for a word |
| 62 | ;; |
| 63 | ;; `spam-stat-score-buffer' -- return the spam score for a buffer |
| 64 | ;; |
| 65 | ;; `spam-stat-split-fancy' -- for fancy mail splitting; add |
| 66 | ;; the rule (: spam-stat-split-fancy) to `nnmail-split-fancy' |
| 67 | ;; |
| 68 | ;; This requires the following in your ~/.gnus file: |
| 69 | ;; |
| 70 | ;; (require 'spam-stat) |
| 71 | ;; (spam-stat-load) |
| 72 | |
| 73 | ;;; Testing: |
| 74 | |
;; A typical test involves calls to the following functions:
| 76 | ;; |
| 77 | ;; Reset: (spam-stat-reset) |
| 78 | ;; Learn spam: (spam-stat-process-spam-directory "~/Mail/mail/spam") |
| 79 | ;; Learn non-spam: (spam-stat-process-non-spam-directory "~/Mail/mail/misc") |
| 80 | ;; Save table: (spam-stat-save) |
| 81 | ;; File size: (nth 7 (file-attributes spam-stat-file)) |
| 82 | ;; Number of words: (hash-table-count spam-stat) |
| 83 | ;; Test spam: (spam-stat-test-directory "~/Mail/mail/spam") |
| 84 | ;; Test non-spam: (spam-stat-test-directory "~/Mail/mail/misc") |
| 85 | ;; Reduce table size: (spam-stat-reduce-size) |
| 86 | ;; Save table: (spam-stat-save) |
| 87 | ;; File size: (nth 7 (file-attributes spam-stat-file)) |
| 88 | ;; Number of words: (hash-table-count spam-stat) |
| 89 | ;; Test spam: (spam-stat-test-directory "~/Mail/mail/spam") |
| 90 | ;; Test non-spam: (spam-stat-test-directory "~/Mail/mail/misc") |
| 91 | |
| 92 | ;;; Dictionary Creation: |
| 93 | |
| 94 | ;; Typically, you will filter away mailing lists etc. using specific |
| 95 | ;; rules in `nnmail-split-fancy'. Somewhere among these rules, you |
| 96 | ;; will filter spam. Here is how you would create your dictionary: |
| 97 | |
| 98 | ;; Reset: (spam-stat-reset) |
| 99 | ;; Learn spam: (spam-stat-process-spam-directory "~/Mail/mail/spam") |
| 100 | ;; Learn non-spam: (spam-stat-process-non-spam-directory "~/Mail/mail/misc") |
| 101 | ;; Repeat for any other non-spam group you need... |
| 102 | ;; Reduce table size: (spam-stat-reduce-size) |
| 103 | ;; Save table: (spam-stat-save) |
| 104 | |
| 105 | ;;; Todo: |
| 106 | |
| 107 | ;; Speed it up. Integrate with Gnus such that it uses spam and expiry |
| 108 | ;; marks to call the appropriate functions when leaving the summary |
| 109 | ;; buffer and saves the hash table when leaving Gnus. More testing: |
| 110 | ;; More mails, disabling SpamAssassin, double checking algorithm, find |
| 111 | ;; improved algorithm. |
| 112 | |
| 113 | ;;; Thanks: |
| 114 | |
| 115 | ;; Ted Zlatanov <tzz@lifelogs.com> |
| 116 | ;; Jesper Harder <harder@myrealbox.com> |
| 117 | ;; Dan Schmidt <dfan@dfan.org> |
| 118 | ;; Lasse Rasinen <lrasinen@iki.fi> |
| 119 | ;; Milan Zamazal <pdm@zamazal.org> |
| 120 | |
| 121 | \f |
| 122 | |
| 123 | ;;; Code: |
| 124 | (require 'mail-parse) |
| 125 | |
| 126 | (defvar gnus-original-article-buffer) |
| 127 | |
(defgroup spam-stat nil
  "Statistical spam detection for Emacs.
Build a dictionary that records how words are distributed across
spam and non-spam mails, then score a buffer against that dictionary
to decide whether it contains spam."
  :group 'gnus
  :version "22.1")
| 135 | |
(defcustom spam-stat-file "~/.spam-stat.el"
  "Name of the file the dictionary is saved to and loaded from.
The format of the file is described in `spam-stat-to-hash-table'."
  :group 'spam-stat
  :type 'file)
| 141 | |
(defcustom spam-stat-install-hooks t
  "Non-nil means spam-stat installs its hooks in Gnus when it is loaded.
This is set to nil if you use spam-stat through spam.el."
  :group 'spam-stat
  :type 'boolean)
| 147 | |
(defcustom spam-stat-unknown-word-score 0.2
  "Score assigned to words that are not in the dictionary.
Also used for words that don't appear often enough."
  :group 'spam-stat
  :type 'number)
| 153 | |
(defcustom spam-stat-max-word-length 15
  "Upper bound on the length of words taken into account.
Words with this many characters or more are ignored."
  :group 'spam-stat
  :type 'integer)
| 158 | |
(defcustom spam-stat-max-buffer-length 10240
  "Number of characters at the beginning of a buffer that are analyzed.
Text beyond that many characters is not looked at."
  :group 'spam-stat
  :type 'integer)
| 164 | |
(defcustom spam-stat-split-fancy-spam-group "mail.spam"
  "Name of the group where spam should be stored.
This is the value `spam-stat-split-fancy' returns when it is used in
fancy splitting rules.  It has no effect when spam-stat is invoked
through spam.el."
  :group 'spam-stat
  :type 'string)
| 171 | |
(defcustom spam-stat-split-fancy-spam-threshold 0.9
  "Score above which `spam-stat-split-fancy' treats a mail as spam."
  :group 'spam-stat
  :type 'number)
| 176 | |
(defcustom spam-stat-washing-hook nil
  "Hook run on each message before it is analyzed."
  :group 'spam-stat
  :type 'hook)
| 181 | |
(defcustom spam-stat-score-buffer-user-functions nil
  "List of additional scoring functions.
The functions are called one by one on the buffer.  Each function is
called with one argument, the initial spam-stat score, which might
aid in your scoring.

If all of these functions return non-nil answers, these numerical
answers are added to the computed spam-stat score of the buffer.  If
any of them returns nil, none of the answers is used.  If you define
such functions, make sure they don't leave the buffer in a narrowed
state or such: use, for example, `save-excursion'.

Also be careful when defining such functions.  If they take a long
time, they will slow down your mail splitting.  Thus, if the buffer is
large, don't forget to use smaller regions, by wrapping your work in,
say, `with-spam-stat-max-buffer-size'."
  ;; The elements are called with `apply', so they must be functions;
  ;; the `function' widget also offers completion on function names.
  :type '(repeat function)
  :group 'spam-stat)
| 199 | |
(defcustom spam-stat-process-directory-age 90
  "Maximum age, in days, of the files processed in a directory.
`spam-stat-process-spam-directory' and
`spam-stat-process-non-spam-directory' only consider files that
have been touched within this many days.  Without this filter,
re-training spam-stat with several thousand messages will start
to take a very long time."
  :group 'spam-stat
  :type 'number)
| 209 | |
(defvar spam-stat-last-saved-at nil
  "Modification time of `spam-stat-file' at the last save or load.
The value is nil until the file has been saved or loaded in this
session.")
| 212 | |
(defvar spam-stat-syntax-table
  (let ((table (copy-syntax-table text-mode-syntax-table)))
    ;; Treat some punctuation as part of a word.
    (dolist (char '(?- ?_ ?. ?! ?? ?+))
      (modify-syntax-entry char "w" table))
    table)
  "Syntax table in effect while mails are split into words.
It is a copy of `text-mode-syntax-table' in which the characters
-, _, ., !, ? and + are word constituents as well.")
| 224 | |
(defvar spam-stat-dirty nil
  "Non-nil means the spam-stat database has changes that need saving.")

(defvar spam-stat-buffer nil
  "Buffer holding the mail to score while splitting.
It is set by `spam-stat-store-current-buffer', which is hooked into
Gnus.")

(defvar spam-stat-buffer-name " *spam stat buffer*"
  "Name of the buffer kept in the variable `spam-stat-buffer'.")
| 234 | |
(defvar spam-stat-coding-system
  ;; Prefer emacs-mule where this Emacs knows it, else use raw-text.
  (cond ((mm-coding-system-p 'emacs-mule) 'emacs-mule)
        (t 'raw-text))
  "Coding system used for reading and writing `spam-stat-file'.")
| 238 | |
| 239 | ;; Hooking into Gnus |
| 240 | |
(defun spam-stat-store-current-buffer ()
  "Copy the contents of the current buffer for later scoring.
The copy goes into the buffer named by `spam-stat-buffer-name', which
is created if necessary and emptied first.  That buffer is recorded in
the variable `spam-stat-buffer' and returned."
  (let ((source (current-buffer))
        (copy (get-buffer-create spam-stat-buffer-name)))
    (with-current-buffer copy
      (erase-buffer)
      (insert-buffer-substring source)
      (setq spam-stat-buffer copy))))
| 248 | |
(defun spam-stat-store-gnus-article-buffer ()
  "Store a copy of the current article in `spam-stat-buffer'.
The article text is taken from `gnus-original-article-buffer' and
copied by `spam-stat-store-current-buffer'."
  (with-current-buffer gnus-original-article-buffer
    (spam-stat-store-current-buffer)))
| 254 | |
| 255 | ;; Data -- not using defstruct in order to save space and time |
| 256 | |
(defvar spam-stat (make-hash-table :test 'equal)
  "Hash table used to store the statistics.
Use `spam-stat-load' to load the file.
Every word is used as a key in this table.  The value is a vector of
the form [GOOD BAD SCORE].  Use `spam-stat-good', `spam-stat-bad' and
`spam-stat-score' to read the elements of this vector.  The total
numbers of good and bad mails are kept separately in the variables
`spam-stat-ngood' and `spam-stat-nbad'.")
| 263 | |
(defvar spam-stat-ngood 0
  "Count of the good (non-spam) mails recorded in the dictionary.")

(defvar spam-stat-nbad 0
  "Count of the bad (spam) mails recorded in the dictionary.")

(defvar spam-stat-error-holder nil
  "Variable bound by `condition-case' to the error caught while scoring.")
| 272 | |
(defsubst spam-stat-good (entry)
  "Return the good-mail occurrence count stored in ENTRY.
This is element 0 of the vector ENTRY."
  (aref entry 0))
| 276 | |
(defsubst spam-stat-bad (entry)
  "Return the bad-mail occurrence count stored in ENTRY.
This is element 1 of the vector ENTRY."
  (aref entry 1))
| 280 | |
(defsubst spam-stat-score (entry)
  "Return the spam score stored in ENTRY.
This is element 2 of the vector ENTRY.  If ENTRY is nil, return
`spam-stat-unknown-word-score' instead."
  (if entry
      (aref entry 2)
    spam-stat-unknown-word-score))
| 286 | |
(defsubst spam-stat-set-good (entry value)
  "Store VALUE as the good-mail occurrence count of ENTRY.
ENTRY is modified in place; VALUE is returned."
  (aset entry 0 value))
| 290 | |
(defsubst spam-stat-set-bad (entry value)
  "Store VALUE as the bad-mail occurrence count of ENTRY.
ENTRY is modified in place; VALUE is returned."
  (aset entry 1 value))
| 294 | |
(defsubst spam-stat-set-score (entry value)
  "Store VALUE as the spam score of ENTRY.
ENTRY is modified in place; VALUE is returned."
  (aset entry 2 value))
| 298 | |
(defsubst spam-stat-make-entry (good bad)
  "Return a new entry vector for a word seen GOOD and BAD times.
The third element of the vector is the score computed by
`spam-stat-compute-score' for these counts."
  (let ((new-entry (vector good bad nil)))
    ;; The score is derived from the counts already in the vector.
    (aset new-entry 2 (spam-stat-compute-score new-entry))
    new-entry))
| 304 | |
| 305 | ;; Computing |
| 306 | |
(defun spam-stat-compute-score (entry)
  "Compute and return the spam score of the word described by ENTRY.
A higher score means the word is more typical of spam; 1.0 means spam.

Occurrences in good mails are counted twice.  If the weighted number
of occurrences is below 5, the word does not appear often enough and
`spam-stat-unknown-word-score' is returned.  If `spam-stat-ngood' is
zero the result is .99, and if `spam-stat-nbad' is zero it is .01.
Otherwise the result is the relative frequency of the word in bad
mails divided by the sum of its relative frequencies in good and bad
mails, limited to the range from .01 to .99."
  ;; Promote all numbers to floats for the divisions.
  (let* ((g (* 2.0 (spam-stat-good entry)))
         (b (float (spam-stat-bad entry))))
    (cond ((< (+ g b) 5)
           ;; Honor the user option instead of a hard-coded .2, as the
           ;; documentation of that option promises.
           spam-stat-unknown-word-score)
          ((= 0 spam-stat-ngood)
           .99)
          ((= 0 spam-stat-nbad)
           .01)
          (t
           (let ((good-ratio (/ g spam-stat-ngood))
                 (bad-ratio (/ b spam-stat-nbad)))
             (max .01
                  (min .99 (/ bad-ratio (+ good-ratio bad-ratio)))))))))
| 323 | |
| 324 | ;; Parsing |
| 325 | |
(defmacro with-spam-stat-max-buffer-size (&rest body)
  "Evaluate BODY with only the beginning of the buffer accessible.
If the accessible portion of the buffer is longer than
`spam-stat-max-buffer-length' characters, narrow to its first
`spam-stat-max-buffer-length' characters before evaluating BODY.
The previous restriction is restored afterwards.  Return the value
of the last form in BODY."
  (declare (indent 0) (debug t))
  `(save-restriction
     (when (> (- (point-max)
                 (point-min))
              spam-stat-max-buffer-length)
       (narrow-to-region (point-min)
                         (+ (point-min) spam-stat-max-buffer-length)))
     ,@body))
| 335 | |
(defun spam-stat-buffer-words ()
  "Return a hash table mapping each word of the buffer to its count.
`spam-stat-washing-hook' is run first.  Only the part of the buffer
selected by `with-spam-stat-max-buffer-size' is scanned, words are
delimited according to `spam-stat-syntax-table', and words that are
not shorter than `spam-stat-max-word-length' are left out."
  (run-hooks 'spam-stat-washing-hook)
  (with-spam-stat-max-buffer-size
   (with-syntax-table spam-stat-syntax-table
     (let ((counts (make-hash-table :test 'equal)))
       (goto-char (point-min))
       (while (re-search-forward "\\w+" nil t)
         (let ((found (match-string-no-properties 0)))
           (when (< (length found) spam-stat-max-word-length)
             (puthash found (1+ (gethash found counts 0)) counts))))
       counts))))
| 350 | |
(defun spam-stat-buffer-is-spam ()
  "Learn the current buffer as a new spam mail.
Increment `spam-stat-nbad', add the word counts of the buffer to the
bad counts in `spam-stat', recompute the scores of these words, and
mark the database as dirty."
  (setq spam-stat-nbad (1+ spam-stat-nbad))
  (let ((buffer-words (spam-stat-buffer-words)))
    (maphash
     (lambda (word occurrences)
       (let ((record (gethash word spam-stat)))
         (cond (record
                (spam-stat-set-bad record
                                   (+ occurrences (spam-stat-bad record))))
               (t
                (setq record (spam-stat-make-entry 0 occurrences))))
         (spam-stat-set-score record (spam-stat-compute-score record))
         (puthash word record spam-stat)))
     buffer-words))
  (setq spam-stat-dirty t))
| 364 | |
(defun spam-stat-buffer-is-non-spam ()
  "Learn the current buffer as a new non-spam mail.
Increment `spam-stat-ngood', add the word counts of the buffer to the
good counts in `spam-stat', recompute the scores of these words, and
mark the database as dirty."
  (setq spam-stat-ngood (1+ spam-stat-ngood))
  (let ((buffer-words (spam-stat-buffer-words)))
    (maphash
     (lambda (word occurrences)
       (let ((record (gethash word spam-stat)))
         (cond (record
                (spam-stat-set-good record
                                    (+ occurrences (spam-stat-good record))))
               (t
                (setq record (spam-stat-make-entry occurrences 0))))
         (spam-stat-set-score record (spam-stat-compute-score record))
         (puthash word record spam-stat)))
     buffer-words))
  (setq spam-stat-dirty t))
| 378 | |
| 379 | (autoload 'gnus-message "gnus-util") |
| 380 | |
(defun spam-stat-buffer-change-to-spam ()
  "Consider current buffer no longer normal mail but spam.
Move one mail from `spam-stat-ngood' to `spam-stat-nbad' and, for
every word of the buffer that is in `spam-stat', move its occurrences
from the good count to the bad count and recompute its score.  Words
that are not in the dictionary are reported with `gnus-message' and
otherwise skipped.  Counts never drop below zero, so using this on a
mail that was not learned as non-spam before cannot produce negative
statistics.  The database is marked as dirty."
  (setq spam-stat-nbad (1+ spam-stat-nbad)
        spam-stat-ngood (max 0 (1- spam-stat-ngood)))
  (maphash
   (lambda (word count)
     (let ((entry (gethash word spam-stat)))
       (if (not entry)
           (gnus-message 8 "This buffer has unknown words in it")
         ;; Clamp at zero: the word may have been seen less often in
         ;; good mails than it occurs in this buffer.
         (spam-stat-set-good entry (max 0 (- (spam-stat-good entry) count)))
         (spam-stat-set-bad entry (+ (spam-stat-bad entry) count))
         (spam-stat-set-score entry (spam-stat-compute-score entry))
         (puthash word entry spam-stat))))
   (spam-stat-buffer-words))
  (setq spam-stat-dirty t))
| 396 | |
(defun spam-stat-buffer-change-to-non-spam ()
  "Consider current buffer no longer spam but normal mail.
Move one mail from `spam-stat-nbad' to `spam-stat-ngood' and, for
every word of the buffer that is in `spam-stat', move its occurrences
from the bad count to the good count and recompute its score.  Words
that are not in the dictionary are reported with `gnus-message' and
otherwise skipped.  Counts never drop below zero, so using this on a
mail that was not learned as spam before cannot produce negative
statistics.  The database is marked as dirty."
  (setq spam-stat-nbad (max 0 (1- spam-stat-nbad))
        spam-stat-ngood (1+ spam-stat-ngood))
  (maphash
   (lambda (word count)
     (let ((entry (gethash word spam-stat)))
       (if (not entry)
           (gnus-message 8 "This buffer has unknown words in it")
         (spam-stat-set-good entry (+ (spam-stat-good entry) count))
         ;; Clamp at zero: the word may have been seen less often in
         ;; bad mails than it occurs in this buffer.
         (spam-stat-set-bad entry (max 0 (- (spam-stat-bad entry) count)))
         (spam-stat-set-score entry (spam-stat-compute-score entry))
         (puthash word entry spam-stat))))
   (spam-stat-buffer-words))
  (setq spam-stat-dirty t))
| 412 | |
| 413 | ;; Saving and Loading |
| 414 | |
(defun spam-stat-save (&optional force)
  "Save the `spam-stat' hash table as a Lisp file in `spam-stat-file'.
Nothing is written unless `spam-stat-dirty' is non-nil or FORCE is
non-nil; interactively, FORCE is the prefix argument.  The file is
written using `spam-stat-coding-system'.  After saving,
`spam-stat-dirty' is reset to nil and `spam-stat-last-saved-at'
records the modification time of the file."
  (interactive "P")
  (when (or force spam-stat-dirty)
    (let ((coding-system-for-write spam-stat-coding-system))
      (with-temp-file spam-stat-file
        (let ((standard-output (current-buffer))
              (font-lock-maximum-size 0)
              ;; A user setting of these variables would make `prin1'
              ;; abbreviate the entries and corrupt the saved file.
              (print-length nil)
              (print-level nil))
          (insert (format ";-*- coding: %s; -*-\n" spam-stat-coding-system))
          (insert (format "(setq spam-stat-ngood %d spam-stat-nbad %d
spam-stat (spam-stat-to-hash-table '(" spam-stat-ngood spam-stat-nbad))
          ;; Only the counts are saved; scores are recomputed on load.
          (maphash (lambda (word entry)
                     (prin1 (list word
                                  (spam-stat-good entry)
                                  (spam-stat-bad entry))))
                   spam-stat)
          (insert ")))"))))
    (message "Saved %s." spam-stat-file)
    (setq spam-stat-dirty nil
          spam-stat-last-saved-at (nth 5 (file-attributes spam-stat-file)))))
| 436 | |
(defun spam-stat-load ()
  "Read the `spam-stat' hash table from `spam-stat-file'.
The file is not loaded, and a message is shown instead, when
`spam-stat-dirty' is non-nil or when the modification time of the
file equals `spam-stat-last-saved-at'.  After loading,
`spam-stat-dirty' is nil and `spam-stat-last-saved-at' holds the
modification time of the file."
  (let ((coding-system-for-read spam-stat-coding-system))
    (cond
     ;; Unsaved changes in memory would be lost by loading.
     (spam-stat-dirty
      (message "Spam stat not loaded: spam-stat-dirty t"))
     ;; The file is the one we saved or loaded last.
     ((and (boundp 'spam-stat-last-saved-at)
           spam-stat-last-saved-at
           (equal spam-stat-last-saved-at
                  (nth 5 (file-attributes spam-stat-file))))
      (message "Spam stat file not loaded: no change in disk."))
     (t
      (load-file spam-stat-file)
      (setq spam-stat-dirty nil
            spam-stat-last-saved-at
            (nth 5 (file-attributes spam-stat-file)))))))
| 452 | |
(defun spam-stat-to-hash-table (entries)
  "Return a hash table built from the list ENTRIES.
Every element in ENTRIES has the form (WORD GOOD BAD), where WORD is
the word string, GOOD is the number of times it appeared in good
mails, and BAD is the number of times it appeared in bad mails.
The score of every word is computed from the current values of
`spam-stat-ngood' and `spam-stat-nbad'.  The table is returned, not
stored; the file written by `spam-stat-save' assigns it to
`spam-stat'."
  (let ((table (make-hash-table :size (length entries)
                                :test 'equal)))
    (dolist (item entries)
      (puthash (car item)
               (spam-stat-make-entry (nth 1 item) (nth 2 item))
               table))
    table))
| 468 | |
(defun spam-stat-reset ()
  "Discard all statistics.
`spam-stat' becomes an empty hash table, `spam-stat-ngood' and
`spam-stat-nbad' become 0, and the database is marked as dirty."
  (interactive)
  (setq spam-stat-nbad 0
        spam-stat-ngood 0
        spam-stat (make-hash-table :test 'equal)
        spam-stat-dirty t))
| 477 | |
| 478 | ;; Scoring buffers |
| 479 | |
(defvar spam-stat-score-data nil
  "Word data from the most recent call to `spam-stat-score-buffer'.
The value is the list returned by
`spam-stat-buffer-words-with-scores'.")
| 482 | |
(defsubst spam-stat-score-word (word)
  "Return the spam score of WORD as recorded in `spam-stat'.
Words that are not in the table get the score
`spam-stat-unknown-word-score'."
  (let ((known (gethash word spam-stat)))
    (spam-stat-score known)))
| 488 | |
(defun spam-stat-buffer-words-with-scores ()
  "Process current buffer, return up to 15 of the most conspicuous words.
These are the words whose spam-stat score differs the most from 0.5.
The list returned contains elements of the form (WORD SCORE DIFF),
where DIFF is the absolute difference between SCORE and 0.5, sorted
by decreasing DIFF.  If the buffer has fewer than 15 different
words, all of them are returned; the list is empty if there are
none."
  (let (result)
    (maphash (lambda (word _count)
               (let ((score (spam-stat-score-word word)))
                 (push (list word score (abs (- score 0.5))) result)))
             (spam-stat-buffer-words))
    (setq result (sort result (lambda (a b) (> (nth 2 a) (nth 2 b)))))
    ;; Cut the list after the 15th element.  With fewer elements there
    ;; is no 15th cons, and calling `setcdr' on nil would signal an
    ;; error.
    (let ((last-kept (nthcdr 14 result)))
      (when last-kept
        (setcdr last-kept nil)))
    result))
| 503 | |
(defun spam-stat-score-buffer ()
  "Return a score describing the spam probability of the current buffer.
The word data used is stored in `spam-stat-score-data'.  The score
is the product of the word scores divided by the sum of that
product and the product of their complements.  If
`spam-stat-score-buffer-user' returns a non-nil value for that score,
it is added to the result; errors signaled there are ignored.  When
called interactively, the result is also shown in the echo area."
  (interactive)                         ; helps in debugging.
  (setq spam-stat-score-data (spam-stat-buffer-words-with-scores))
  (let* ((probs (mapcar (lambda (item) (nth 1 item)) spam-stat-score-data))
         (spam-product (apply #'* probs))
         (ham-product (apply #'* (mapcar (lambda (p) (- 1 p)) probs)))
         (base-score (/ spam-product (+ spam-product ham-product)))
         (user-score (condition-case spam-stat-error-holder
                         (spam-stat-score-buffer-user base-score)
                       (error nil)))
         (final-score (if user-score
                          (+ base-score user-score)
                        base-score)))
    (when (interactive-p)
      (message "%S" final-score))
    final-score))
| 524 | |
(defun spam-stat-score-buffer-user (&rest args)
  "Call the user scoring functions with ARGS and return the sum of their values.
The functions are taken from `spam-stat-score-buffer-user-functions'.
Return nil if any of them returned nil."
  (let ((user-scores
         (mapcar (lambda (scorer) (apply scorer args))
                 spam-stat-score-buffer-user-functions)))
    (unless (memq nil user-scores)
      (apply #'+ user-scores))))
| 533 | |
(defun spam-stat-split-fancy ()
  "Return `spam-stat-split-fancy-spam-group' if the current mail is spam.
The mail in `spam-stat-buffer' is scored, and it counts as spam if
its score exceeds `spam-stat-split-fancy-spam-threshold'.  Otherwise,
or if an error occurs, return nil.  Use this function in
`nnmail-split-fancy'.  The raw data of the scoring is available in
`spam-stat-score-data'; for spam it is also pushed onto
`nnmail-split-trace' when that variable is bound."
  (condition-case spam-stat-error-holder
      (progn
        (set-buffer spam-stat-buffer)
        (goto-char (point-min))
        (and (> (spam-stat-score-buffer) spam-stat-split-fancy-spam-threshold)
             (progn
               (when (boundp 'nnmail-split-trace)
                 (dolist (entry spam-stat-score-data)
                   (push entry nnmail-split-trace)))
               spam-stat-split-fancy-spam-group)))
    (error (message "Error in spam-stat-split-fancy: %S" spam-stat-error-holder)
           nil)))
| 551 | |
| 552 | ;; Testing |
| 553 | |
(defun spam-stat-strip-xref ()
  "Delete the first Xref header line found in the head of the message."
  (save-restriction
    (mail-narrow-to-head)
    (if (re-search-forward "^Xref:.*\n" nil t)
        (let ((start (match-beginning 0))
              (end (match-end 0)))
          (delete-region start end)))))
| 560 | |
| 561 | (autoload 'time-to-number-of-days "time-date") |
| 562 | |
(defun spam-stat-process-directory (dir func)
  "Call FUNC on the contents of the regular files in directory DIR.
Files whose names start with a dot are skipped, as are unreadable
and empty files and files that were modified
`spam-stat-process-directory-age' or more days ago.  Each remaining
file is inserted literally into a temporary buffer, its Xref header
is stripped, and FUNC is called without arguments in that buffer."
  (let* ((candidates (directory-files dir t "^[^.]"))
         (one-percent (/ (length candidates) 100.0))
         (processed 0))
    (with-temp-buffer
      (dolist (file candidates)
        (when (and (file-readable-p file)
                   (file-regular-p file)
                   (> (nth 7 (file-attributes file)) 0)
                   (< (time-to-number-of-days
                       (time-since (nth 5 (file-attributes file))))
                      spam-stat-process-directory-age))
          (setq processed (1+ processed))
          (message "Reading %s: %.2f%%" dir (/ processed one-percent))
          (insert-file-contents-literally file)
          (spam-stat-strip-xref)
          (funcall func)
          (erase-buffer))))))
| 581 | |
(defun spam-stat-process-spam-directory (dir)
  "Learn the regular files in directory DIR as spam.
The files are selected by `spam-stat-process-directory' and learned
with `spam-stat-buffer-is-spam'."
  (interactive "D")
  (spam-stat-process-directory dir #'spam-stat-buffer-is-spam))
| 586 | |
(defun spam-stat-process-non-spam-directory (dir)
  "Learn the regular files in directory DIR as non-spam.
The files are selected by `spam-stat-process-directory' and learned
with `spam-stat-buffer-is-non-spam'."
  (interactive "D")
  (spam-stat-process-directory dir #'spam-stat-buffer-is-non-spam))
| 591 | |
(defun spam-stat-count ()
  "Return the number of words stored in the hash table `spam-stat'."
  (interactive)
  (let ((dictionary spam-stat))
    (hash-table-count dictionary)))
| 596 | |
(defun spam-stat-test-directory (dir &optional verbose)
  "Test all the regular files in directory DIR for spam.
If the result is 1.0, then all files are considered spam.
If the result is 0.0, none of the files is considered spam.
You can use this to determine error rates.

Only readable, non-empty regular files whose names do not start
with a dot are scored, and the result is the share of spam among
the files that were scored.  A file counts as spam if
`spam-stat-score-buffer' returns more than 0.9 for it.

If VERBOSE is non-nil display names of files detected as spam or
non-spam in a temporary buffer.  If it is the symbol `ham',
display non-spam files; otherwise display spam files."
  (interactive "DDirectory: ")
  (let* ((files (directory-files dir t "^[^.]"))
         display-files
         buffer-score
         (total (length files))
         (score 0.0)                    ; float
         (max (/ total 100.0))          ; float
         (count 0))
    (with-temp-buffer
      (dolist (f files)
        (when (and (file-readable-p f)
                   (file-regular-p f)
                   (> (nth 7 (file-attributes f)) 0))
          (setq count (1+ count))
          (message "Reading %.2f%%, score %.2f"
                   (/ count max) (/ score count))
          (insert-file-contents-literally f)
          (setq buffer-score (spam-stat-score-buffer))
          (let ((spam-p (> buffer-score 0.9)))
            (when spam-p
              (setq score (1+ score)))
            ;; List spam files, or ham files if VERBOSE is `ham'.
            (when (and verbose
                       (eq spam-p (not (eq verbose 'ham))))
              (push f display-files)))
          (erase-buffer))))
    (when display-files
      (with-output-to-temp-buffer "*spam-stat results*"
        (dolist (file display-files)
          (princ file)
          (terpri))))
    ;; Divide by the number of files actually scored, not by the number
    ;; of directory entries: skipped entries must not lower the ratio.
    (message "Final score: %d / %d = %f"
             score count
             (if (zerop count) 0.0 (/ score count)))))
| 637 | |
| 638 | ;; Shrinking the dictionary |
| 639 | |
(defun spam-stat-reduce-size (&optional count)
  "Reduce the size of `spam-stat'.
Remove every word whose good and bad occurrences add up to less than
COUNT from the dictionary, and mark the database as dirty.
COUNT defaults to 5."
  (interactive)
  (let ((minimum (or count 5)))
    (maphash (lambda (word entry)
               (let ((occurrences (+ (spam-stat-good entry)
                                     (spam-stat-bad entry))))
                 (if (< occurrences minimum)
                     (remhash word spam-stat))))
             spam-stat))
  (setq spam-stat-dirty t))
| 653 | |
(defun spam-stat-install-hooks-function ()
  "Add the spam-stat functions to the Gnus hooks.
`spam-stat-store-current-buffer' is added to
`nnmail-prepare-incoming-message-hook' and
`spam-stat-store-gnus-article-buffer' is added to
`gnus-select-article-hook'."
  (interactive)
  (add-hook 'nnmail-prepare-incoming-message-hook
            #'spam-stat-store-current-buffer)
  (add-hook 'gnus-select-article-hook
            #'spam-stat-store-gnus-article-buffer))

;; Install the hooks at load time unless the user opted out.
(if spam-stat-install-hooks
    (spam-stat-install-hooks-function))
| 664 | |
(defun spam-stat-unload-hook ()
  "Remove the spam-stat functions from the Gnus hooks.
This undoes what `spam-stat-install-hooks-function' does."
  (interactive)
  (remove-hook 'nnmail-prepare-incoming-message-hook
               #'spam-stat-store-current-buffer)
  (remove-hook 'gnus-select-article-hook
               #'spam-stat-store-gnus-article-buffer))

;; Put the function on the hook variable of the same name.
(add-hook 'spam-stat-unload-hook #'spam-stat-unload-hook)
| 674 | |
| 675 | (provide 'spam-stat) |
| 676 | |
| 677 | ;;; spam-stat.el ends here |