Mercurial > emacs
comparison lisp/gnus/spam-stat.el @ 56927:55fd4f77387a after-merge-gnus-5_10
Revision: miles@gnu.org--gnu-2004/emacs--cvs-trunk--0--patch-523
Merge from emacs--gnus--5.10, gnus--rel--5.10
Patches applied:
* miles@gnu.org--gnu-2004/emacs--gnus--5.10--base-0
tag of miles@gnu.org--gnu-2004/emacs--cvs-trunk--0--patch-464
* miles@gnu.org--gnu-2004/emacs--gnus--5.10--patch-1
Import from CVS branch gnus-5_10-branch
* miles@gnu.org--gnu-2004/emacs--gnus--5.10--patch-2
Merge from lorentey@elte.hu--2004/emacs--multi-tty--0, emacs--cvs-trunk--0
* miles@gnu.org--gnu-2004/emacs--gnus--5.10--patch-3
Merge from gnus--rel--5.10
* miles@gnu.org--gnu-2004/emacs--gnus--5.10--patch-4
Merge from gnus--rel--5.10
* miles@gnu.org--gnu-2004/gnus--rel--5.10--patch-18
Update from CVS
* miles@gnu.org--gnu-2004/gnus--rel--5.10--patch-19
Remove autoconf-generated files from archive
* miles@gnu.org--gnu-2004/gnus--rel--5.10--patch-20
Update from CVS
author | Miles Bader <miles@gnu.org> |
---|---|
date | Sat, 04 Sep 2004 13:13:48 +0000 |
parents | |
children | a19498f00f9a |
comparison
equal
deleted
inserted
replaced
56926:f8e248e9a717 | 56927:55fd4f77387a |
---|---|
1 ;;; spam-stat.el --- detecting spam based on statistics | |
2 | |
3 ;; Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc. | |
4 | |
5 ;; Author: Alex Schroeder <alex@gnu.org> | |
6 ;; Keywords: network | |
7 ;; URL: http://www.emacswiki.org/cgi-bin/wiki.pl?SpamStat | |
8 | |
9 ;; This file is part of GNU Emacs. | |
10 | |
11 ;; This is free software; you can redistribute it and/or modify it | |
12 ;; under the terms of the GNU General Public License as published by | |
13 ;; the Free Software Foundation; either version 2, or (at your option) | |
14 ;; any later version. | |
15 | |
16 ;; This is distributed in the hope that it will be useful, but WITHOUT | |
17 ;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |
18 ;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public | |
19 ;; License for more details. | |
20 | |
21 ;; You should have received a copy of the GNU General Public License | |
22 ;; along with GNU Emacs; see the file COPYING. If not, write to the | |
23 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
24 ;; Boston, MA 02111-1307, USA. | |
25 | |
26 ;;; Commentary: | |
27 | |
28 ;; This implements spam analysis according to Paul Graham in "A Plan | |
29 ;; for Spam". The basis for all this is a statistical distribution of | |
30 ;; words for your spam and non-spam mails. We need this information | |
31 ;; in a hash-table so that the analysis can use the information when | |
32 ;; looking at your mails. Therefore, before you begin, you need tons | |
33 ;; of mails (Graham uses 4000 non-spam and 4000 spam mails for his | |
34 ;; experiments). | |
35 ;; | |
36 ;; The main interface to spam-stat consists of the following functions: | |
37 ;; | |
38 ;; `spam-stat-buffer-is-spam' -- called in a buffer, that buffer is | |
39 ;; considered to be a new spam mail; use this for new mail that has | |
40 ;; not been processed before | |
41 ;; | |
42 ;; `spam-stat-buffer-is-non-spam' -- called in a buffer, that buffer | |
43 ;; is considered to be a new non-spam mail; use this for new mail that | |
44 ;; has not been processed before | |
45 ;; | |
46 ;; `spam-stat-buffer-change-to-spam' -- called in a buffer, that | |
47 ;; buffer is no longer considered to be normal mail but spam; use this | |
48 ;; to change the status of a mail that has already been processed as | |
49 ;; non-spam | |
50 ;; | |
51 ;; `spam-stat-buffer-change-to-non-spam' -- called in a buffer, that | |
52 ;; buffer is no longer considered to be spam but normal mail; use this | |
53 ;; to change the status of a mail that has already been processed as | |
54 ;; spam | |
55 ;; | |
56 ;; `spam-stat-save' -- save the hash table to the file; the filename | |
57 ;; used is stored in the variable `spam-stat-file' | |
58 ;; | |
59 ;; `spam-stat-load' -- load the hash table from a file; the filename | |
60 ;; used is stored in the variable `spam-stat-file' | |
61 ;; | |
62 ;; `spam-stat-score-word' -- return the spam score for a word | |
63 ;; | |
64 ;; `spam-stat-score-buffer' -- return the spam score for a buffer | |
65 ;; | |
66 ;; `spam-stat-split-fancy' -- for fancy mail splitting; add | |
67 ;; the rule (: spam-stat-split-fancy) to `nnmail-split-fancy' | |
68 ;; | |
69 ;; This requires the following in your ~/.gnus file: | |
70 ;; | |
71 ;; (require 'spam-stat) | |
72 ;; (spam-stat-load) | |
73 | |
74 ;;; Testing: | |
75 | |
76 ;; Typical test will involve calls to the following functions: | |
77 ;; | |
78 ;; Reset: (spam-stat-reset) | |
79 ;; Learn spam: (spam-stat-process-spam-directory "~/Mail/mail/spam") | |
80 ;; Learn non-spam: (spam-stat-process-non-spam-directory "~/Mail/mail/misc") | |
81 ;; Save table: (spam-stat-save) | |
82 ;; File size: (nth 7 (file-attributes spam-stat-file)) | |
83 ;; Number of words: (hash-table-count spam-stat) | |
84 ;; Test spam: (spam-stat-test-directory "~/Mail/mail/spam") | |
85 ;; Test non-spam: (spam-stat-test-directory "~/Mail/mail/misc") | |
86 ;; Reduce table size: (spam-stat-reduce-size) | |
87 ;; Save table: (spam-stat-save) | |
88 ;; File size: (nth 7 (file-attributes spam-stat-file)) | |
89 ;; Number of words: (hash-table-count spam-stat) | |
90 ;; Test spam: (spam-stat-test-directory "~/Mail/mail/spam") | |
91 ;; Test non-spam: (spam-stat-test-directory "~/Mail/mail/misc") | |
92 | |
93 ;;; Dictionary Creation: | |
94 | |
95 ;; Typically, you will filter away mailing lists etc. using specific | |
96 ;; rules in `nnmail-split-fancy'. Somewhere among these rules, you | |
97 ;; will filter spam. Here is how you would create your dictionary: | |
98 | |
99 ;; Reset: (spam-stat-reset) | |
100 ;; Learn spam: (spam-stat-process-spam-directory "~/Mail/mail/spam") | |
101 ;; Learn non-spam: (spam-stat-process-non-spam-directory "~/Mail/mail/misc") | |
102 ;; Repeat for any other non-spam group you need... | |
103 ;; Reduce table size: (spam-stat-reduce-size) | |
104 ;; Save table: (spam-stat-save) | |
105 | |
106 ;;; Todo: | |
107 | |
108 ;; Speed it up. Integrate with Gnus such that it uses spam and expiry | |
109 ;; marks to call the appropriate functions when leaving the summary | |
110 ;; buffer and saves the hash table when leaving Gnus. More testing: | |
111 ;; More mails, disabling SpamAssassin, double checking algorithm, find | |
112 ;; improved algorithm. | |
113 | |
114 ;;; Thanks: | |
115 | |
116 ;; Ted Zlatanov <tzz@lifelogs.com> | |
117 ;; Jesper Harder <harder@myrealbox.com> | |
118 ;; Dan Schmidt <dfan@dfan.org> | |
119 ;; Lasse Rasinen <lrasinen@iki.fi> | |
120 ;; Milan Zamazal <pdm@zamazal.org> | |
121 | |
122 | |
123 | |
124 ;;; Code: | |
125 | |
(defgroup spam-stat nil
  "Statistical spam detection for Emacs.
Use the functions to build a dictionary of words and their statistical
distribution in spam and non-spam mails.  Then use a function to determine
whether a buffer contains spam or not."
  :group 'gnus)

(defcustom spam-stat-file "~/.spam-stat.el"
  "File used to save and load the dictionary.
See `spam-stat-to-hash-table' for the format of the file."
  :type 'file
  :group 'spam-stat)

(defcustom spam-stat-install-hooks t
  "Whether spam-stat should install its hooks in Gnus.
This is set to nil if you use spam-stat through spam.el."
  :type 'boolean
  :group 'spam-stat)

(defcustom spam-stat-unknown-word-score 0.2
  "The score to use for unknown words.
Also used for words that don't appear often enough."
  :type 'number
  :group 'spam-stat)

(defcustom spam-stat-max-word-length 15
  "Only words shorter than this will be considered.
Words of this length or longer are not counted."
  :type 'integer
  :group 'spam-stat)

(defcustom spam-stat-max-buffer-length 10240
  "Only the beginning of buffers will be analyzed.
This variable says how many characters this will be."
  :type 'integer
  :group 'spam-stat)

(defcustom spam-stat-split-fancy-spam-group "mail.spam"
  "Name of the group where spam should be stored.
This is the value returned by `spam-stat-split-fancy' when that function
is used in fancy splitting rules.  Has no effect when spam-stat is
invoked through spam.el."
  :type 'string
  :group 'spam-stat)

(defcustom spam-stat-split-fancy-spam-threshhold 0.9
  "Spam score threshold used by `spam-stat-split-fancy'.
A mail is treated as spam when its score is greater than this value."
  :type 'number
  :group 'spam-stat)
173 | |
(defvar spam-stat-syntax-table
  (let ((syntax (copy-syntax-table text-mode-syntax-table))
        (word-chars '(?- ?_ ?. ?! ?? ?+)))
    ;; Walk the list by hand so that nothing beyond the basic list
    ;; primitives is needed while this file is being loaded.
    (while word-chars
      (modify-syntax-entry (car word-chars) "w" syntax)
      (setq word-chars (cdr word-chars)))
    syntax)
  "Syntax table used when processing mails for statistical analysis.
It is a copy of `text-mode-syntax-table' in which the characters
-, _, ., !, ? and + are word constituents.")
185 | |
(defvar spam-stat-dirty nil
  "Whether the spam-stat database needs saving.
Non-nil means the dictionary has been changed since it was last saved
or loaded.")

(defvar spam-stat-buffer nil
  "Buffer to use for scoring while splitting.
This is set by hooking into Gnus; see `spam-stat-store-current-buffer'.")

(defvar spam-stat-buffer-name " *spam stat buffer*"
  "Name of the `spam-stat-buffer'.")
195 | |
196 ;; Functions missing in Emacs 20 | |
197 | |
;; Load `cl' only if at least one of the functions listed below is
;; undefined in the running Emacs.
(when (memq nil (mapcar 'fboundp
                        '(gethash hash-table-count make-hash-table
                                  mapc puthash)))
  (require 'cl)
  (unless (fboundp 'puthash)
    ;; alias puthash is missing from Emacs 20 cl-extra.el
    (defalias 'puthash 'cl-puthash)))
205 | |
;; Fallback definition of `with-syntax-table' for Emacs versions that do
;; not provide it.  `eval-when-compile' evaluates the body while this
;; file is being compiled (or when it is loaded uncompiled), but not when
;; the compiled file is loaded.
(eval-when-compile
  (unless (fboundp 'with-syntax-table)
    ;; Imported from Emacs 21.2
    (defmacro with-syntax-table (table &rest body) "\
Evaluate BODY with syntax table of current buffer set to a copy of TABLE.
The syntax table of the current buffer is saved, BODY is evaluated, and the
saved table is restored, even in case of an abnormal exit.
Value is what BODY returns."
      ;; Uninterned symbols keep the saved table and buffer from clashing
      ;; with variables used in BODY.
      (let ((old-table (make-symbol "table"))
            (old-buffer (make-symbol "buffer")))
        `(let ((,old-table (syntax-table))
               (,old-buffer (current-buffer)))
           (unwind-protect
               (progn
                 (set-syntax-table (copy-syntax-table ,table))
                 ,@body)
             ;; BODY may have switched buffers; restore the table in the
             ;; buffer it was taken from.
             (save-current-buffer
               (set-buffer ,old-buffer)
               (set-syntax-table ,old-table))))))))
225 | |
226 ;; Hooking into Gnus | |
227 | |
(defun spam-stat-store-current-buffer ()
  "Copy the text of the current buffer into the spam-stat scratch buffer.
The scratch buffer is named by `spam-stat-buffer-name' and is created if
it does not exist.  It is stored in `spam-stat-buffer' and returned."
  (let ((text (buffer-string))
        (copy (get-buffer-create spam-stat-buffer-name)))
    (save-excursion
      (set-buffer copy)
      (erase-buffer)
      (insert text))
    (setq spam-stat-buffer copy)))
236 | |
(defun spam-stat-store-gnus-article-buffer ()
  "Store a copy of the current article in `spam-stat-buffer'.
The text is taken from `gnus-original-article-buffer'; the copy itself
is made by `spam-stat-store-current-buffer'."
  (save-excursion
    (set-buffer gnus-original-article-buffer)
    (spam-stat-store-current-buffer)))
243 | |
244 ;; Data -- not using defstruct in order to save space and time | |
245 | |
(defvar spam-stat (make-hash-table :test 'equal)
  "Hash table used to store the statistics.
Use `spam-stat-load' to load the file.
Every word is used as a key in this table.  The value is a vector of
the form [GOOD BAD SCORE].  Use `spam-stat-good', `spam-stat-bad', and
`spam-stat-score' to access this vector.  The numbers of mails that
have been processed are kept separately in the variables
`spam-stat-ngood' and `spam-stat-nbad'.")

(defvar spam-stat-ngood 0
  "The number of good mails in the dictionary.")

(defvar spam-stat-nbad 0
  "The number of bad mails in the dictionary.")
258 | |
(defsubst spam-stat-good (entry)
  "Return the number of times this word belongs to good mails.
ENTRY is a vector as stored in the `spam-stat' hash table; the count is
its first element."
  (aref entry 0))

(defsubst spam-stat-bad (entry)
  "Return the number of times this word belongs to bad mails.
ENTRY is a vector as stored in the `spam-stat' hash table; the count is
its second element."
  (aref entry 1))
266 | |
(defsubst spam-stat-score (entry)
  "Return the score of the word described by ENTRY.
ENTRY is a vector as stored in the `spam-stat' hash table; the score is
its third element.  If ENTRY is nil, i.e. the word is not in the
dictionary, return `spam-stat-unknown-word-score' instead."
  (if entry
      (aref entry 2)
    spam-stat-unknown-word-score))
272 | |
(defsubst spam-stat-set-good (entry value)
  "Set the number of times this word belongs to good mails.
Store VALUE as the first element of the vector ENTRY."
  (aset entry 0 value))

(defsubst spam-stat-set-bad (entry value)
  "Set the number of times this word belongs to bad mails.
Store VALUE as the second element of the vector ENTRY."
  (aset entry 1 value))

(defsubst spam-stat-set-score (entry value)
  "Set the score of this word.
Store VALUE as the third element of the vector ENTRY."
  (aset entry 2 value))
284 | |
(defsubst spam-stat-make-entry (good bad)
  "Return a new dictionary entry for a word seen GOOD and BAD times.
The entry is the vector [GOOD BAD SCORE], where SCORE is computed from
the two counts by `spam-stat-compute-score'."
  (let ((stats (vector good bad nil)))
    ;; The score can only be computed once the counts are in the vector.
    (aset stats 2 (spam-stat-compute-score stats))
    stats))
290 | |
291 ;; Computing | |
292 | |
(defun spam-stat-compute-score (entry)
  "Compute the score of the word described by ENTRY.  1.0 means spam.
ENTRY is a vector as stored in the `spam-stat' hash table.  Occurrences
in good mails are counted twice.  A word whose weighted count is below 5
does not appear often enough to be judged and gets
`spam-stat-unknown-word-score'.  If no good mails are known the score is
.99, and if no bad mails are known it is .01.  Otherwise the score is
the share of the word's per-mail frequency that comes from bad mails,
limited to the range from .01 to .99."
  ;; promote all numbers to floats for the divisions
  (let* ((g (* 2.0 (spam-stat-good entry)))
         (b (float (spam-stat-bad entry))))
    (cond ((< (+ g b) 5)
           ;; Use the customizable score, as documented for that option,
           ;; rather than a hard-coded copy of its default.
           spam-stat-unknown-word-score)
          ((= 0 spam-stat-ngood)
           .99)
          ((= 0 spam-stat-nbad)
           .01)
          (t
           (max .01
                (min .99 (/ (/ b spam-stat-nbad)
                            (+ (/ g spam-stat-ngood)
                               (/ b spam-stat-nbad)))))))))
309 | |
310 ;; Parsing | |
311 | |
(defmacro with-spam-stat-max-buffer-size (&rest body)
  "Evaluate BODY with the buffer narrowed to its beginning.
If the accessible portion of the buffer is longer than
`spam-stat-max-buffer-length' characters, it is narrowed to that many
characters, starting at `point-min', while BODY runs.  The previous
restriction is restored afterwards.  Value is what BODY returns."
  `(save-restriction
     (when (> (- (point-max)
                 (point-min))
              spam-stat-max-buffer-length)
       (narrow-to-region (point-min)
                         (+ (point-min) spam-stat-max-buffer-length)))
     ,@body))
321 | |
(defun spam-stat-buffer-words ()
  "Return a hash table of words and number of occurrences in the buffer.
Words are found using `spam-stat-syntax-table', only the beginning of
the buffer is scanned (see `with-spam-stat-max-buffer-size'), and words
that are not shorter than `spam-stat-max-word-length' are left out."
  (with-spam-stat-max-buffer-size
   (with-syntax-table spam-stat-syntax-table
     (goto-char (point-min))
     (let ((tally (make-hash-table :test 'equal))
           token)
       (while (re-search-forward "\\w+" nil t)
         (setq token (match-string-no-properties 0))
         (when (< (length token) spam-stat-max-word-length)
           (puthash token
                    (1+ (gethash token tally 0))
                    tally)))
       tally))))
335 | |
(defun spam-stat-buffer-is-spam ()
  "Consider current buffer to be a new spam mail.
Increment `spam-stat-nbad', add the word counts of the buffer to the
bad counts in `spam-stat', recompute the scores of those words, and
mark the dictionary as needing to be saved."
  (setq spam-stat-nbad (1+ spam-stat-nbad))
  (maphash
   (lambda (token occurrences)
     (let ((stats (gethash token spam-stat)))
       (if (null stats)
           ;; First time this word is seen at all.
           (setq stats (spam-stat-make-entry 0 occurrences))
         (spam-stat-set-bad stats (+ occurrences (spam-stat-bad stats))))
       (spam-stat-set-score stats (spam-stat-compute-score stats))
       (puthash token stats spam-stat)))
   (spam-stat-buffer-words))
  (setq spam-stat-dirty t))
349 | |
(defun spam-stat-buffer-is-non-spam ()
  "Consider current buffer to be a new non-spam mail.
Increment `spam-stat-ngood', add the word counts of the buffer to the
good counts in `spam-stat', recompute the scores of those words, and
mark the dictionary as needing to be saved."
  (setq spam-stat-ngood (1+ spam-stat-ngood))
  (maphash
   (lambda (token occurrences)
     (let ((stats (gethash token spam-stat)))
       (if (null stats)
           ;; First time this word is seen at all.
           (setq stats (spam-stat-make-entry occurrences 0))
         (spam-stat-set-good stats (+ occurrences (spam-stat-good stats))))
       (spam-stat-set-score stats (spam-stat-compute-score stats))
       (puthash token stats spam-stat)))
   (spam-stat-buffer-words))
  (setq spam-stat-dirty t))
363 | |
(defun spam-stat-buffer-change-to-spam ()
  "Consider current buffer no longer normal mail but spam.
Move the word counts of the buffer from the good to the bad counts in
`spam-stat', adjust `spam-stat-ngood' and `spam-stat-nbad', and mark
the dictionary as needing to be saved.
Signal an error, before anything has been modified, if the buffer
contains a word that is not in the dictionary."
  (let ((words (spam-stat-buffer-words)))
    ;; Validate first, so that an unknown word cannot leave the mail
    ;; counters and part of the dictionary already updated.
    (maphash
     (lambda (word ignore)
       (unless (gethash word spam-stat)
         (error "This buffer has unknown words in it.")))
     words)
    (setq spam-stat-nbad (1+ spam-stat-nbad)
          spam-stat-ngood (1- spam-stat-ngood))
    (maphash
     (lambda (word count)
       (let ((entry (gethash word spam-stat)))
         (spam-stat-set-good entry (- (spam-stat-good entry) count))
         (spam-stat-set-bad entry (+ (spam-stat-bad entry) count))
         (spam-stat-set-score entry (spam-stat-compute-score entry))
         (puthash word entry spam-stat)))
     words))
  (setq spam-stat-dirty t))
379 | |
(defun spam-stat-buffer-change-to-non-spam ()
  "Consider current buffer no longer spam but normal mail.
Move the word counts of the buffer from the bad to the good counts in
`spam-stat', adjust `spam-stat-ngood' and `spam-stat-nbad', and mark
the dictionary as needing to be saved.
Signal an error, before anything has been modified, if the buffer
contains a word that is not in the dictionary."
  (let ((words (spam-stat-buffer-words)))
    ;; Validate first, so that an unknown word cannot leave the mail
    ;; counters and part of the dictionary already updated.
    (maphash
     (lambda (word ignore)
       (unless (gethash word spam-stat)
         (error "This buffer has unknown words in it.")))
     words)
    (setq spam-stat-nbad (1- spam-stat-nbad)
          spam-stat-ngood (1+ spam-stat-ngood))
    (maphash
     (lambda (word count)
       (let ((entry (gethash word spam-stat)))
         (spam-stat-set-good entry (+ (spam-stat-good entry) count))
         (spam-stat-set-bad entry (- (spam-stat-bad entry) count))
         (spam-stat-set-score entry (spam-stat-compute-score entry))
         (puthash word entry spam-stat)))
     words))
  (setq spam-stat-dirty t))
395 | |
396 ;; Saving and Loading | |
397 | |
(defun spam-stat-save (&optional force)
  "Save the `spam-stat' hash table as lisp file.
With a prefix argument save unconditionally.
Nothing is written unless FORCE is non-nil or `spam-stat-dirty' is set.
The file, named by `spam-stat-file', consists of a single `setq' form
that sets `spam-stat-ngood', `spam-stat-nbad' and `spam-stat' when it
is loaded.  `spam-stat-dirty' is reset after saving."
  (interactive "P")
  (when (or force spam-stat-dirty)
    (with-temp-buffer
      ;; Bind `standard-output' so that `prin1' below prints into the
      ;; temporary buffer.
      (let ((standard-output (current-buffer))
            ;; NOTE(review): presumably meant to keep font-lock away from
            ;; the buffer once it visits the file -- confirm.
            (font-lock-maximum-size 0))
        (insert "(setq spam-stat-ngood "
                (number-to-string spam-stat-ngood)
                " spam-stat-nbad "
                (number-to-string spam-stat-nbad)
                " spam-stat (spam-stat-to-hash-table '(")
        ;; Only the counts are written; the scores are recomputed by
        ;; `spam-stat-to-hash-table' when the file is loaded.
        (maphash (lambda (word entry)
                   (prin1 (list word
                                (spam-stat-good entry)
                                (spam-stat-bad entry))))
                 spam-stat)
        (insert ")))")
        (write-file spam-stat-file)))
    (setq spam-stat-dirty nil)))
419 | |
(defun spam-stat-load ()
  "Read the `spam-stat' hash table from disk.
Load the file named by `spam-stat-file' and mark the dictionary as not
needing to be saved."
  ;; TODO: maybe we should warn the user if spam-stat-dirty is t?
  (load-file spam-stat-file)
  (setq spam-stat-dirty nil))
425 | |
(defun spam-stat-to-hash-table (entries)
  "Turn list ENTRIES into a hash table and return it.
Every element in ENTRIES has the form \(WORD GOOD BAD) where WORD is
the word string, GOOD is the number of times it appeared in good mails,
and BAD is the number of times it has appeared in bad mails.
The score of every word is computed by `spam-stat-compute-score', which
reads `spam-stat-ngood' and `spam-stat-nbad'; these must therefore be
set before this function is called."
  (let ((table (make-hash-table :test 'equal)))
    (mapc (lambda (l)
            (puthash (car l)
                     (spam-stat-make-entry (nth 1 l) (nth 2 l))
                     table))
          entries)
    table))
440 | |
(defun spam-stat-reset ()
  "Reset `spam-stat' to an empty hash-table.
This deletes all the statistics: the mail counters `spam-stat-ngood'
and `spam-stat-nbad' are set back to zero as well, and the dictionary
is marked as needing to be saved."
  (interactive)
  (setq spam-stat-ngood 0
        spam-stat-nbad 0
        spam-stat (make-hash-table :test 'equal)
        spam-stat-dirty t))
449 | |
450 ;; Scoring buffers | |
451 | |
(defvar spam-stat-score-data nil
  "Raw data used in the last run of `spam-stat-score-buffer'.
This is the list returned by `spam-stat-buffer-words-with-scores'.")

(defsubst spam-stat-score-word (word)
  "Return score for WORD.
WORD is looked up in the `spam-stat' hash table.
The default score for unknown words is stored in
`spam-stat-unknown-word-score'."
  (spam-stat-score (gethash word spam-stat)))
460 | |
(defun spam-stat-buffer-words-with-scores ()
  "Process current buffer, return the 15 most conspicuous words.
These are the words whose spam-stat differs the most from 0.5.
The list returned contains elements of the form \(WORD SCORE DIFF),
where DIFF is the difference between SCORE and 0.5.  The list is sorted
by decreasing DIFF.  If the buffer contains fewer than 15 distinct
words, all of them are returned."
  (with-spam-stat-max-buffer-size
   (with-syntax-table spam-stat-syntax-table
     (let (result score tail)
       (maphash (lambda (word ignore)
                  (setq score (spam-stat-score-word word)
                        result (cons (list word score (abs (- score 0.5)))
                                     result)))
                (spam-stat-buffer-words))
       (setq result (sort result (lambda (a b) (< (nth 2 b) (nth 2 a)))))
       ;; Cut the list after its 15th element.  `nthcdr' returns nil for
       ;; a shorter list, and calling `setcdr' on nil signals an error.
       (setq tail (nthcdr 14 result))
       (when tail
         (setcdr tail nil))
       result))))
477 | |
(defun spam-stat-score-buffer ()
  "Return a score describing the spam-probability for this buffer.
The scores of the words returned by `spam-stat-buffer-words-with-scores'
are combined as P / (P + Q), where P is the product of the scores and Q
is the product of their complements.  The word list used is stored in
`spam-stat-score-data'.  A buffer without any words scores 0.5."
  (setq spam-stat-score-data (spam-stat-buffer-words-with-scores))
  (let* ((probs (mapcar (lambda (e) (cadr e)) spam-stat-score-data))
         ;; Seed both products with 1.0.  With an empty word list they
         ;; would otherwise be the integer 1, and the integer division
         ;; below would truncate to 0.
         (prod (apply #'* 1.0 probs))
         (inverse (apply #'* 1.0 (mapcar (lambda (x) (- 1 x)) probs))))
    (/ prod (+ prod inverse))))
485 | |
(defun spam-stat-split-fancy ()
  "Return the name of the spam group if the current mail is spam.
Use this function on `nnmail-split-fancy'.  The mail is taken from
`spam-stat-buffer' and is spam if its score is greater than
`spam-stat-split-fancy-spam-threshhold'.  If an error occurs it is
reported with `message' and nil is returned.  If you are interested in
the raw data used for the last run of `spam-stat-score-buffer', check
the variable `spam-stat-score-data'."
  (condition-case err
      (progn
        (set-buffer spam-stat-buffer)
        (goto-char (point-min))
        (if (> (spam-stat-score-buffer) spam-stat-split-fancy-spam-threshhold)
            (progn
              ;; Record the words behind the verdict in the split trace.
              (if (boundp 'nnmail-split-trace)
                  (mapc (lambda (datum)
                          (push datum nnmail-split-trace))
                        spam-stat-score-data))
              spam-stat-split-fancy-spam-group)))
    (error (message "Error in spam-stat-split-fancy: %S" err)
           nil)))
503 | |
504 ;; Testing | |
505 | |
(defun spam-stat-process-directory (dir func)
  "Process all the regular files in directory DIR using function FUNC.
Every readable, non-empty regular file in DIR whose name does not start
with a dot is inserted into a temporary buffer, and FUNC is called with
no arguments while that buffer is current."
  (let* ((paths (directory-files dir t "^[^.]"))
         ;; One percent of the directory entries, for the progress message.
         (one-percent (/ (length paths) 100.0))
         (processed 0))
    (with-temp-buffer
      (dolist (path paths)
        (if (and (file-readable-p path)
                 (file-regular-p path)
                 (> (nth 7 (file-attributes path)) 0))
            (progn
              (setq processed (1+ processed))
              (message "Reading %s: %.2f%%" dir (/ processed one-percent))
              (insert-file-contents path)
              (funcall func)
              (erase-buffer)))))))
521 | |
(defun spam-stat-process-spam-directory (dir)
  "Process all the regular files in directory DIR as spam.
Every file is handed to `spam-stat-buffer-is-spam' by
`spam-stat-process-directory'."
  (interactive "D")
  (spam-stat-process-directory dir 'spam-stat-buffer-is-spam))

(defun spam-stat-process-non-spam-directory (dir)
  "Process all the regular files in directory DIR as non-spam.
Every file is handed to `spam-stat-buffer-is-non-spam' by
`spam-stat-process-directory'."
  (interactive "D")
  (spam-stat-process-directory dir 'spam-stat-buffer-is-non-spam))
531 | |
(defun spam-stat-count ()
  "Return size of `spam-stat'.
This is the number of words in the dictionary."
  (interactive)
  (hash-table-count spam-stat))
536 | |
(defun spam-stat-test-directory (dir)
  "Test all the regular files in directory DIR for spam.
If the result is 1.0, then all files are considered spam.
If the result is 0.0, none of the files is considered spam.
You can use this to determine error rates.
Only readable, non-empty regular files whose names do not start with a
dot are tested, and only those count towards the result.  A file is
considered spam if its score is greater than
`spam-stat-split-fancy-spam-threshhold'."
  (interactive "D")
  (let* ((files (directory-files dir t "^[^.]"))
         (total (length files))
         (score 0.0); float
         (max (/ total 100.0)); float
         (count 0))
    (with-temp-buffer
      (dolist (f files)
        (when (and (file-readable-p f)
                   (file-regular-p f)
                   (> (nth 7 (file-attributes f)) 0))
          (setq count (1+ count))
          (message "Reading %.2f%%, score %.2f%%"
                   (/ count max) (/ score count))
          (insert-file-contents f)
          ;; Use the same threshold as the splitting code, so that the
          ;; reported rate matches what splitting would actually do.
          (when (> (spam-stat-score-buffer)
                   spam-stat-split-fancy-spam-threshhold)
            (setq score (1+ score)))
          (erase-buffer))))
    ;; Divide by the number of files that were tested, not by the number
    ;; of directory entries: skipped entries would otherwise lower the
    ;; rate.  Guard against a directory without any testable file.
    (message "Final score: %d / %d = %f"
             score count (if (> count 0) (/ score count) 0.0))))
561 | |
562 ;; Shrinking the dictionary | |
563 | |
(defun spam-stat-reduce-size (&optional count)
  "Reduce the size of `spam-stat'.
This removes all words that occur less than COUNT from the dictionary,
where the good and the bad occurrences of a word are added up.
COUNT defaults to 5.  The dictionary is marked as needing to be saved."
  (interactive)
  (let ((threshold (or count 5)))
    (maphash (lambda (word stats)
               (if (< (+ (spam-stat-good stats)
                         (spam-stat-bad stats))
                      threshold)
                   (remhash word spam-stat)))
             spam-stat))
  (setq spam-stat-dirty t))
577 | |
(defun spam-stat-install-hooks-function ()
  "Install the spam-stat function hooks.
Add `spam-stat-store-current-buffer' to
`nnmail-prepare-incoming-message-hook' and
`spam-stat-store-gnus-article-buffer' to `gnus-select-article-hook'."
  (interactive)
  (add-hook 'nnmail-prepare-incoming-message-hook
            'spam-stat-store-current-buffer)
  (add-hook 'gnus-select-article-hook
            'spam-stat-store-gnus-article-buffer))

;; Install the hooks when this file is loaded, unless
;; `spam-stat-install-hooks' is nil.
(when spam-stat-install-hooks
  (spam-stat-install-hooks-function))
588 | |
(defun spam-stat-unload-hook ()
  "Uninstall the spam-stat function hooks.
Remove the functions added by `spam-stat-install-hooks-function' from
`nnmail-prepare-incoming-message-hook' and `gnus-select-article-hook'."
  (interactive)
  (remove-hook 'nnmail-prepare-incoming-message-hook
               'spam-stat-store-current-buffer)
  (remove-hook 'gnus-select-article-hook
               'spam-stat-store-gnus-article-buffer))
596 | |
597 (provide 'spam-stat) | |
598 | |
599 ;;; arch-tag: ff1d2200-8ddb-42fb-bb7b-1b5e20448554 | |
600 ;;; spam-stat.el ends here |