;;; Investigating "training.dat" of the Mozilla Spam Filter ;;; according to Paul Grahams algorithm ;;; $Id: msf.scm,v 1.3 2003/05/25 15:49:26 bernd Exp bernd $ ;(define *training-file* "/home/bernd/.mozilla/bernd/ye22pse4.slt/training.dat") (define *training-file* "training.dat") ;; read 32-bit network order integer from stream (define read-ninteger (lambda (stream) (let loop ((i 4) (result 0)) (if (zero? i) result (loop (- i 1) (+ (char->integer (read-char stream)) (* result #x100))))))) ;; read value and string from stream (define read-word (lambda (stream) (let* ((val (read-ninteger stream)) (len (read-ninteger stream)) (str (make-string len))) (let loop ((i 0)) (if (= i len) (list str val) (begin (string-set! str i (read-char stream)) (loop (+ i 1)))))))) ;; read all words of category (define read-words (lambda (n stream) (let loop ((n n) (words '())) (if (zero? n) words (loop (- n 1) (cons (read-word stream) words)))))) ;; read file and build token lists (define read-file (lambda (file-name) (let* ((stream (open-input-file file-name)) (magic-cookie (read-ninteger stream)) (number-good-messages (read-ninteger stream)) (number-bad-messages (read-ninteger stream)) (number-good-tokens (read-ninteger stream)) (good-tokens (read-words number-good-tokens stream)) (number-bad-tokens (read-ninteger stream)) (bad-tokens (read-words number-bad-tokens stream))) (list number-good-messages number-bad-messages good-tokens bad-tokens)))) ;; get token-value from list entry (define token-value (lambda (x) (cadr x))) ;; get token-string from list entry (define token-string (lambda (x) (car x))) ;; sort tokens according to value (define sort-tokens (lambda (lst) (sort lst (lambda (a b) (> (token-value a) (token-value b)))))) ;; print 5 most/least used tokens (define print-some-tokens (lambda (lst count) (format #t "number of tokens: ~A\n" (length lst)) (let loop ((n count) (l (sort-tokens lst))) (if (> n 0) (begin (format #t "~A\n" (car l)) (loop (- n 1) (cdr l))))) (display "...\n") (let loop ((n count) (l (reverse (sort-tokens lst)))) (if (> n 0) (begin (format #t "~A\n" (car l)) (loop (- n 1) (cdr l))))))) ;; get token value from list (define get-token-value (lambda (token lst) (let ((t (assoc token lst))) (if (not t) 0 (token-value t))))) ;; calculate spam-probality of a single token (define spam-probability1 (lambda (word) (let ((g (* 2 (get-token-value word *good-tokens*))) (b (get-token-value word *bad-tokens*))) (list word (if (< (+ g b) 5) 0.4 (max .01 (min .99 (/ (min 1 (/ b *number-bad-messages*)) (+ (min 1 (/ g *number-good-messages*)) (min 1 (/ b *number-bad-messages*))))))))))) ;; calculate spam-probality (define spam-probability (lambda (lst) (let ((probs (map (lambda (x) (token-value (spam-probability1 x))) lst))) (let ((prod (apply * probs))) (/ prod (+ prod (apply * (map (lambda (x) (- 1 x)) probs)))))))) ;; pretty-print the spam probability (define print-spam-probability (lambda (lst) (format #t "~S ~A\n" lst (spam-probability lst)))) ;; read-in the data base and set global variables (define *spam-list* (read-file *training-file*)) (define *number-good-messages* (car *spam-list*)) (define *number-bad-messages* (cadr *spam-list*)) (define *good-tokens* (caddr *spam-list*)) (define *bad-tokens* (cadddr *spam-list*)) ;; some examples (print-spam-probability '("free")) (print-spam-probability '("millions")) (print-spam-probability '("free" "millions")) (print-spam-probability '("creativechips")) (print-spam-probability '("samples")) (print-spam-probability '("creativechips" "samples")) (print-spam-probability '("creativechips" "free" "samples"))