;;; Investigating "training.dat" of the Mozilla Spam Filter
;;; according to Paul Grahams algorithm
;;; $Id: msf.scm,v 1.3 2003/05/25 15:49:26 bernd Exp bernd $


;(define *training-file* "/home/bernd/.mozilla/bernd/ye22pse4.slt/training.dat")
(define *training-file* "training.dat")


;; read 32-bit network order integer from stream
(define read-ninteger
  (lambda (stream)
     (let loop ((i 4) (result 0))
        (if (zero? i)
          result
          (loop (- i 1)
                (+ (char->integer (read-char stream)) (* result #x100)))))))


;; read value and string from stream
(define read-word
  (lambda (stream)
    (let* ((val (read-ninteger stream))
           (len (read-ninteger stream))
           (str (make-string len)))
      (let loop ((i 0))
        (if (= i len)
          (list str val)
          (begin
            (string-set! str i (read-char stream))
            (loop (+ i 1))))))))


;; read all words of category
(define read-words
  (lambda (n stream)
    (let loop ((n n) (words '()))
      (if (zero? n)
         words
         (loop (- n 1) (cons (read-word stream) words))))))


;; read file and build token lists
(define read-file
  (lambda (file-name)
    (let* ((stream               (open-input-file file-name))
           (magic-cookie         (read-ninteger stream))
           (number-good-messages (read-ninteger stream))
           (number-bad-messages  (read-ninteger stream))
           (number-good-tokens   (read-ninteger stream))
           (good-tokens          (read-words number-good-tokens stream))
           (number-bad-tokens    (read-ninteger stream))
           (bad-tokens           (read-words number-bad-tokens stream)))
      (list number-good-messages number-bad-messages good-tokens bad-tokens))))

    
;; get token-value from list entry
(define token-value
  (lambda (x) (cadr x)))


;; get token-string from list entry
(define token-string
  (lambda (x) (car x)))


;; sort tokens according to value
(define sort-tokens
  (lambda (lst)
    (sort lst (lambda (a b) (> (token-value a) (token-value b))))))


;; print 5 most/least used tokens
(define print-some-tokens
  (lambda (lst count)
    (format #t "number of tokens: ~A\n" (length lst))
      (let loop ((n count) (l (sort-tokens lst)))
        (if (> n 0)
          (begin
            (format #t "~A\n" (car l))
            (loop (- n 1) (cdr l)))))
      (display "...\n")
      (let loop ((n count) (l (reverse (sort-tokens lst))))
        (if (> n 0)
          (begin
            (format #t "~A\n" (car l))
            (loop (- n 1) (cdr l)))))))


;; get token value from list
(define get-token-value
  (lambda (token lst)
    (let  ((t (assoc token lst)))
      (if (not t)
        0
        (token-value t))))) 


;; calculate spam-probality of a single token
(define spam-probability1
  (lambda (word)
    (let ((g (* 2 (get-token-value word *good-tokens*)))
          (b      (get-token-value word *bad-tokens*)))
      (list word
        (if (< (+ g b) 5)
          0.4
            (max .01 (min .99
              (/ (min 1 (/ b *number-bad-messages*))
                (+ (min 1 (/ g *number-good-messages*))
                   (min 1 (/ b *number-bad-messages*)))))))))))


;; calculate spam-probality
(define spam-probability
  (lambda (lst)
    (let ((probs (map (lambda (x) (token-value (spam-probability1 x))) lst))) 
      (let ((prod (apply * probs)))
         (/ prod (+ prod (apply * (map (lambda (x) (- 1 x)) probs))))))))


;; pretty-print the spam probability
(define print-spam-probability
  (lambda (lst)
    (format #t "~S ~A\n" lst (spam-probability lst))))


;; read-in the data base and set global variables
(define *spam-list* (read-file *training-file*))
(define *number-good-messages* (car *spam-list*))
(define *number-bad-messages* (cadr *spam-list*))
(define *good-tokens* (caddr *spam-list*))
(define *bad-tokens* (cadddr *spam-list*))


;; some examples
(print-spam-probability '("free"))
(print-spam-probability '("millions"))
(print-spam-probability '("free" "millions"))
(print-spam-probability '("creativechips"))
(print-spam-probability '("samples"))
(print-spam-probability '("creativechips" "samples"))
(print-spam-probability '("creativechips" "free" "samples"))
