;;; Spelling Corrector
;;;
;;; http://norvig.com/spell-correct.html
;;; http://groups.google.de/group/comp.lang.lisp/msg/ce273af0e324ff0d
;;;
;;; $Id: spell.lisp,v 1.13 2008/11/02 17:07:18 bernd Exp $

;;; Auxiliary functions

(defun string-token (string)
  "Return list of tokens (alpha-chars) of STRING. List is in reverse
order but this is don't care."
  (let ((words '())
	i 
	(j 0))
    (loop 
       (setf i (position-if #'alpha-char-p string :start j))
       (unless i (return words))
       (setf j (position-if-not #'alpha-char-p string :start i))
       (push (subseq string i j) words)
       (unless j (return words)))))


;;; Here starts Norvigs algorithm

(defun words (file)
  "Get a list of all words from FILE."
  (with-open-file (stream file :external-format :iso-8859-1)
    (loop for line = (read-line stream nil)
       while line
       if (plusp (length line))
       nconc (string-token (string-downcase line)))))

(defun train (features)
  "Create a hash table which contains the occurrence frequency of
words from FEATURES."
  (let ((models (make-hash-table :test #'equal)))
    (dolist (f features models)
      (incf (gethash f models 0)))))

(defparameter *n-words* (train (words #p"big.txt")))

(defun dictionaryp (word)
  "True if word is found in dictionary."
  (declare (type string word)
	   (optimize speed))
  (multiple-value-bind (value presentp)
      (gethash word *n-words*)
    (declare (type symbol presentp)
	     (ignore value))
     presentp))

(defmacro doedits ((ed word) &body body)
  "Modify (delete/transpose/alter/insert) the string WORD and store
result string in ED."
  (assert (symbolp ed))
  (let ((n (gensym))
	(i (gensym))
	(c (gensym)))
    `(let ((,n (length ,word)))
       ;; deletions
       (if (plusp ,n)
	   (let ((,ed (make-string (1- ,n))))
	     (dotimes (,i ,n)
	       (replace ,ed ,word :start1 0 :end1 ,i :start2 0)
	       (replace ,ed ,word :start1 ,i :start2 (1+ ,i))
	       ,@body)))
       ;; transpositions
       (let ((,ed (make-string ,n)))
	 (dotimes (,i (1- ,n))
	   (replace ,ed ,word)
	   (setf (char ,ed ,i) (char ,word (1+ ,i))
		 (char ,ed (1+ ,i)) (char ,word ,i))
	   ,@body))
       ;; alterations
       (let ((,ed (make-string ,n)))
	 (loop for ,c from #.(char-code #\a) to #.(char-code #\z) do
	      (dotimes (,i ,n)
		(replace ,ed ,word)
		(setf (char ,ed ,i) (code-char ,c))
		,@body)))
       ;; insertions
       (let ((,ed (make-string (1+ ,n))))
	 (loop for ,c from #.(char-code #\a) to #.(char-code #\z) do
	      (dotimes (,i (1+ ,n))
		(replace ,ed ,word :start1 0 :end1 ,i :start2 0)
		(setf (char ,ed ,i) (code-char ,c))
		(replace ,ed ,word :start1 (1+ ,i) :start2 ,i)
		,@body))))))

;;; not needed for CORRECT
(defun edits1 (word) 
  "Create a hash-table of all words with Levenshtein distance of one." 
  (let ((words (make-hash-table :test #'equal :size (+ 25 (* 54 (length word))))))
    (doedits (e1 word)
      (setf (gethash (copy-seq e1) words) t))
    words))

;;; not needed for CORRECT
(defun edits2 (word)
 "Create a hash-table of all words with Levenshtein distance of two." 
  (let ((words (make-hash-table :test #'equal :size (* #1=(+ 25 (* 54 (length word))) #1#))))
    (doedits (e1 word)
      (doedits (e2 e1)
        (setf (gethash (copy-seq e2) words) t)))
    words))

(defun known-edits1 (word)
  "Create a list of all words with Levenshtein distance of one when
they are in dictionary."
  (let (words)
    (doedits (e1 word)
      (if (dictionaryp e1)
	  (pushnew (copy-seq e1) words :test #'string=)))
    words))
	     
(defun known-edits2 (word)
  "Create a list of all words with Levenshtein distance of two when
they are in dictionary."
  (let (words)
    (doedits (e1 word)
      (doedits (e2 e1)
	(if (dictionaryp e2)
	    (pushnew (copy-seq e2) words :test #'string=))))
    words))

(defun correct (word)
  "Return corrected word from dictionary. Additionally the occurrence
frequency of the found word in dictionary is returned."
  (let ((candidates (or (and (dictionaryp word) (list word)) ; in directoy?
			(known-edits1 word)                  ; in directory with distance=1?
			(known-edits2 word))))               ; in directory with distance=2?
    (loop with max = 0 and found = word
       for w in candidates
       for n = (gethash w *n-words* 0)
       do (if (> n  max)
	      (setf max n  found w))
       finally (return (values found n)))))


#|
Python: 15 s
CL-USER> (time (spelltest *tests1*))
(SPELLTEST *TESTS1*) took 8,948,352 microseconds (8.948352 seconds) to run 
                    with 2 available CPU cores.
During that period, 8,939,189 microseconds (8.939189 seconds) were spent in user mode
                    9,606 microseconds (0.009606 seconds) were spent in system mode
2,728 microseconds (0.002728 seconds) was spent in GC.
 6,353,680 bytes of memory allocated.
68
270
75
15

Python: 25 s
CL-USER> (time (spelltest *tests2*))
(SPELLTEST *TESTS2*) took 15,581,771 microseconds (15.581770 seconds) to run 
                    with 2 available CPU cores.
During that period, 15,552,899 microseconds (15.552900 seconds) were spent in user mode
                    24,823 microseconds (0.024823 seconds) were spent in system mode
4,603 microseconds (0.004603 seconds) was spent in GC.
 10,756,592 bytes of memory allocated.
130
400
68
43
|#
