#|

These are routines for creating and analyzing Huffman codes for compressing
strings.  The algorithms come from Cormen, ed. 1, pp. 337 - 343.

The Huffman cost for an encoded string (in bits) is:

                B(T) =  SUM   f(c)*d (c)
                       c in C       T

where:
  T is the text being encoded with the prefix(-free) encoding.
  C is the character set used in T
  c is each character in C
  f(c) is the number of occurrences of c in T
  d (c) is the depth of c in the Huffman encoding tree.
   T
  

The encoding is accomplished by first making a histogram of the text.
Convert each character/frequency pair to a node object.  Then...

Sort the node objects according to increasing f(c) (with secondary
sorting to deal with identical f(c) and (leaf) nodes that are for characters
vs those for parents.  Then...

LOOP from 1 to (length nodes) - 1:
	Pick the two nodes with lowest f(c) and create a parent node
	  holding these two children.
	Replace the two child nodes in the list with the parent node.
	Resort the list of nodes.


Once the tree is built, traverse it, creating a string as we reach
each node, adding "0" for each time we go down a left branch, "1"
when going down a right branch.  The string we have at each leaf is
the Huffman encoding for that leaf node's character c.

To encode the string, simply replace each letter by the encoding associated
with that character, resulting in one big bit-vector.

I'm keeping the encoded text in an array of 166 bit-vectors, one for each
letter in the original text.  This makes it easier to see what's going on,
and easier to decode the result to check my work.

This code is *NOT* efficient, but makes the process obvious (I hope).

|#


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; FUNCTIONS FOR COMPUTING HUFFMAN ENCODING ;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; Return a hashtable whose keys are the characters in text, and whose
;;; values are the number of occurrences of the key character in text.
;;; If case-sensitive is nil, case is folded down.

(defun histogram (text &optional (case-sensitive t))
  (let ((result (make-hash-table :test #'eql))
        (test (if case-sensitive #'(lambda (x) x) #'char-downcase)))
    (declare (special test result))
    (map nil #'(lambda (c)
                 (let ((d (funcall test c)))
		   ;; still terribly inefficient
                   (multiple-value-bind (val is-set) (gethash d result)
                     (unless is-set
                       (setf (gethash d result) 0))
                     (incf (gethash d result)))))
         text)
    result))


;;; We want a structure that fits into a Huffman tree as per Cormen ed. 1,
;;; p. 341.
(defstruct huffman-node
  (c nil :type character)
  (freq nil :type integer)
  (child-0 nil :type huffman-node)
  (child-1 nil :type huffman-node)
  (encoding nil :type string))


;;; Convert a histogram hashtable to a list of huffman nodes.
(defun nodes-from-histogram (histo)
  (let ((result (list)))
    (maphash #'(lambda (key value)
                 (push (make-huffman-node :c key :freq value) result))
             histo)
    result))
  
  
;;; Sort a list of huffman nodes according to increasing freq.
;;; NOTE: Ordering is *still* indeterminate if two nodes have
;;;       no characters (they are both parents), and they have
;;;       the same combined frequency of children.
(defun huffman-ordering (node1 node2)
  (let ((f1 (huffman-node-freq node1))
        (f2 (huffman-node-freq node2))
        (c1 (huffman-node-c node1))
        (c2 (huffman-node-c node2)))
    (or (< f1 f2)
        (and (= f1 f2)
             (or
              (and c1 c2 (char< c1 c2))
              (null c2))))))


;;; Join two nodes, making a new parent for the both of them.
;;; Return the new parent, which the caller must keep track of.
(defun join-nodes (node-0 node-1)
  (make-huffman-node
   :freq (+ (huffman-node-freq node-0) (huffman-node-freq node-1))
   :child-0 node-0
   :child-1 node-1))


;;; Take a sorted list of huffman-nodes and build a tree bottom-up.
;;; Select the two nodes in list with the lowest frequencies, grabbing
;;; the first in cases of equal frequencies.  Join them under a new 
;;; parent node.  Replace the two nodes with the parent node.
;;; Keep going until we have just one node (the super-parent) in the
;;; list.
(defun make-huffman-tree (nodes)
  (dotimes (i (- (length nodes) 1))
    (push (join-nodes (pop nodes) (pop nodes)) nodes)
    (setq nodes (stable-sort nodes 'huffman-ordering)))
  nodes)


;;; Process a huffman tree, returning a hashtable whose keys are
;;; the characters being encoded, the values the huffman string codes.
;;; (build-encodings (car <some-huffman-tree>)) => hashtable
(defun build-encodings (huffman-tree)
  (let ((result (make-hash-table :test #'eql)))
    (labels ((get-encoding (node encoding-thus-far)
	       (let ((c (huffman-node-c node)))
		 (cond
		  (c (setf (gethash c result) encoding-thus-far)
		     (setf (huffman-node-encoding node) encoding-thus-far))
		  (t (get-encoding
		      (huffman-node-child-0 node) 
		      (concatenate 'bit-vector encoding-thus-far #*0))
		     (get-encoding 
		      (huffman-node-child-1 node)
		      (concatenate 'bit-vector encoding-thus-far #*1)))))))
      (get-encoding 
       (car huffman-tree)
       (make-array 8 :element-type 'bit :adjustable t :fill-pointer 0)))
    result))


;;; Use the hashtable of encodings to translate each character of text to
;;; a bit-vector.  Return an array of all the bit-vectors.
;;; This is only a demo, so it doesn't concatenate all the bitvectors for
;;; transmission.
(defun huffman-encode (text encodings)
  (let ((result (make-array (length text) :adjustable t :fill-pointer 0)))
    (map nil #'(lambda (c)
                 (vector-push-extend (gethash c encodings) result))
         text)
    result))


;;; Compute the cost of a Huffman encoding given a histogram hashtable
;;; of characters in a string and a hashtable with keys in C and values
;;; being the corresponding 0-1 Huffman strings.
(defun predict-huffman-cost (hist encodings)
  (let ((cost 0))
    (maphash #'(lambda (c freq)
                 (incf cost (* freq (length (gethash c encodings)))))
             hist)
    cost))
      


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; BEGIN FUNCTIONS FOR COMPUTING HUFFMAN DECODING ;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; Decode a (cheap-dirty demo) string encoded with huffman-encode.
;;; A real decoder would read a bit-array.
(defun huffman-decode (bits encodings &optional (cheap t))
  (if cheap
      (let ((reverse-encodings (make-hash-table :test #'equal)))
        (maphash 
	 #'(lambda (key value) (setf (gethash value reverse-encodings) key))
	 encodings)
        (map 'string #'(lambda (c) (gethash c reverse-encodings)) bits))
    (error "We don't yet have real bit-array encoding. We're waiting on proper decoding.")))
      


;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;       BEGIN FUNCTIONS FOR PRETTY PRINTING      ;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; Convert a histogram hashtable to an array of key-value conses,
;;; sorted by decreasing frequency.  If two characters have the
;;; same frequency, they are sorted in increasing alphabetic order.
;;; (histo-sort (HASHTABLE 'EQL (#\y . 2) (#\x . 7) (#\z . 5) (#\q . 5))) ==>
;;;   #((#\x . 7) (#\q . 5) (#\z . 5) (#\y . 2))
(defun histo-sort (hist)
  (let ((seq (make-array 10 :adjustable t :fill-pointer 0)))
    (maphash #'(lambda (key value) (vector-push-extend (cons key value) seq))
             hist)
    (stable-sort seq #'(lambda (x y)
			 (let ((x2 (cdr x))
			       (y2 (cdr y)))
			   (or (> x2 y2)
			       (and (= x2 y2)
				    (char< (car x) (car y)))))))))


;;; Pretty-print a hashtable.
(defun pprint-hashtable (table stream &optional (keystring "KEY")
						(valuestring "VALUE"))
  (maphash
   #'(lambda (key value)
       (format stream "~A: ~A    ~A: ~A~%" keystring key valuestring value))
   table))


;;; Pretty-print an alist : #((#\a . 1) (#\b . 3) <etc>), one element per line,
;;; indented by indent-level * 4 spaces.
(defun pprint-vector (vec stream &optional (indent-level 0))
  (let ((indent-string (subseq "                                               " 0 (* 4 indent-level))))
    (map nil #'(lambda (e) (format stream "~A~%" e))
         vec)))

;;; Pretty-print a list of huffman nodes.
(defun pprint-list (nodes stream)
  (dolist (node nodes)
    (format stream "~A~%" node)))


;;; Pretty-print a huffman tree.
(defun pprint-huffman-tree (huffman-tree stream)
  (labels ((print-node (node)
	     (let ((c (huffman-node-c node)))
	       (cond
		(c (format stream "~A~%" node))
		(t (format stream "~A~%" node)
		   (print-node (huffman-node-child-0 node))
		   (print-node (huffman-node-child-1 node)))))))
    (print-node (car huffman-tree))))

      
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;    BEGIN FUNCTIONS FOR LISPWORKS GUI OUTPUT    ;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;; Pretty-print a huffman-node (for debugging under allegro).
#-lispworks
(defmethod print-object ((node huffman-node) stream)
  (format stream "#S(HUFFMAN-NODE :c ~A :freq ~D :child-0 ~A :child-1 ~A :encoding ~A"
          (huffman-node-c node) (huffman-node-freq node) 
          (let ((c0 (huffman-node-child-0 node)))
            (when c0 (huffman-node-c c0)))
          (let ((c1 (huffman-node-child-1 node)))
            (when c1 (huffman-node-c c1)))
          (huffman-node-encoding node)))

;;; Pretty-print a huffman-node (esp. for use in a LispWorks graph-pane).
#+lispworks
(defmethod print-object ((node huffman-node) stream)
  (if (and (huffman-node-child-0 node) (huffman-node-child-1 node))
      (format stream "(~D)" (huffman-node-freq node))
    (format stream "(~S (~D) ~A)"
            (huffman-node-c node)
	    (huffman-node-freq node)
	    (huffman-node-encoding node))))


;;; Put the tree into a Lispworks graph-pane.
#+lispworks
(defun display-huffman-tree (text huffman-tree)
  (capi:contain
   (make-instance 'capi:column-layout
     :description (list
		   (make-instance 'capi:graph-pane
		     :roots (list (car huffman-tree))
		     :children-function 'huffman-node-children
		     :best-width 300 :best-height 400)
		   
		   (make-instance
		       'capi:display-pane
		     :text text)
		   )
     )))


;;; Return a list in 0-1 order of the children of a node.
;;; This is for use in a Lispworks graph-pane.
#+lispworks
(defun huffman-node-children (node)
  (let ((c0 (huffman-node-child-0 node))
        (c1 (huffman-node-child-1 node))
        (result (list)))
    (if c1 (push c1 result))
    (if c0 (push c0 result))))



;;;;;;;;;;;;;;;;;;;;;;
;;; TEST FUNCTIONS ;;;
;;;;;;;;;;;;;;;;;;;;;;


(defvar /text "Politicians can fool some of the people all of the time, and they can fool all of the people some of the time, but they cannot fool all of the people all of the time.")


;;; Huffman encode text.  If verbose is non-nil, spew all the diagnostics.  If
;;; filename is non-nil, write the diagnostics (if any) into filename.
(defun show-work (text &optional (verbose nil) (filename nil))
  (let* ((histo (histogram text))
	 (nodes (nodes-from-histogram histo))
	 (sorted-nodes (stable-sort nodes 'huffman-ordering))
	 (tree (make-huffman-tree (copy-list sorted-nodes)))
	 (encodings (build-encodings tree))
	 (encoded-text (huffman-encode text encodings))
	 (stream (if filename
		     (open filename
			   :direction :output
			   :if-does-not-exist :create
			   :if-exists :supersede)
		   *standard-output*)))
    
    (when verbose
      (format stream "TEXT:~%~A~%~%" text)
      (format stream "HISTOGRAM:~%~A~%" (reverse (histo-sort histo)))
      ;;(format stream "~ANODES: ~%")
      ;;(pprint-list sorted-nodes stream)
      (format stream "~%ENCODINGS:~%")
      (pprint-hashtable encodings stream "CHAR" "ENCODING")
      (format stream "~%TREE AFTER COMPUTING ENCODINGS:     ~%")
      (pprint-huffman-tree tree stream)
      (format stream "~%ENCODED TEXT:~%~A~%" encoded-text)
      (format stream "~%DECODED TEXT:~%~A~%" (huffman-decode encoded-text encodings))
      (format stream "~%EXPECTED COST IN BITS: ~A~%"
	      (predict-huffman-cost histo encodings))
      (format stream "ACTUAL COST IN BITS:   ~A~%"
	      (reduce #'+ (map 'vector #'(lambda (x) (length x)) encoded-text)))
      (unless (eq stream *standard-output*)
	(finish-output stream)
	(close stream)))
    
    #+lispworks (display-huffman-tree text tree)
    ))



