贝叶斯分类:Common Lisp实现

最近利用闲暇时间学习Common Lisp,作为编程实践,我实现了一个简陋的
贝叶斯分类器(bayesian)。这只是一个代码玩具,没有实际的工程价值。文章最后我

贴了这个贝叶斯的代码。


贝叶斯公式如下:


可以这样简单的推导出贝叶斯公式:




代码的流程,首先是统计样本数据;然后根据bayesian定理,创建一个
用于分类的模型;最后在用这个模型去对测试数据进行分类。

宏add-class-data负责统计数据, 她的第0个参数是分类,第1个参数是
在这个分类下的一个数据项,最后一个参数是个hash table,用于记录。
调用一次add-class-data就增加一项数据的统计值。

宏calc-bayesian-model会生成一个贝叶斯模型,并作为她的返回值。
她的参数是记录了多种统计结果的hash table。

函数bayesian-predict对测试数据进行分类。她的第0个参数是一个列表,
存放的是一个待测试的向量。第1个参数是分类模型。

函数mytest用于驱动这个玩具。

我初学Lisp,代码幼稚。随便写写玩具程序,变量胡乱起名。宏里面
的变量名应该用(gensym)生成,但我图方便没这么干。但代码还是可以
正常执行的,分类结果还能勉强接受。我在fedora 17上用sbcl简单的
测试了程序,可以这样执行:
    sbcl --script ./bayesian.lisp


bayesian.lisp:
===========================================
;;; 2013年 03月 10日 星期日 10:16:49 CST
;;; author: 李小丹(Li Shao Dan) 字 殊恒(shuheng)
;;; K.I.S.S
;;; S.P.O.T


(defmacro add-count (k h) `(incf (gethash ,k ,h 0)))

(defmacro add-class-data (c k h)
 `(multiple-value-bind (v p) (gethash ,c ,h)
     (if p (add-count ,k v) (add-count ,k (setf (gethash ,c ,h) (make-hash-table :test #'equal))))))

(defun print-k-v (k v) (format t "~t~t~a: ~a~%" k v))
(defun print-class (k v) (format t "~a:~%" k) (maphash #'print-k-v v))
(defmacro print-all (h) `(progn (maphash #'print-class ,h) (format t "====================~%")))

(defmacro calc-one-sum (h) `(let ((s 0)) (maphash #'(lambda (k v) (setf s (+ s v))) ,h) s))
(defmacro calc-item-sum (h &optional (c nil c-p))
 `(if ,c-p (multiple-value-bind (v p) (gethash ,c ,h) (if p (calc-one-sum v)))
             (let ((s 0)) (maphash #'(lambda (k v) (setf s (+ s (calc-one-sum v)))) ,h) s)))
(defmacro count-class (h) `(hash-table-count ,h))
(defmacro count-item-uniq (h &optional (c nil c-p))
 `(if ,c-p (multiple-value-bind (v p) (gethash ,c ,h) (if p (hash-table-count v)))
     (let ((s 0)) (maphash #'(lambda (k v) (setf s (+ s (hash-table-count v)))) ,h) s)))

(defmacro count-item (h c i)
 `(multiple-value-bind (ch ch-p) (gethash ,c ,h)
     (if ch-p (gethash ,i ch 0))))

(defmacro calc-p-class (h c)
 `(multiple-value-bind (v p) (gethash ,c ,h)
     (if p (let (sc cc) (setf sc (calc-item-sum ,h)) (setf cc (calc-one-sum v))
            (/ cc sc)))))

(defmacro calc-p-item-class (h c i)
 `(let ((sum (calc-item-sum ,h ,c)))
     (if (and sum (not (equal 0 sum))) (/ (count-item ,h ,c ,i) sum))))

(defmacro calc-bayesian-model (h)
 `(let ((items nil) (classes nil) (ret (make-hash-table :test #'equal)))
     (maphash #'(lambda (k v) (pushnew k classes) (maphash #'(lambda (kk vv) (pushnew kk items)) v)) ,h)
     (dolist (i items)
      (let ((sum 0) (temp))
        (multiple-value-bind (value v-p) (gethash i ret)
          (if (not v-p) (setf value (setf (gethash i ret) (make-hash-table :test #'equal))))
          (dolist (c classes)
            (setf temp (* (calc-p-item-class ,h c i) (calc-p-class ,h c)))
            (incf sum temp)
            (setf (gethash c value) temp))
          (maphash #'(lambda (x y) (setf (gethash x value) (/ y sum))) value)
          (maphash #'(lambda (x y) (if (equal 0 y) (setf (gethash x value) 1/100000))) value)))) ret))

(defun bayesian-predict (vec model)
 (let ((coll (make-hash-table :test #'equal)) (temp 0) (mc) (sum 0) (classes nil))
  (maphash #'(lambda (k v) (maphash #'(lambda (kk vv) (pushnew kk classes)) v)) model)
  (dolist (c classes)
   (let ((mul 1))
    (dolist (d vec)
     (multiple-value-bind (val v-p) (gethash d model)
      (if v-p (setf mul (* mul (gethash c val 1/100000))) (setf mul (* mul (/ 1 (length classes)))))))
    (setf (gethash c coll) mul)))
  (maphash #'(lambda (k v) (incf sum v) (when (> v temp) (setf temp v) (setf mc k))) coll)
  (if (>= (/ temp sum) 1/2) mc)))


;; ---------- test ---------------

(defvar *h* (make-hash-table :test #'equal))

(defun mytest()
  (add-class-data 1 "a" *h*)
  (add-class-data 1 "a" *h*)
  (add-class-data 1 "a" *h*)
  (add-class-data 1 "b" *h*)
  (add-class-data 1 "c" *h*)
  (add-class-data 1 "c" *h*)
  (add-class-data 1 "d" *h*)

  (add-class-data 2 "a" *h*)
  (add-class-data 2 "a" *h*)
  (add-class-data 2 "c" *h*)
  (add-class-data 2 "c" *h*)
  (add-class-data 2 "d" *h*)
  (add-class-data 2 "d" *h*)
  (add-class-data 2 "f" *h*)
  (add-class-data 2 "f" *h*)
  (add-class-data 2 "f" *h*)

  (add-class-data 3 "a" *h*)
  (add-class-data 3 "a" *h*)
  (add-class-data 3 "b" *h*)
  (add-class-data 3 "b" *h*)
  (add-class-data 3 "d" *h*)
  (add-class-data 3 "d" *h*)
  (add-class-data 3 "d" *h*)
  (add-class-data 3 "d" *h*)
  (add-class-data 3 "f" *h*)
  (add-class-data 3 "f" *h*)

  (print-all *h*)

;  (format t "sum is ~a.~%" (calc-item-sum *h*))
;  (dotimes (i 5) (format t "class(~a) sum: ~a.~%" i (calc-item-sum *h* i)))
;  (format t "class count ~a.~%" (count-class *h*))
;  (format t "item count: ~a.~%" (count-item-uniq *h*))
;  (dotimes (i 5) (format t "item(~a) type count: ~a.~%" i (count-item-uniq *h* i)))
;  (dotimes (i 5) (format t "class(~a) probability: ~a.~%" i (calc-p-class *h* i)))
;  (dotimes (i 5) (format t "class(~a) item(~a) count: ~a.~%" i "aa" (count-item *h* i "aa")))
;  (dotimes (i 5) (format t "class(~a) item(~a) count: ~a.~%" i "a" (count-item *h* i "a")))
;  (dotimes (i 5) (format t "class(~a) item(~a) count: ~a.~%" i "b" (count-item *h* i "b")))
;  (dotimes (i 5) (format t "class(~a) item(~a) count: ~a.~%" i "bd" (count-item *h* i "bd")))
;  (dotimes (i 5) (format t "class(~a) item(~a) count: ~a.~%" i "c" (count-item *h* i "c")))
;  (dotimes (i 5) (format t "class(~a) item(~a) count: ~a.~%" i "aka" (count-item *h* i "aka")))
;  (dotimes (i 5) (format t "class(~a) item(~a) probability: ~a.~%" i "aa" (calc-p-item-class *h* i "aa")))
;  (dotimes (i 5) (format t "class(~a) item(~a) probability: ~a.~%" i "a" (calc-p-item-class *h* i "a")))
;  (dotimes (i 5) (format t "class(~a) item(~a) probability: ~a.~%" i "b" (calc-p-item-class *h* i "b")))
;  (dotimes (i 5) (format t "class(~a) item(~a) probability: ~a.~%" i "bd" (calc-p-item-class *h* i "bd")))
;  (dotimes (i 5) (format t "class(~a) item(~a) probability: ~a.~%" i "c" (calc-p-item-class *h* i "c")))
;  (dotimes (i 5) (format t "class(~a) item(~a) probability: ~a.~%" i "aka" (calc-p-item-class *h* i "aka")))
  (setf *h* (calc-bayesian-model *h*))
  (print-all *h*)
  (format t "~a~%" (bayesian-predict (list "a" "c") *h*))
  (format t "====================~%")
  (format t "~a~%" (bayesian-predict (list "k" "a" "m" "n" "o") *h*))
  (format t "====================~%")
  (format t "~a~%" (bayesian-predict (list "a" "b" "c" "f") *h*))
  (format t "====================~%")
  (format t "~a~%" (bayesian-predict (list "a" "b" "c" "d" "f") *h*))
  (format t "====================~%")
  (format t "~a~%" (bayesian-predict (list "a" "d" "d" "c" "f") *h*))
  (format t "====================~%")
  (format t "~a~%" (bayesian-predict (list "f" "d" "f" "d" "d" "d" "f") *h*))
  (format t "====================~%")
  (format t "~a~%" (bayesian-predict (list "a" "c" "c" "d" "f") *h*))
  (format t "====================~%"))

(mytest)


====================================================================

代码完

看看,有点意思吧!


  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
朴素贝叶斯分类器是一种常用的机器学习算法,用于进行文本分类、垃圾邮件过滤等任务。在Python中,有多个包可以实现朴素贝叶斯分类器,其中最常用的包是scikit-learn(sklearn)。 scikit-learn是一个功能强大的机器学习库,提供了丰富的机器学习算法实现,包括朴素贝叶斯分类器。使用scikit-learn实现朴素贝叶斯分类器的步骤如下: 1. 导入所需的库和模块: ```python from sklearn.feature_extraction.text import CountVectorizer from sklearn.naive_bayes import MultinomialNB ``` 2. 准备数据集: ```python # 假设有两个类别的文本数据,分别为正面和负面 X_train = ['I love this movie', 'This movie is great', 'I hate this movie', 'This movie is terrible'] y_train = ['positive', 'positive', 'negative', 'negative'] ``` 3. 特征提取: ```python # 使用CountVectorizer将文本转换为特征向量 vectorizer = CountVectorizer() X_train_vec = vectorizer.fit_transform(X_train) ``` 4. 构建朴素贝叶斯分类器模型并进行训练: ```python # 创建朴素贝叶斯分类器对象 classifier = MultinomialNB() # 使用训练数据进行模型训练 classifier.fit(X_train_vec, y_train) ``` 5. 使用模型进行预测: ```python # 假设有一个新的文本需要进行分类 X_test = ['This movie is amazing'] # 将新文本转换为特征向量 X_test_vec = vectorizer.transform(X_test) # 使用训练好的模型进行预测 y_pred = classifier.predict(X_test_vec) print(y_pred) ``` 以上就是使用scikit-learn包实现朴素贝叶斯分类器的基本步骤。你可以根据自己的需求和数据进行相应的调整和扩展。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值