# Probabilities and initial model (概率及初始模型)
#!/usr/bin/env python
# coding: utf-8
import pandas as pd
import codecs
import numpy as np
import math
import jieba
#计算对数先验概率
def getLogPrior(train):
    """Compute class priors and log priors from labelled training data.

    Rows whose ``'1'`` column equals 0 are counted under key ``'y'`` and
    rows whose ``'1'`` column equals 1 under key ``'n'``. Rows carrying any
    other label value are not counted.

    Args:
        train: table with a label column named ``'1'``; expected to be a
            pandas DataFrame (it is indexed by column name and compared
            element-wise).

    Returns:
        A tuple ``(docSum, samples, logPrior, prior)`` where
        docSum: total number of counted rows (both classes),
        samples: ``{'y': row count, 'n': row count}``,
        logPrior: ``{'y': log prior, 'n': log prior}``; a class with no
            rows gets ``-inf`` instead of raising,
        prior: ``{'y': prior, 'n': prior}``.

    Raises:
        ZeroDivisionError: if no row is labelled 0 or 1.
    """
    labels = train['1']
    # Count via boolean masks; int() keeps the counts plain Python ints.
    samples = {
        'y': int((labels == 0).sum()),
        'n': int((labels == 1).sum()),
    }
    docSum = samples['y'] + samples['n']
    prior = {cls: count / docSum for cls, count in samples.items()}
    # math.log(0) raises ValueError, so a class that is absent from the
    # training data is given its mathematical limit, -inf.
    logPrior = {
        cls: math.log(p) if p > 0 else float('-inf')
        for cls, p in prior.items()
    }
    return (docSum, samples, logPrior, prior)
def getConditionPro(train):
conditionPro={'y':{},'n':{}}
logConditionPro={'y':{},'n':{}}
docSum,samples,logPrior,prior=getLogPrior(train)
a1 = train[train['1']==0]
a2 = train[train['1']==1]
classNum=2
wordSet=set()
for X_word in train['2']:
for word in X_word:
wordSet.add(word)
#分类别计算词出现的次数
words={'y':{},'n':{}}
for messge in a1['2']:
settemp=set(messge)
for ch in messge:
if ch in words['y']:
words['y'][ch]+=1
else:
words['y'].setdefault(ch,1)
for messge in a2['2']:
settemp=set(messge)
for ch in messge: