# Logistic regression (逻辑回归)
import pandas as pda

# Load the data set used for the logistic-regression example.
fname = 'E:/programCode/huigui.csv'
dataf = pda.read_csv(fname)

# Independent variables x: every row, columns 1-3. In iloc[rows, cols] the
# left slice selects rows and the right slice selects columns; slices are
# half-open, so 1:4 stops before column 4.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
x = dataf.iloc[:, 1:4].values

# Dependent variable y: every row, column 0 only. The 0:1 slice (rather than
# a bare 0) keeps the result two-dimensional, i.e. a single-column array.
y = dataf.iloc[:, 0:1].values
from sklearn.linear_model import LogisticRegression as LR  # logistic regression model
# NOTE(review): RandomizedLogisticRegression was deprecated in scikit-learn
# 0.19 and removed in 0.21, so this import only works on an older
# scikit-learn. Confirm the pinned version, or port the feature screening to
# a selector that current releases still ship.
from sklearn.linear_model import RandomizedLogisticRegression as RLR  # randomized logistic regression
import joblib

# scikit-learn estimators expect a 1-D label vector; y is sliced as a
# single-column 2-D array, so flatten it once and reuse it below.
labels = y.ravel()

# Step 1: feature screening with the randomized model.
r1 = RLR()
r1.fit(x, labels)
selected = r1.get_support(indices=True)

# `selected` holds column positions within x, not within dataf (x starts at
# dataf column 1). Index x directly; indexing dataf.columns with these
# positions would be shifted by one and could pull in the target column.
# The matching column names are dataf.columns[1:4][selected].
t = x[:, selected]

# Step 2: fit the final logistic regression on the selected features only.
r2 = LR()
r2.fit(t, labels)
print('训练结束')
# Score on t, the features r2 was trained on; scoring on x fails as soon as
# any feature is dropped. str() is required because str + float raises
# TypeError.
print('模型正确率:' + str(r2.score(t, labels)))
# Decision trees (决策树): ID3, C4.5 and CART algorithms
import pandas as pda
import numpy

# Load the data set used for the decision-tree example.
fname = 'E:/programCode/lesson.csv'
dataf = pda.read_csv(fname)

# Features: every row, columns 1-4 (the 1:5 slice is half-open).
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
x = dataf.iloc[:, 1:5].values
# Target: every row of column 5, as a 1-D array.
y = dataf.iloc[:, 5].values
# Encode the categorical feature cells in place: the "positive" labels
# ('是', '高', '多') become 1, every other value becomes -1.
for row in x:
    for j, cell in enumerate(row):
        row[j] = 1 if cell in ('是', '高', '多') else -1

# Encode the target in place the same way: '高' becomes 1, anything else -1.
for i, label in enumerate(y):
    y[i] = 1 if label == '高' else -1
# Common pitfall: feeding x and y to the model directly.
# Correct approach: normalise the format first -- wrap x and y in DataFrames,
# then convert back to arrays with an explicit integer dtype.
# (Presumably x and y are still object-dtype arrays after the in-place
# encoding above -- TODO confirm.)
xf = pda.DataFrame(x)
yf = pda.DataFrame(y)
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
x2 = xf.values.astype(int)
y2 = yf.values.astype(int)
# Build the decision tree.
import numpy as npy
from sklearn.tree import DecisionTreeClassifier as DTC

# criterion='entropy' makes information entropy the split criterion;
# fit() trains the tree on the encoded data and returns the estimator itself.
dtc = DTC(criterion='entropy').fit(x2, y2)

# Direct check: predict whether sales are high or low for three hand-written
# samples, one row per sample in the same feature order as x2.
x3 = npy.array([
    [1, -1, -1, 1],
    [1, 1, 1, 1],
    [-1, -1, -1, -1],
])
rst = dtc.predict(x3)
print(rst)
# Decision tree visualisation.
from sklearn.tree import export_graphviz
# sklearn.externals.six was removed in scikit-learn 0.23; on Python 3 its
# StringIO is io.StringIO, so import the stdlib class directly. The name is
# kept in scope although the code below does not use it.
from io import StringIO

# Write the fitted tree in Graphviz dot format. feature_names labels the
# input features; out_file is the open file handle to write to.
with open('E:/programCode/dtc.dot', 'w') as dot_file:
    export_graphviz(
        dtc,
        feature_names=['combat', 'num', 'promotion', 'datum'],
        out_file=dot_file,
    )
# The generated dtc.dot file can be converted to PDF or another format with
# Graphviz (e.g. `dot -Tpdf dtc.dot -o dtc.pdf`) to inspect the tree.