# 《Python数据分析与挖掘实战》第六章 拓展思考

### 企业偷漏税识别模型

• 1、数据探索
• 偷漏税企业分布

import pandas as pd
# Load the enterprise records; the first spreadsheet column becomes the index.
data = pd.read_excel(inputfile, index_col=0)

# Count records per (sales mode, label) pair, then pivot the label level
# into columns: one row per '销售模式', one column per value of '输出'.
counts = data.groupby(['销售模式', '输出']).size()
t = counts.unstack()

# Share of '异常' records within each sales mode. The row sum is taken
# before the ratio column exists, so it covers only the count columns.
t['异常比率'] = t['异常'] / t.sum(axis=1)

# Bare expression: shows the table ordered by ratio in a notebook; t itself
# is left in its original order.
t.sort_values('异常比率', ascending=False)

销售模式  异常  正常  异常比率
4S店      20    56    0.263158

import matplotlib.pyplot as plt

# Font setup so the Chinese tick labels render: SimHei as the sans-serif
# font, with axes.unicode_minus switched off.
plt.rcParams.update({
    'axes.unicode_minus': False,
    'font.sans-serif': ['SimHei'],
})

# One bar per row of t, labelled with the row's index value.
bar_positions = range(len(t.index))
plt.bar(bar_positions, t['异常比率'], tick_label=t.index)
plt.xticks(rotation=90)  # vertical labels
plt.show()

• 2、模型构建

# Encode the target column: '正常' -> 1, '异常' -> 0.
# Series.map yields a numeric column in one pass; any value other than these
# two labels becomes NaN, so run this only once on the raw data.
data['输出'] = data['输出'].map({'正常': 1, '异常': 0})

# Encode each categorical feature as integer codes 1..k.
# Labels are sorted first so a given label always receives the same code;
# iterating a set() of strings can produce a different order on every run,
# which made the previous encoding non-reproducible.
# Assumes the columns hold mutually comparable values with no missing entries.
for column in ('销售类型', '销售模式'):
    labels = sorted(data[column].unique())
    codes = {label: code for code, label in enumerate(labels, start=1)}
    data[column] = data[column].map(codes)

from random import shuffle
import numpy as np

# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray and is available in every pandas release.
data = data.values

# random.shuffle swaps rows of a 2-D ndarray through views, which silently
# duplicates some rows and drops others. np.random.shuffle permutes the rows
# in place (along the first axis) without corrupting them.
np.random.shuffle(data)

p = 0.8  # fraction of rows used for training
split = int(len(data) * p)
train = data[:split, :]
test = data[split:, :]
• 开始做LM神经网络模型：
from keras.models import Sequential
# Dense and Activation are exported from keras.layers; the older
# keras.layers.core path is not available in recent Keras releases.
from keras.layers import Dense, Activation

# NOTE(review): the original snippet called fit() on an empty, uncompiled
# Sequential, which cannot train. The layer stack and compile() call below are
# a minimal reconstruction (14 inputs -> 10 hidden units -> 1 sigmoid output);
# the hidden size, loss and optimizer are assumptions - confirm or adjust.
net = Sequential()
net.add(Dense(10, input_dim=14))  # columns 0..13 are the model inputs
net.add(Activation('relu'))
net.add(Dense(1))
net.add(Activation('sigmoid'))  # single probability for the 0/1 label
net.compile(loss='binary_crossentropy', optimizer='adam')

# Column 14 is the label produced by the encoding step.
hist = net.fit(train[:, :14], train[:, 14], epochs=1000, batch_size=1)

net.save_weights('E:\\ch06model.h5')

# Sequential.predict_classes was removed in TensorFlow 2.6. For a sigmoid
# output the equivalent is thresholding the predicted probability at 0.5.
predict_result = (net.predict(train[:, :14]) > 0.5).astype('int32').reshape(len(train))  # predictions on the training set
predict_result_test = (net.predict(test[:, :14]) > 0.5).astype('int32').reshape(len(test))  # predictions on the test set

# cm_plot is defined outside this snippet; it is called with
# (true labels, predicted labels) and its result is shown.
cm_plot(train[:, 14], predict_result).show()
cm_plot(test[:, 14], predict_result_test).show()

• 那么，再来做CART模型看看
from sklearn.tree import DecisionTreeClassifier
# Separate the inputs (first 14 columns) from the label (column 14) once.
train_x, train_y = train[:, :14], train[:, 14]
test_x, test_y = test[:, :14], test[:, 14]

# Build the decision tree with default settings and train it.
tree = DecisionTreeClassifier()
tree.fit(train_x, train_y)

# cm_plot (defined outside this snippet) is given true vs. predicted labels,
# first for the training rows, then for the held-out rows.
cm_plot(train_y, tree.predict(train_x)).show()
cm_plot(test_y, tree.predict(test_x)).show()

• 那么做一个ROC评价对比一下。
from sklearn.metrics import roc_curve
from matplotlib import pyplot as plt

# True labels of the held-out rows (column 14), shared by both curves.
test_labels = test[:, 14]

# LM model: ROC computed from the raw network outputs, flattened to 1-D.
lm_scores = net.predict(test[:, :14]).reshape(len(test))
fpr1, tpr1, thresholds1 = roc_curve(test_labels, lm_scores, pos_label=1)
plt.plot(fpr1, tpr1, linewidth=2, label='ROC OF LM')

# CART model: ROC computed from column 1 of predict_proba.
predict_result_test = tree.predict_proba(test[:, :14])[:, 1]
fpr, tpr, thresholds = roc_curve(test_labels, predict_result_test, pos_label=1)
plt.plot(fpr, tpr, linewidth=2, label='ROC OF CART')

# Axis labels, a little headroom past 1.0 on both axes, legend, then render.
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim(0, 1.05)
plt.ylim(0, 1.05)
plt.legend(loc=4)  # location code 4 = lower right
plt.show()