python数据分析与挖掘实战第五章总结

最新推荐文章于 2023-02-21 11:24:47 发布

一蓑烟雨晴

最新推荐文章于 2023-02-21 11:24:47 发布

阅读量1.5k

点赞数 2

分类专栏： python数学分析与挖掘实战

本文链接：https://blog.csdn.net/qq_44941689/article/details/113656101

版权

python数学分析与挖掘实战专栏收录该内容

13 篇文章 5 订阅

订阅专栏

本博客旨在帮助学生自己巩固所学，若能帮得上他人也是荣幸之至
首先以下是借鉴过的几个github库，非常感谢：
https://github.com/apachecn/python_data_analysis_and_mining_action
https://github.com/keefecn/python_practice_of_data_analysis_and_mining
https://github.com/Stormzudi/Python-Data-Mining
https://github.com/Echo9573/DataAnalysisbyPython

1 岭回归分析

在这里插入图片描述
本案例为前七列指标回归第八列是否违约

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File  : 5-1_lasso.py
# @Author: Stormzudi
# @Date  : 2020/7/24 22:27

"""
实现岭回归分析
"""

import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge  # 通过sklearn.linermode1加载岭回归方法
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures  # 通过sklearn.preprocessing加载PolynomialFeatures用于创建多项式特征
from sklearn.model_selection import train_test_split  # 交叉验证

filename = '../data/bankloan.xls'
data_df = pd.read_excel(filename)
data = np.array(data_df)
# print(data[:, 8])
# plt.plot(data[:, 8])  # 展示车流量信息


X = data[:, 0:8]  # X用于保存1-4维数据,即属性
y = data[:, 8]  # y用于保存第5维数据,即车流量
poly = PolynomialFeatures(6)  # 用于创建最高次数6次方的的多项式特征,多次试验后决定采用6次
X = poly.fit_transform(X)  # X为创建的多项式特征

train_set_X, test_set_X, train_set_y, test_set_y = train_test_split(X, y, test_size=0.3, random_state=0)
# 将所有数据划分为训练集和测试集, test_ size表示测试集的比例， random_state是随机数种子

clf = Ridge(alpha=1.0, fit_intercept=True)  # 创建岭回归实例
clf.fit(train_set_X, train_set_y)  # 调用fit函数使用训练集训练回归器
score = clf.score(test_set_X, test_set_y)  # 评价拟合值的好坏(最大值：1)
print(score)
'''
# 利用测试集计算回归曲线的拟合优度，clf. score返回值为0.7620拟合优度,
# 用于评价拟合好坏,最大为1,无最小值,当对所有输入都输出同一个值时,拟合优度为0。
# '''

# 绘制拟合曲线
start = 200  # 画一段200到300范围内的拟合曲线
end = 300
y_pre = clf.predict(X)  # 是调用predict函数的拟合值
time = np.arange(start, end)

fig = plt.figure()  # 定义一个图片
ax = fig.add_subplot(1, 1, 1)
ax.plot(time, y[start:end], label='real')
ax.plot(time, y_pre[start:end],'r', label='predict')  # 展示真实数据(蓝色)以及拟合的曲线(红色)
plt.legend(loc = 'upper left') # 设置图例的位置
props = {
    'title' : 'Traffic flow forecast',
    'xlabel' : 'Period of time[200-300]',
    'ylabel' : 'Number of traffic'
}
ax.set(**props)
plt.show()

2 画一个决策树

在这里插入图片描述

# -*- coding: utf-8 -*-
# 使用ID3决策树算法预测销量高低
import pandas as pd
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.tree import export_graphviz


# 参数初始化
inputfile = '../data/sales_data.xls'
data = pd.read_excel(inputfile, index_col = u'序号')  # 导入数据

# 数据是类别标签，要将它转换为数据
# 用1来表示“好”“是”“高” 这三个属性，用-1来表示“坏”“否”“低”
data[data == u'好'] = 1
data[data == u'是'] = 1
data[data == u'高'] = 1
data[data != 1] = -1
x = data.iloc[:,:3].values.astype(int)
y = data.iloc[:,3].values.astype(int)


dtc = DTC(criterion='entropy')  # 建立决策树模型，基于信息熵
dtc.fit(x, y)  # 训练模型

# 导入相关函数，可视化决策树。
# 导出的结果是一个dot文件，需要安装Graphviz才能将它转换为pdf或png等格式。

x = pd.DataFrame(x)
with open("tree.dot", 'w') as f:
    f = export_graphviz(dtc, feature_names = x.columns, out_file = f)

digraph Tree {
node [shape=box] ;
0 [label="1 <= 0.0\nentropy = 0.998\nsamples = 34\nvalue = [16, 18]"] ;
1 [label="2 <= 0.0\nentropy = 0.934\nsamples = 20\nvalue = [13, 7]"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="0 <= 0.0\nentropy = 0.544\nsamples = 8\nvalue = [7, 1]"] ;
1 -> 2 ;
3 [label="entropy = 0.0\nsamples = 4\nvalue = [4, 0]"] ;
2 -> 3 ;
4 [label="entropy = 0.811\nsamples = 4\nvalue = [3, 1]"] ;
2 -> 4 ;
5 [label="0 <= 0.0\nentropy = 1.0\nsamples = 12\nvalue = [6, 6]"] ;
1 -> 5 ;
6 [label="entropy = 0.971\nsamples = 5\nvalue = [3, 2]"] ;
5 -> 6 ;
7 [label="entropy = 0.985\nsamples = 7\nvalue = [3, 4]"] ;
5 -> 7 ;
8 [label="0 <= 0.0\nentropy = 0.75\nsamples = 14\nvalue = [3, 11]"] ;
0 -> 8 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;
9 [label="2 <= 0.0\nentropy = 0.954\nsamples = 8\nvalue = [3, 5]"] ;
8 -> 9 ;
10 [label="entropy = 0.918\nsamples = 3\nvalue = [2, 1]"] ;
9 -> 10 ;
11 [label="entropy = 0.722\nsamples = 5\nvalue = [1, 4]"] ;
9 -> 11 ;
12 [label="entropy = 0.0\nsamples = 6\nvalue = [0, 6]"] ;
8 -> 12 ;
}

参见博客https://www.cnblogs.com/JXcola/p/12548288.html可转化为图片
在这里插入图片描述

3 k-means聚类并聚类可视化

在这里插入图片描述

# -*- coding: utf-8 -*-
# 使用K-Means算法聚类消费行为特征数据

from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号


# 参数初始化
inputfile = '../data/consumption_data.xls'  # 销量及其他属性数据
outputfile = '../tmp/data_type.xls'  # 保存结果的文件名
k = 3 #聚类的类别
iteration = 500 #聚类最大循环次数
data = pd.read_excel(inputfile, index_col = 'Id') #读取数据
data_zs = 1.0*(data - data.mean())/data.std() #数据标准化

model = KMeans(n_clusters = k, n_jobs = 4, max_iter = iteration) #分为k类，并发数4
model.fit(data_zs) #开始聚类

# 简单打印结果
r1 = pd.Series(model.labels_).value_counts()  # 统计各个类别的数目
r2 = pd.DataFrame(model.cluster_centers_)  # 找出聚类中心
r = pd.concat([r2, r1], axis = 1)  # 横向连接（0是纵向），得到聚类中心对应的类别下的数目
r.columns = list(data.columns) + [u'类别数目']  # 重命名表头
print(r)

# 详细输出原始数据及其类别
r = pd.concat([data, pd.Series(model.labels_, index = data.index)], axis = 1)   # 详细输出每个样本对应的类别
r.columns = list(data.columns) + [u'聚类类别']  # 重命名表头
r.to_excel(outputfile)  # 保存结果


def density_plot(data): #自定义作图函数
    p = data.plot(kind='kde', linewidth = 2, subplots = True, sharex = False)
    [p[i].set_ylabel(u'密度') for i in range(k)]
    plt.legend()
    return plt

pic_output = '../tmp/pd_' #概率密度图文件名前缀
for i in range(k):
    density_plot(data[r[u'聚类类别']==i]).savefig(u'%s%s.png' %(pic_output, i))


# 绘制聚类效果图
tsne = TSNE()
tsne.fit_transform(data_zs) #进行数据降维
tsne = pd.DataFrame(tsne.embedding_, index = data_zs.index) #转换数据格式


# 不同类别用不同颜色和样式绘图
fig = plt.figure()  # 增加一个画布
d = tsne[r[u'聚类类别'] == 0]
plt.plot(d[0], d[1], 'r.')
d = tsne[r[u'聚类类别'] == 1]
plt.plot(d[0], d[1], 'go')
d = tsne[r[u'聚类类别'] == 2]
plt.plot(d[0], d[1], 'b*')
plt.show()

在这里插入图片描述

4 关联规则 Apriori

在这里插入图片描述

#-*- coding: utf-8 -*-
#使用Apriori算法挖掘菜品订单关联规则
from __future__ import print_function
import pandas as pd
from apriori import * # 导入自行编写的apriori函数

inputfile = '../data/menu_orders.xls'
outputfile = '../tmp/apriori_rules.xls' #结果文件
data = pd.read_excel(inputfile, header = None)

print(u'\n转换原始数据至0-1矩阵...')
ct = lambda x : pd.Series(1, index = x[pd.notnull(x)]) #转换0-1矩阵的过渡函数
b = map(ct, data.values) #用map方式执行
data = pd.DataFrame(list(b)).fillna(0) #实现矩阵转换，空值用0填充
print(u'\n转换完毕。')
del b #删除中间变量b，节省内存

support = 0.2 #最小支持度
confidence = 0.5 #最小置信度
ms = '---' #连接符，默认'--'，用来区分不同元素，如A--B。需要保证原始表格中不含有该字符

find_rule(data, support, confidence, ms).to_excel(outputfile)  # 保存结果

#-*- coding: utf-8 -*-
from __future__ import print_function
import pandas as pd

#自定义连接函数，用于实现L_{k-1}到C_k的连接
def connect_string(x, ms):
    x = list(map(lambda i:sorted(i.split(ms)), x))
    l = len(x[0])
    r = []
    for i in range(len(x)):
        for j in range(i, len(x)):
            if x[i][:l-1] == x[j][:l-1] and x[i][l-1] != x[j][l-1]:
                r.append(x[i][:l-1]+sorted([x[j][l-1],x[i][l-1]]))
    return r

#寻找关联规则的函数
def find_rule(d, support, confidence, ms = u'--'):
    result = pd.DataFrame(index=['support', 'confidence']) #定义输出结果
  
    support_series = 1.0*d.sum()/len(d) #支持度序列
    column = list(support_series[support_series > support].index) #初步根据支持度筛选
    k = 0
  
    while len(column) > 1:
        k = k+1
        print(u'\n正在进行第%s次搜索...' %k)
        column = connect_string(column, ms)
        print(u'数目：%s...' %len(column))
        sf = lambda i: d[i].prod(axis=1, numeric_only = True) #新一批支持度的计算函数
    
    #创建连接数据，这一步耗时、耗内存最严重。当数据集较大时，可以考虑并行运算优化。
    d_2 = pd.DataFrame(list(map(sf,column)), index = [ms.join(i) for i in column]).T
    
    support_series_2 = 1.0*d_2[[ms.join(i) for i in column]].sum()/len(d) #计算连接后的支持度
    column = list(support_series_2[support_series_2 > support].index) #新一轮支持度筛选
    support_series = support_series.append(support_series_2)
    column2 = []
    
    for i in column: #遍历可能的推理，如{A,B,C}究竟是A+B-->C还是B+C-->A还是C+A-->B？
        i = i.split(ms)
        for j in range(len(i)):
            column2.append(i[:j]+i[j+1:]+i[j:j+1])
    
    cofidence_series = pd.Series(index=[ms.join(i) for i in column2]) #定义置信度序列
 
    for i in column2: #计算置信度序列
        cofidence_series[ms.join(i)] = support_series[ms.join(sorted(i))]/support_series[ms.join(i[:len(i)-1])]
    
    for i in cofidence_series[cofidence_series > confidence].index: #置信度筛选
        result[i] = 0.0
        result[i]['confidence'] = cofidence_series[i]
        result[i]['support'] = support_series[ms.join(sorted(i.split(ms)))]
  
    result = result.T.sort(['confidence','support'], ascending = False) #结果整理，输出
    print(u'\n结果为：')
    print(result)
  
    return result

           support  confidence
e---a          0.3    1.000000
e---c          0.3    1.000000
c---e---a      0.3    1.000000
a---e---c      0.3    1.000000
c---a          0.5    0.714286
a---c          0.5    0.714286
a---b          0.5    0.714286
c---b          0.5    0.714286
b---a          0.5    0.625000
b---c          0.5    0.625000
a---c---e      0.3    0.600000
b---c---a      0.3    0.600000
a---c---b      0.3    0.600000
a---b---c      0.3    0.600000

5 时间序列

有点乱，运行完看
在这里插入图片描述

#-*- coding: utf-8 -*-
#arima时序模型

import pandas as pd

#参数初始化
discfile = '../data/arima_data.xls'
forecastnum = 5

#读取数据，指定日期列为指标，Pandas自动将“日期”列识别为Datetime格式
data = pd.read_excel(discfile, index_col = u'日期')

#时序图
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False #用来正常显示负号
data.plot()
plt.show()

#自相关图
from statsmodels.graphics.tsaplots import plot_acf
plot_acf(data).show()

#平稳性检测
from statsmodels.tsa.stattools import adfuller as ADF
print(u'原始序列的ADF检验结果为：', ADF(data[u'销量']))
#返回值依次为adf、pvalue、usedlag、nobs、critical values、icbest、regresults、resstore

#差分后的结果
D_data = data.diff().dropna()
D_data.columns = [u'销量差分']
D_data.plot() #时序图
plt.show()
plot_acf(D_data).show() #自相关图
from statsmodels.graphics.tsaplots import plot_pacf
plot_pacf(D_data).show() #偏自相关图
print(u'差分序列的ADF检验结果为：', ADF(D_data[u'销量差分'])) #平稳性检测

#白噪声检验
from statsmodels.stats.diagnostic import acorr_ljungbox
print(u'差分序列的白噪声检验结果为：', acorr_ljungbox(D_data, lags=1)) #返回统计量和p值

from statsmodels.tsa.arima_model import ARIMA

data[u'销量'] = data[u'销量'].astype(float)
#定阶
pmax = int(len(D_data)/10)  # 一般阶数不超过length/10
qmax = int(len(D_data)/10)  # 一般阶数不超过length/10
bic_matrix = [] #bic矩阵
for p in range(pmax+1):
    tmp = []
    for q in range(qmax+1):
        try: #存在部分报错，所以用try来跳过报错。
            tmp.append(ARIMA(data, (p,1,q)).fit().bic)
        except:
            tmp.append(None)
    bic_matrix.append(tmp)

bic_matrix = pd.DataFrame(bic_matrix)  # 从中可以找出最小值

p,q = bic_matrix.stack().idxmin()  # 先用stack展平，然后用idxmin找出最小值位置。
print(u'BIC最小的p值和q值为：%s、%s' %(p,q)) 
model = ARIMA(data, (p,1,q)).fit()  # 建立ARIMA(0, 1, 1)模型
model.summary2()  # 给出一份模型报告
model.forecast(5)  # 作为期5天的预测，返回预测结果、标准误差、置信区间。

6 聚类分析查离群点

在这里插入图片描述

#-*- coding: utf-8 -*-
#使用K-Means算法聚类消费行为特征数据

import numpy as np
import pandas as pd

#参数初始化
inputfile = '../data/consumption_data.xls' #销量及其他属性数据
k = 3 #聚类的类别
threshold = 2 #离散点阈值
iteration = 500 #聚类最大循环次数
data = pd.read_excel(inputfile, index_col = 'Id') #读取数据
data_zs = 1.0*(data - data.mean())/data.std() #数据标准化

from sklearn.cluster import KMeans
model = KMeans(n_clusters = k, n_jobs = 4, max_iter = iteration) #分为k类，并发数4
model.fit(data_zs) #开始聚类

#标准化数据及其类别
r = pd.concat([data_zs, pd.Series(model.labels_, index = data.index)], axis = 1)  #每个样本对应的类别
r.columns = list(data.columns) + [u'聚类类别']  # 重命名表头

norm = []
for i in range(k):  # 逐一处理
  norm_tmp = r[['R', 'F', 'M']][r[u'聚类类别'] == i]-model.cluster_centers_[i]
  norm_tmp = norm_tmp.apply(np.linalg.norm, axis = 1) #求出绝对距离
  norm.append(norm_tmp/norm_tmp.median()) #求相对距离并添加

norm = pd.concat(norm) #合并

import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False  # 用来正常显示负号
norm[norm <= threshold].plot(style = 'go')  # 正常点

discrete_points = norm[norm > threshold]  # 离群点
discrete_points.plot(style = 'ro')

for i in range(len(discrete_points)):  # 离群点做标记
    id = discrete_points.index[i]
    n = discrete_points.iloc[i]
    plt.annotate('(%s, %0.2f)'%(id, n), xy = (id, n), xytext = (id, n))

plt.xlabel(u'编号')
plt.ylabel(u'相对距离')
plt.show()

在这里插入图片描述

一蓑烟雨晴

关注

2
点赞
踩
9

收藏

觉得还不错? 一键收藏
1
评论
python数据分析与挖掘实战第五章总结

本博客旨在帮助学生自己巩固所学，若能帮得上他人也是荣幸之至首先以下是借鉴过的几个github库，非常感谢：https://github.com/apachecn/python_data_analysis_and_mining_actionhttps://github.com/keefecn/python_practice_of_data_analysis_and_mininghttps://github.com/Stormzudi/Python-Data-Mininghttps://github.
复制链接

扫一扫