本博客旨在帮助学生自己巩固所学,若能帮得上他人也是荣幸之至
首先以下是借鉴过的几个github库,非常感谢:
https://github.com/apachecn/python_data_analysis_and_mining_action
https://github.com/keefecn/python_practice_of_data_analysis_and_mining
https://github.com/Stormzudi/Python-Data-Mining
https://github.com/Echo9573/DataAnalysisbyPython
主要参考https://blog.csdn.net/qq_41709378/article/details/107443313
1 拉格朗日插值填补缺失值
牛顿插值见https://blog.csdn.net/wwxy1995/article/details/84440110,二者结果一样的
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : 4-1_lagrange_newton_interp.py
# @Author: Stormzudi
# @Date : 2020/7/22 16:01
# 拉格朗日插值代码
import pandas as pd  # pandas for the data handling
from scipy.interpolate import lagrange  # Lagrange interpolation from SciPy

inputfile = '../data/catering_sale.xls'  # path to the sales data
outputfile = '../tmp/sales.xls'  # path for the interpolated output

data = pd.read_excel(inputfile)  # load the workbook

# Sales outside [400, 5000] are treated as outliers and blanked out.
# Write through .loc: the original chained indexing
# (data[col][mask] = None) assigns into a temporary copy and is
# rejected by modern pandas.
data.loc[(data[u'销量'] < 400) | (data[u'销量'] > 5000), u'销量'] = None
def ployinterp_column(s, n, k=5):
    """Interpolate the missing value at position n of column s.

    Fits a Lagrange polynomial through up to k non-null neighbours on
    each side of n and evaluates it at n.

    s: pandas Series whose index is the positional range 0..len(s)-1
    n: position of the value to interpolate
    k: neighbours taken on each side (default 5)

    Returns the interpolated value (a float).
    """
    # Clamp the window to the valid index range: the original
    # range(n - k, n) produced negative labels near the head of the
    # column, which raises KeyError on modern pandas (and silently
    # pulled in tail values on very old versions).
    lo = max(n - k, 0)
    hi = min(n + 1 + k, len(s))
    y = s[list(range(lo, n)) + list(range(n + 1, hi))]  # neighbours, excluding n itself
    y = y[y.notnull()]  # drop any other missing values in the window
    return lagrange(y.index, list(y))(n)  # fit and evaluate at n
# Scan every cell and interpolate the ones that are missing.
for i in data.columns:
    for j in range(len(data)):
        if (data[i].isnull())[j]:  # fill only cells that are NaN
            # Write through .loc: the original data[i][j] = ... chained
            # assignment modifies a temporary copy on modern pandas.
            data.loc[j, i] = ployinterp_column(data[i], j)
data.to_excel(outputfile)  # write the filled-in table out
2 规范化
# -*- coding: utf-8 -*-
# Data normalization demo: three common rescaling schemes.
import pandas as pd
import numpy as np

datafile = '../data/normalization_data.xls'  # input data path
data = pd.read_excel(datafile, header=None)  # read the raw matrix

# The original lines computed each normalization and discarded the
# result (a no-op when run as a script); print them so the output is
# actually visible.
print((data - data.min()) / (data.max() - data.min()))  # min-max scaling into [0, 1]
print((data - data.mean()) / data.std())  # z-score (zero-mean) standardization
print(data / 10 ** np.ceil(np.log10(data.abs().max())))  # decimal scaling by powers of 10
3 数据离散化
将该列数据用三种方法分成4份
分别是等宽,等频,基于聚类分析
#-*- coding: utf-8 -*-
# Data discretization (the original header said "normalization" — copy/paste slip):
# split one column into k bins by equal width, equal frequency, and k-means.
import pandas as pd

datafile = '../data/discretization_data.xls'  # input data path
data = pd.read_excel(datafile)  # read the data
data = data[u'肝气郁结证型系数'].copy()
k = 4  # number of bins

# --- equal-width binning: bins labelled 0..k-1 ---
d1 = pd.cut(data, k, labels=range(k))

# --- equal-frequency binning ---
w = [1.0 * i / k for i in range(k + 1)]
print(data.describe(percentiles=w))
# quantile() returns the k+1 bin edges directly; the original sliced
# describe() output by position, which breaks if describe's row layout changes.
w = list(data.quantile(w))
w[0] = w[0] * (1 - 1e-10)  # nudge the lowest edge down so the minimum lands in the first bin
d2 = pd.cut(data, w, labels=range(k))

# --- binning via 1-D k-means clustering ---
from sklearn.cluster import KMeans
# NOTE: the n_jobs argument was removed from KMeans in scikit-learn 1.0;
# explicit n_init/random_state make the run reproducible instead.
kmodel = KMeans(n_clusters=k, n_init=10, random_state=0)
kmodel.fit(data.values.reshape((len(data), 1)))  # fit on the column as an (n, 1) array
c = pd.DataFrame(kmodel.cluster_centers_).sort_values(0)  # cluster centres, sorted
w = c.rolling(2).mean().iloc[1:]  # midpoints of adjacent centres = inner bin edges
w = [0] + list(w[0]) + [data.max()]  # add the outer edges at both ends
d3 = pd.cut(data, w, labels=range(k))
def cluster_plot(d, k):  # helper that visualises one binning result
    """Plot each bin's members as a horizontal row of points at height = bin label.

    d: binned labels (a pd.cut result) aligned with the module-level ``data`` series
    k: number of bins
    Returns the pyplot module so the caller can ``.show()``.

    NOTE(review): reads the module-level ``data`` variable rather than
    taking the values as a parameter.
    """
    import matplotlib.pyplot as plt
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
    plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
    plt.figure(figsize=(8, 3))
    for j in range(k):
        members = data[d == j]
        # one row of points per bin; the original built the y list with a
        # comprehension over an unused loop variable
        plt.plot(members, [j] * len(members), 'o')
    plt.ylim(-0.5, k - 0.5)
    return plt
# Display the three discretization results one after another.
for binned in (d1, d2, d3):
    cluster_plot(binned, k).show()
展示结果中的一个图
4 小波变换
咱也不咋理解哈哈
#-*- coding: utf-8 -*-
# Feature analysis with a wavelet decomposition.

from scipy.io import loadmat  # .mat is MATLAB's format; loadmat reads it

# Signal file exported from MATLAB.
inputfile = '../data/leleccum.mat'

record = loadmat(inputfile)
print(record['leleccum'][0])
signal = record['leleccum'][0]

import pywt  # PyWavelets

# wavedec returns level+1 arrays: the approximation coefficients first,
# then the detail coefficients from coarsest to finest.
coeffs = pywt.wavedec(signal, 'bior3.7', level=5)
print(coeffs)
[420.20278994 423.52653517 423.52271225 ... 323.96580997 323.2400761
323.85476049]
[array([2415.1478541 , 2395.74470824, 2402.22022728, 2408.90987352,
2402.22022728, 2395.74470824, 2415.1478541 , 2369.53622493,
1958.0913368 , 1983.87619596, 1901.68851538, 1651.86483216,
1482.45129628, 1356.98779058, 1257.4459793 , 1265.75505172,
1363.66712581, 1427.53767222, 1568.87951307, 1893.80694993,
2295.89161125, 2555.9239482 , 2778.31817145, 2871.0940301 ,
2954.38189098, 2981.0281365 , 2986.06286012, 3091.56214184,
3085.0678644 , 2840.05639099, 2782.74679521, 2776.99922688,
2833.0658032 , 2907.76710805, 2496.58749928, 2443.95791914,
2338.50723857, 2394.15834442, 2186.86013504, 2142.10730351,
2066.37469747, 2097.47366057, 2190.20987484, 2024.82470966,
1999.88792082, 1761.22260043, 2012.8983115 , 1733.14320566,
1955.69105593, 2296.53399998, 2332.11621828, 2436.91433782,
2248.43497823, 1928.01215666, 1900.73383661, 1804.08152916,
1596.93576991, 1375.26325034, 1301.52662997, 1239.15426738,
1186.59596164, 1319.79503991, 1366.29061126, 1541.13036373,
1840.28203581, 2332.24861782, 2493.05709766, 2756.64959852,
2845.85405655, 2889.08956115, 2900.45305889, 2894.26919258,
2840.00331868, 2972.87057918, 2734.41261131, 2706.91816977,
2748.45656461, 2728.48445985, 2699.97766246, 2573.64021822,
2465.86126471, 2389.76210231, 2228.72532938, 2147.04749027,
2101.5149566 , 2060.59130892, 2073.90160123, 2125.05661853,
2006.49905922, 1892.43376708, 1792.18694605, 1688.28436526,
1759.13437455, 1662.84067347, 2211.37879446, 2298.66750686,
2229.3528378 , 2250.43556987, 1739.81121296, 1711.93766043,
1658.80982905, 1343.09569093, 1170.87330461, 930.3307274 ,
881.12593524, 806.05407736, 796.07602554, 770.19910471,
746.44388457, 872.75531896, 1072.73155416, 1203.88261161,
1402.5617364 , 1520.92224501, 1899.51673709, 1836.55406856,
1874.02882644, 1860.42136727, 1843.63833987, 1803.4998732 ,
1888.2388324 , 1808.61624732, 1669.61176324, 1589.11409167,
1454.81743823, 1309.27429412, 1217.19395153, 1155.90443861,
1100.16891616, 1135.66615726, 1090.30057756, 1175.24958262,
1265.17133627, 1224.91217397, 1174.91998265, 1124.20079064,
1081.12884006, 1128.58871491, 1168.71694008, 1687.91722313,
1793.37627801, 1885.84661105, 1821.46713782, 1836.21073473,
1850.45948483, 1795.06691925, 1850.45948483, 1836.21073473,
1821.46713782]), array([-5.11468240e+00, -7.78481941e+00, -5.59636066e+00, 5.68434189e-14,
5.59636066e+00, 7.78481941e+00, 5.11468240e+00, -2.81258232e+00,
3.26062134e+01, 2.68735619e+01, 1.20971007e+01, -5.07671404e-01,
7.62522769e+00, -6.84072854e-01, -1.16884926e+01, -1.18088402e+01,
1.33926665e+01, -2.01235299e+01, 3.48260315e+00, -3.54659040e+00,
2.01837609e+01, 1.15681152e+01, -1.18638949e+01, 6.84726925e+00,
-4.11590600e+00, -1.02132760e+01, -1.27666042e+01, -9.20952401e+00,
2.77461418e+01, -3.98479118e+01, 5.15429218e+01, 6.60716803e+01,
3.46710408e+01, 5.15409412e+01, -1.97726577e+00, -1.73356501e+01,
-2.98726832e+01, 6.88709748e+01, 4.13412171e+01, -3.34948636e+00,
-3.74146943e+00, 1.88164077e+00, -1.79587054e+01, -1.10115156e+01,
5.35815207e+00, -9.54447993e+00, -3.58666643e+01, -2.27141233e+01,
-1.10221405e+02, -8.13158319e+01, -7.74112994e+01, -7.01880092e+01,
-3.44548030e+01, -8.16498691e+00, 2.65900570e+00, -6.23319888e+00,
4.87340502e+00, 4.68468083e+00, 9.18690091e+00, 1.85430639e+01,
2.93779255e+00, 2.58783705e+01, 5.40120757e+00, 5.17675770e+01,
1.61772930e+01, 1.32514251e+01, 6.44356661e+00, 9.97394561e+00,
-6.32494684e+00, -1.37030499e+01, 7.82329250e+00, 2.65234860e-02,
-1.50525686e+01, -5.40883815e+00, -3.98406609e+01, 5.40847540e+01,
4.94428826e+01, 1.69192021e+01, 2.05407784e+01, 6.23907404e+00,
2.08672203e+00, 1.88346482e+01, 1.65051009e+01, -2.48767801e+01,
1.21863576e+01, 3.11490193e+01, 6.93708899e+00, -1.01772953e+01,
-9.74729168e+00, 1.89445313e+01, 7.99277116e+00, 7.44593412e+00,
-4.47889132e+00, -9.75438570e+01, -9.69116185e+01, -5.69268401e+01,
-3.98340942e+01, -4.56499591e+01, -1.41379229e+01, -4.08223971e+01,
-3.17335250e+01, -1.63609602e+00, -4.09533079e+01, -3.34194655e+01,
-2.24169494e+01, -2.33365528e+01, 1.66316122e+01, -8.71183199e+00,
-8.09314057e+00, -8.34584196e+00, 3.28930260e+01, 6.08832971e+01,
1.23425633e+01, -5.44692340e+01, -6.72967848e+01, -2.55981626e+01,
-1.59700109e+01, -5.97791744e+00, -6.95498159e+00, -1.49507701e+01,
6.94813379e+01, 2.99780448e+01, -2.88329977e+00, 1.96172085e+01,
1.07086206e+01, 1.35358732e+00, -9.96476145e+00, -9.87509065e+00,
3.28294967e-01, 4.38088766e+00, 2.28724457e+00, 7.26065444e+00,
1.73104487e-01, -9.90509246e-01, 1.12392282e+01, 6.73495793e+00,
-6.68352711e+00, -2.67788156e+00, -6.92047375e+01, -5.60700568e+01,
-6.98683373e+01, -6.56027327e+01, -7.28593276e+00, -3.41060513e-13,
7.28593276e+00, 0.00000000e+00, -7.28593276e+00, 3.41060513e-13,
7.28593276e+00]), array([ 1....]
进程已结束,退出代码 0
5 主成分分析PCA
#-*- coding: utf-8 -*-
# Principal component analysis (dimensionality reduction).
import pandas as pd
from sklearn.decomposition import PCA

# Parameter initialisation
inputfile = '../data/principal_component.xls'
outputfile = '../tmp/dimention_reducted.xls'  # where the reduced data is written

data = pd.read_excel(inputfile, header=None)  # load the raw matrix

# First pass: fit with all components to inspect the variance spectrum.
full_model = PCA()
full_model.fit(data)
print(full_model.components_)  # the principal axes (one eigenvector per row)
print(full_model.explained_variance_ratio_)  # variance share of each component
# The printed ratios show the first three components carry nearly all the variance.

print("-----------4-6.2-----------------")
# Second pass: keep only the top 3 components.
reducer = PCA(3)
reducer.fit(data)
reduced = reducer.transform(data)
pd.DataFrame(reduced).to_excel(outputfile)
print(reduced)
print(reducer.inverse_transform(reduced))  # map back into the original space