第四章 数据预处理
4.1 数据清洗
4.1.1 缺失值处理
- 拉格朗日插值:
- 有误,未改:以下代码存在已知问题(链式赋值、窗口越界),尚未修正。
# Fill missing sales values by Lagrange interpolation.
import pandas as pd
from scipy.interpolate import lagrange  # Lagrange interpolating polynomial
import xlwt

inputfile = './data/catering_sale.xls'  # path of the raw sales data
outputfile = './sales.xls'              # path of the cleaned output

data = pd.read_excel(inputfile)  # load the data

# Treat out-of-range sales figures as missing.
# Use .loc with a boolean mask instead of chained indexing
# (data[col][mask] = ...), which triggers SettingWithCopyWarning and
# silently fails under pandas copy-on-write.
outlier = (data[u'销量'] < 400) | (data[u'销量'] > 5000)
data.loc[outlier, u'销量'] = None
# Custom column-wise interpolation helper.
# s: the column (pandas Series); n: position to interpolate;
# k: number of neighbours taken on each side (default 5).
def ployinterp_column(s, n, k=5):
    """Interpolate s[n] with a Lagrange polynomial fitted to up to k
    valid neighbours on each side of position n.

    The neighbour window is clamped to [0, len(s)), so positions near
    either end of the series no longer produce out-of-range labels
    (range(n-k, n) went negative and range(n+1, n+1+k) ran past the
    end, which raises KeyError in modern pandas).
    """
    lo = max(0, n - k)              # clamp the left edge of the window
    hi = min(len(s), n + 1 + k)     # clamp the right edge of the window
    y = s[list(range(lo, n)) + list(range(n + 1, hi))]  # neighbours, skipping n
    y = y[y.notnull()]              # drop missing neighbours
    return lagrange(y.index, list(y))(n)  # evaluate the polynomial at n
# Walk every cell and interpolate the ones that are missing.
for i in data.columns:
    null_mask = data[i].isnull()  # hoisted: positions missing in the original column
    for j in range(len(data)):
        if null_mask[j]:  # missing -> interpolate
            # .loc avoids chained assignment (data[i][j] = ...), which is
            # unreliable and fails under pandas copy-on-write.
            data.loc[j, i] = ployinterp_column(data[i], j)

data.to_excel(outputfile)  # write the cleaned data to file
4.1.2 异常值处理
4.2 数据集成
4.2.1 实体识别
4.2.2 冗余属性识别
4.3 数据变换
4.3.1 简单函数变换
4.3.2 规范化
# -*- coding: utf-8 -*-
# Data normalization demo: three common rescaling schemes.
import pandas as pd
import numpy as np

datafile = '../data/normalization_data.xls'  # input file
data = pd.read_excel(datafile, header=None)  # load the raw matrix
print(data)

# Min-max scaling: map each column onto [0, 1].
a = (data - data.min()) / (data.max() - data.min())
# Z-score scaling: zero mean, unit standard deviation per column.
b = (data - data.mean()) / data.std()
# Decimal scaling: divide by the smallest power of 10 bounding |x| below 1.
c = data / 10 ** np.ceil(np.log10(data.abs().max()))

print(a, '\n', b, '\n', c)
0 1 2 3
0 78 521 602 2863
1 144 -600 -521 2245
2 95 -457 468 -1283
3 69 596 695 1054
4 190 527 691 2051
5 101 403 470 2487
6 146 413 435 2571
0 1 2 3
0 0.074380 0.937291 0.923520 1.000000
1 0.619835 0.000000 0.000000 0.850941
2 0.214876 0.119565 0.813322 0.000000
3 0.000000 1.000000 1.000000 0.563676
4 1.000000 0.942308 0.996711 0.804149
5 0.264463 0.838629 0.814967 0.909310
6 0.636364 0.846990 0.786184 0.929571
0 1 2 3
0 -0.905383 0.635863 0.464531 0.798149
1 0.604678 -1.587675 -2.193167 0.369390
2 -0.516428 -1.304030 0.147406 -2.078279
3 -1.111301 0.784628 0.684625 -0.456906
4 1.657146 0.647765 0.675159 0.234796
5 -0.379150 0.401807 0.152139 0.537286
6 0.650438 0.421642 0.069308 0.595564
0 1 2 3
0 0.078 0.521 0.602 0.2863
1 0.144 -0.600 -0.521 0.2245
2 0.095 -0.457 0.468 -0.1283
3 0.069 0.596 0.695 0.1054
4 0.190 0.527 0.691 0.2051
5 0.101 0.403 0.470 0.2487
6 0.146 0.413 0.435 0.2571
4.3.3 连续属性离散化
- 有误,已改,未复查:原代码使用了已废弃的 Pandas 接口(`sort`、`rolling_mean`),下方已替换为 `sort_values` 与 `rolling(...).mean()`。
# -*- coding: utf-8 -*-
# Discretize a continuous attribute three ways:
# equal width, equal frequency, and k-means clustering.
import pandas as pd

datafile = '../data/discretization_data.xls'  # input file
data = pd.read_excel(datafile)  # load the data
data = data[u'肝气郁结证型系数'].copy()
k = 4  # number of bins / clusters

# 1) Equal-width binning; the classes are labelled 0, 1, ..., k-1.
d1 = pd.cut(data, k, labels=range(k))

# 2) Equal-frequency binning: the k+1 quantiles serve as bin edges.
percents = [1.0 * i / k for i in range(k + 1)]
bins = data.describe(percentiles=percents)[4:4 + k + 1]  # quantiles via describe()
# Nudge the lowest edge down a hair so pd.cut (left-open intervals) still
# includes the minimum value.  Positional access via .iloc instead of
# bins[0]: an int key on a string-labelled Series is ambiguous and
# deprecated in modern pandas.
bins.iloc[0] = bins.iloc[0] * (1 - 1e-10)
d2 = pd.cut(data, bins, labels=range(k))

# 3) k-means based binning.
from sklearn.cluster import KMeans
# NOTE: the n_jobs argument was deprecated in scikit-learn 0.23 and removed
# in 1.0, so it is no longer passed here.
kmodel = KMeans(n_clusters=k)
kmodel.fit(data.values.reshape((len(data), 1)))  # fit on an (n, 1) column
c = pd.DataFrame(kmodel.cluster_centers_).sort_values(0)  # centres, ascending
w = c.rolling(2).mean().iloc[1:]  # midpoints of adjacent centres = inner edges
w = [0] + list(w[0]) + [data.max()]  # prepend/append the outer edges
d3 = pd.cut(data, w, labels=range(k))
def cluster_plot(d, k):
    """Visualise one discretization result: each sample's value on the
    x-axis against its bin label on the y-axis."""
    import matplotlib.pyplot as plt
    # Make Chinese labels and minus signs render correctly.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    plt.figure(figsize=(8, 3))
    for label in range(k):
        selected = data[d == label]
        plt.plot(selected, [label] * len(selected), 'o')
    plt.ylim(-0.5, k - 0.5)
    return plt
# Show the three discretization results for visual comparison.
cluster_plot(d1, k).show()  # 1) equal-width bins
cluster_plot(d2, k).show()  # 2) equal-frequency bins
cluster_plot(d3, k).show()  # 3) k-means bins
4.3.4 属性改造
# -*- coding: utf-8 -*-
# Attribute construction: derive the line-loss rate from supplied and
# delivered power.
import pandas as pd

# Parameter initialisation.
inputfile = '../data/electricity_data.xls'   # supplied / delivered power data
outputfile = '../tmp/electricity_data1.xls'  # file for the augmented data

data = pd.read_excel(inputfile)  # load the data
print(data)

# Line-loss rate = (supplied - delivered) / supplied.
supplied = data[u'供入电量']
delivered = data[u'供出电量']
data[u'线损率'] = (supplied - delivered) / supplied

print(data)
data.to_excel(outputfile, index = False)  # save the result
供入电量 供出电量
0 986 912
1 1208 1083
2 1108 975
3 1082 934
4 1285 1102
供入电量 供出电量 线损率
0 986 912 0.075051
1 1208 1083 0.103477
2 1108 975 0.120036
3 1082 934 0.136784
4 1285 1102 0.142412
4.3.5 小波变换
# -*- coding: utf-8 -*-
# Feature analysis of a signal via wavelet decomposition.
from scipy.io import loadmat  # .mat is MATLAB's format; loadmat reads it
import pywt  # PyWavelets

inputfile = '../data/leleccum.mat'  # signal file exported from MATLAB

mat = loadmat(inputfile)
signal = mat['leleccum'][0]
print(signal)
print('------------------------------------------------------')

# wavedec returns level+1 arrays: the approximation coefficients first,
# followed by the detail coefficients of each decomposition level.
coeffs = pywt.wavedec(signal, 'bior3.7', level = 5)
print(coeffs)
[420.20278994 423.52653517 423.52271225 ... 323.96580997 323.2400761
323.85476049]
------------------------------------------------------
[array([2415.1478541 , 2395.74470824, 2402.22022728, 2408.90987352,
2402.22022728, 2395.74470824, 2415.1478541 , 2369.53622493,
1958.0913368 , 1983.87619596, 1901.68851538, 1651.86483216,
1482.45129628, 1356.98779058, 1257.4459793 , 1265.75505172,
1363.66712581, 1427.53767222, 1568.87951307, 1893.80694993,
2295.89161125, 2555.9239482 , 2778.31817145, 2871.0940301 ,
2954.38189098, 2981.0281365 , 2986.06286012, 3091.56214184,
3085.0678644 , 2840.05639099, 2782.74679521, 2776.99922688,
2833.0658032 , 2907.76710805, 2496.58749928, 2443.95791914,
2338.50723857, 2394.15834442, 2186.86013504, 2142.10730351,
2066.37469747, 2097.47366057, 2190.20987484, 2024.82470966,
1999.88792082, 1761.22260043, 2012.8983115 , 1733.14320566,
1955.69105593, 2296.53399998, 2332.11621828, 2436.91433782,
2248.43497823, 1928.01215666, 1900.73383661, 1804.08152916,
1596.93576991, 1375.26325034, 1301.52662997, 1239.15426738,
1186.59596164, 1319.79503991, 1366.29061126, 1541.13036373,
1840.28203581, 2332.24861782, 2493.05709766, 2756.64959852,
2845.85405655, 2889.08956115, 2900.45305889, 2894.26919258,
2840.00331868, 2972.87057918, 2734.41261131, 2706.91816977,
2748.45656461, 2728.48445985, 2699.97766246, 2573.64021822,
2465.86126471, 2389.76210231, 2228.72532938, 2147.04749027,
2101.5149566 , 2060.59130892, 2073.90160123, 2125.05661853,
2006.49905922, 1892.43376708, 1792.18694605, 1688.28436526,
1759.13437455, 1662.84067347, 2211.37879446, 2298.66750686,
2229.3528378 , 2250.43556987, 1739.81121296, 1711.93766043,
1658.80982905, 1343.09569093, 1170.87330461, 930.3307274 ,
881.12593524, 806.05407736, 796.07602554, 770.19910471,
746.44388457, 872.75531896, 1072.73155416, 1203.88261161,
1402.5617364 , 1520.92224501, 1899.51673709, 1836.55406856,
1874.02882644, 1860.42136727, 1843.63833987, 1803.4998732 ,
1888.2388324 , 1808.61624732, 1669.61176324, 1589.11409167,
1454.81743823, 1309.27429412, 1217.19395153, 1155.90443861,
1100.16891616, 1135.66615726, 1090.30057756, 1175.24958262,
1265.17133627, 1224.91217397, 1174.91998265, 1124.20079064,
1081.12884006, 1128.58871491, 1168.71694008, 1687.91722313,
1793.37627801, 1885.84661105, 1821.46713782, 1836.21073473,
1850.45948483, 1795.06691925, 1850.45948483, 1836.21073473,
1821.46713782])
4.4 数据规约
4.4.1 属性规约
# -*- coding: utf-8 -*-
# Attribute reduction via principal component analysis: exploration step
# (fit a full PCA and inspect how the variance splits across components).
import pandas as pd

# Parameter initialisation.
inputfile = '../data/principal_component.xls'
outputfile = '../tmp/dimention_reducted1.xls'  # reduced data (written later)

data = pd.read_excel(inputfile, header = None)  # load the data
print(data)
print('------------------------------------------------------')

from sklearn.decomposition import PCA

pca = PCA()  # keep every component so the variance split is visible
pca.fit(data)
print(pca.components_)  # the principal axes, one eigenvector per row
print('------------------------------------------------------')
print(pca.explained_variance_ratio_)  # variance share of each component
0 1 2 3 4 5 6 7
0 40.4 24.7 7.2 6.1 8.3 8.7 2.442 20.0
1 25.0 12.7 11.2 11.0 12.9 20.2 3.542 9.1
2 13.2 3.3 3.9 4.3 4.4 5.5 0.578 3.6
3 22.3 6.7 5.6 3.7 6.0 7.4 0.176 7.3
4 34.3 11.8 7.1 7.1 8.0 8.9 1.726 27.5
5 35.6 12.5 16.4 16.7 22.8 29.3 3.017 26.6
6 22.0 7.8 9.9 10.2 12.6 17.6 0.847 10.6
7 48.4 13.4 10.9 9.9 10.9 13.9 1.772 17.8
8 40.6 19.1 19.8 19.0 29.7 39.6 2.449 35.8
9 24.8 8.0 9.8 8.9 11.9 16.2 0.789 13.7
10 12.5 9.7 4.2 4.2 4.6 6.5 0.874 3.9
11 1.8 0.6 0.7 0.7 0.8 1.1 0.056 1.0
12 32.3 13.9 9.4 8.3 9.8 13.3 2.126 17.1
13 38.5 9.1 11.3 9.5 12.2 16.4 1.327 11.6
------------------------------------------------------
[[ 0.56788461 0.2280431 0.23281436 0.22427336 0.3358618 0.43679539
0.03861081 0.46466998]
[ 0.64801531 0.24732373 -0.17085432 -0.2089819 -0.36050922 -0.55908747
0.00186891 0.05910423]
[-0.45139763 0.23802089 -0.17685792 -0.11843804 -0.05173347 -0.20091919
-0.00124421 0.80699041]
[-0.19404741 0.9021939 -0.00730164 -0.01424541 0.03106289 0.12563004
0.11152105 -0.3448924 ]
[-0.06133747 -0.03383817 0.12652433 0.64325682 -0.3896425 -0.10681901
0.63233277 0.04720838]
[ 0.02579655 -0.06678747 0.12816343 -0.57023937 -0.52642373 0.52280144
0.31167833 0.0754221 ]
[-0.03800378 0.09520111 0.15593386 0.34300352 -0.56640021 0.18985251
-0.69902952 0.04505823]
[-0.10147399 0.03937889 0.91023327 -0.18760016 0.06193777 -0.34598258
-0.02090066 0.02137393]]
------------------------------------------------------
[7.74011263e-01 1.56949443e-01 4.27594216e-02 2.40659228e-02
1.50278048e-03 4.10990447e-04 2.07718405e-04 9.24594471e-05]
# Dimensionality reduction: keep only the first 3 principal components.
import pandas as pd

# Parameter initialisation.
inputfile = '../data/principal_component.xls'
outputfile = '../tmp/dimention_reducted1.xls'  # reduced data

data = pd.read_excel(inputfile, header = None)  # load the data
print(data)
print('------------------------------------------------------')

from sklearn.decomposition import PCA

pca = PCA(3)  # project onto the top 3 components
pca.fit(data)
low_d = pca.transform(data)  # the reduced representation
pd.DataFrame(low_d).to_excel(outputfile)  # persist the reduced data
print(low_d)
[[ 8.19133694 16.90402785 3.90991029]
[ 0.28527403 -6.48074989 -4.62870368]
[-23.70739074 -2.85245701 -0.4965231 ]
[-14.43202637 2.29917325 -1.50272151]
[ 5.4304568 10.00704077 9.52086923]
[ 24.15955898 -9.36428589 0.72657857]
[ -3.66134607 -7.60198615 -2.36439873]
[ 13.96761214 13.89123979 -6.44917778]
[ 40.88093588 -13.25685287 4.16539368]
[ -1.74887665 -4.23112299 -0.58980995]
[-21.94321959 -2.36645883 1.33203832]
[-36.70868069 -6.00536554 3.97183515]
[ 3.28750663 4.86380886 1.00424688]
[ 5.99885871 4.19398863 -8.59953736]]
4.4.2 数值规约
4.5 常用预处理函数
import pandas as pd
import numpy as np

# Two equivalent ways to deduplicate a sequence.
D = pd.Series([1, 1, 2, 3, 5])
a = D.unique()    # pandas: unique values in first-seen order
b = np.unique(D)  # numpy: unique values, sorted ascending
print(a, '\n', b)
[1 2 3 5]
[1 2 3 5]
- PCA
import numpy as np
from sklearn.decomposition import PCA

# Fit a full PCA on a small random matrix and inspect the decomposition.
D = np.random.rand(10, 4)

pca = PCA()
pca.fit(D)

print(pca.components_)  # principal axes (one eigenvector per row)
print('----------------------')
print(pca.explained_variance_ratio_)  # variance explained by each axis
[[ 0.93015398 0.05459401 0.35245154 -0.087241 ]
[ 0.14059859 0.86040809 -0.47129117 0.13347128]
[-0.05332749 0.09686805 -0.11852097 -0.98677532]
[-0.33496536 0.49732685 0.79975954 -0.02913565]]
----------------------
[0.38769127 0.30546941 0.23075548 0.07608384]