【数据分析与挖掘实战】02

第四章 数据预处理

4.1 数据清洗

4.1.1 缺失值处理

  • 拉格朗日插值:
  • 注:原书此段代码在新版 pandas 下有误,下面给出修正说明
# Fill missing values by Lagrange interpolation
import pandas as pd
from scipy.interpolate import lagrange # Lagrange polynomial fitting
import xlwt
inputfile = './data/catering_sale.xls' # path of the sales data
outputfile = './sales.xls' # output path
data = pd.read_excel(inputfile) # load data
# Treat out-of-range sales as missing values.
# Use .loc instead of chained indexing (data[col][mask] = ...), which only
# triggers SettingWithCopyWarning on old pandas and silently does nothing
# under pandas copy-on-write (2.x default behavior going forward).
data.loc[(data[u'销量'] < 400) | (data[u'销量'] > 5000), u'销量'] = None

# 自定义列向量插值函数
# s为列向量,n为被插值的位置,k为取前后的数据个数,默认为5
# Column-vector interpolation function.
# s: column vector (pandas Series); n: position to interpolate;
# k: number of neighboring points taken on each side (default 5).
def ployinterp_column(s, n, k=5):
    """Estimate s[n] by fitting a Lagrange polynomial on up to k valid
    neighbors on each side and evaluating it at n.

    The window is clamped to [0, len(s)); the original unclamped
    range(n-k, n) / range(n+1, n+1+k) produced labels outside the index
    near the ends of the series, which raises KeyError on pandas >= 1.0.
    """
    lo = max(0, n - k)                      # clamp left edge at the start
    hi = min(len(s), n + 1 + k)             # clamp right edge at the end
    idx = list(range(lo, n)) + list(range(n + 1, hi))
    y = s.iloc[idx]                          # positional selection is always in range
    y = y[y.notnull()]                       # drop missing neighbors
    return lagrange(y.index, list(y))(n)     # fit and evaluate at n

# Walk every cell; interpolate the ones that are missing
for i in data.columns:
    for j in range(len(data)):
        if (data[i].isnull())[j]: # missing -> interpolate
            # .loc writes back to the frame itself; the original chained
            # form data[i][j] = ... is a no-op under pandas copy-on-write
            data.loc[j, i] = ployinterp_column(data[i], j)

data.to_excel(outputfile) # write the filled table to file

4.1.2 异常值处理

4.2 数据集成

4.2.1 实体识别

4.2.2 冗余属性识别

4.3 数据变换

4.3.1 简单函数变换

4.3.2 规范化

#-*- coding: utf-8 -*-
# Data normalization: three common column-wise rescaling schemes.
import pandas as pd
import numpy as np

datafile = '../data/normalization_data.xls' # input data path

data = pd.read_excel(datafile, header = None) # load the raw table
print(data)
# Min-max scaling into [0, 1]
a = (data - data.min()) / (data.max() - data.min())
# Z-score (zero-mean, unit-variance) standardization
b = (data - data.mean()) / data.std()
# Decimal scaling: divide by the smallest power of 10 mapping values into (-1, 1)
c = data / 10 ** np.ceil(np.log10(data.abs().max()))
print(a,'\n',b,'\n',c)
     0    1    2     3
0   78  521  602  2863
1  144 -600 -521  2245
2   95 -457  468 -1283
3   69  596  695  1054
4  190  527  691  2051
5  101  403  470  2487
6  146  413  435  2571
          0         1         2         3
0  0.074380  0.937291  0.923520  1.000000
1  0.619835  0.000000  0.000000  0.850941
2  0.214876  0.119565  0.813322  0.000000
3  0.000000  1.000000  1.000000  0.563676
4  1.000000  0.942308  0.996711  0.804149
5  0.264463  0.838629  0.814967  0.909310
6  0.636364  0.846990  0.786184  0.929571 
           0         1         2         3
0 -0.905383  0.635863  0.464531  0.798149
1  0.604678 -1.587675 -2.193167  0.369390
2 -0.516428 -1.304030  0.147406 -2.078279
3 -1.111301  0.784628  0.684625 -0.456906
4  1.657146  0.647765  0.675159  0.234796
5 -0.379150  0.401807  0.152139  0.537286
6  0.650438  0.421642  0.069308  0.595564 
        0      1      2       3
0  0.078  0.521  0.602  0.2863
1  0.144 -0.600 -0.521  0.2245
2  0.095 -0.457  0.468 -0.1283
3  0.069  0.596  0.695  0.1054
4  0.190  0.527  0.691  0.2051
5  0.101  0.403  0.470  0.2487
6  0.146  0.413  0.435  0.2571

4.3.3 连续属性离散化

#-*- coding: utf-8 -*-
# Discretize a continuous attribute three ways: equal-width, equal-frequency,
# and k-means clustering. (Original header comment wrongly said "normalization".)
import pandas as pd

datafile = '../data/discretization_data.xls' # input data path
data = pd.read_excel(datafile) # load data
data = data[u'肝气郁结证型系数'].copy()
k = 4

d1 = pd.cut(data, k, labels = range(k)) # equal-width bins, classes named 0..k-1

# Equal-frequency discretization
w = [1.0*i/k for i in range(k+1)]
w = data.describe(percentiles = w)[4:4+k+1] # describe() computes the quantiles
w[0] = w[0]*(1-1e-10) # nudge the lower edge so the minimum falls inside bin 0
d2 = pd.cut(data, w, labels = range(k))

from sklearn.cluster import KMeans
# n_jobs was deprecated in scikit-learn 0.23 and removed in 1.0, so passing it
# raises TypeError on current versions; n_init is pinned to the historical
# default for reproducible behavior across sklearn releases.
kmodel = KMeans(n_clusters = k, n_init = 10)
kmodel.fit(data.values.reshape((len(data), 1))) # fit on a (n, 1) column
c = pd.DataFrame(kmodel.cluster_centers_).sort_values(0) # centers come unsorted
w = c.rolling(2).mean().iloc[1:] # midpoints of adjacent centers -> bin edges
w = [0] + list(w[0]) + [data.max()] # add the outer boundaries
d3 = pd.cut(data, w, labels = range(k))

def cluster_plot(d, k): # visualize a 1-D clustering assignment as a strip plot
  """Plot each sample of the module-level `data` at height equal to its class
  label in d (k classes) and return the pyplot module for further use."""
  import matplotlib.pyplot as plt
  plt.rcParams['font.sans-serif'] = ['SimHei'] # render CJK labels correctly
  plt.rcParams['axes.unicode_minus'] = False # render the minus sign correctly

  plt.figure(figsize = (8, 3))
  for label in range(k):
    members = data[d == label]
    plt.plot(members, [label] * len(members), 'o')

  plt.ylim(-0.5, k - 0.5)
  return plt

cluster_plot(d1, k).show()

cluster_plot(d2, k).show()
cluster_plot(d3, k).show()

(此处原文为三幅插图:分别是 d1 等宽离散化、d2 等频率离散化、d3 K-Means 聚类离散化的结果图)

4.3.4 属性构造

#-*- coding: utf-8 -*-
# Attribute construction: derive the line-loss rate from power supplied/delivered.
import pandas as pd

# Parameter initialization
inputfile = '../data/electricity_data.xls' # supplied/delivered power data
outputfile = '../tmp/electricity_data1.xls' # file for the augmented table

data = pd.read_excel(inputfile) # load data
print(data)
supplied = data[u'供入电量']
delivered = data[u'供出电量']
data[u'线损率'] = (supplied - delivered) / supplied # fraction of power lost in transit
print(data)
data.to_excel(outputfile, index = False) # persist the result

   供入电量  供出电量
0   986   912
1  1208  1083
2  1108   975
3  1082   934
4  1285  1102
   供入电量  供出电量       线损率
0   986   912  0.075051
1  1208  1083  0.103477
2  1108   975  0.120036
3  1082   934  0.136784
4  1285  1102  0.142412

4.3.5 小波变换

#-*- coding: utf-8 -*-
# Feature analysis of a signal via wavelet decomposition

# Parameter initialization
inputfile = '../data/leleccum.mat' # signal file exported from MATLAB

from scipy.io import loadmat # .mat is MATLAB's own format; loadmat reads it
signal = loadmat(inputfile)['leleccum'][0]
print(signal)
print('------------------------------------------------------')
import pywt # PyWavelets
# wavedec returns level+1 arrays: the approximation-coefficient array first,
# followed by the detail-coefficient arrays
coeffs = pywt.wavedec(signal, 'bior3.7', level = 5)
print(coeffs)
[420.20278994 423.52653517 423.52271225 ... 323.96580997 323.2400761
 323.85476049]
------------------------------------------------------
[array([2415.1478541 , 2395.74470824, 2402.22022728, 2408.90987352,
       2402.22022728, 2395.74470824, 2415.1478541 , 2369.53622493,
       1958.0913368 , 1983.87619596, 1901.68851538, 1651.86483216,
       1482.45129628, 1356.98779058, 1257.4459793 , 1265.75505172,
       1363.66712581, 1427.53767222, 1568.87951307, 1893.80694993,
       2295.89161125, 2555.9239482 , 2778.31817145, 2871.0940301 ,
       2954.38189098, 2981.0281365 , 2986.06286012, 3091.56214184,
       3085.0678644 , 2840.05639099, 2782.74679521, 2776.99922688,
       2833.0658032 , 2907.76710805, 2496.58749928, 2443.95791914,
       2338.50723857, 2394.15834442, 2186.86013504, 2142.10730351,
       2066.37469747, 2097.47366057, 2190.20987484, 2024.82470966,
       1999.88792082, 1761.22260043, 2012.8983115 , 1733.14320566,
       1955.69105593, 2296.53399998, 2332.11621828, 2436.91433782,
       2248.43497823, 1928.01215666, 1900.73383661, 1804.08152916,
       1596.93576991, 1375.26325034, 1301.52662997, 1239.15426738,
       1186.59596164, 1319.79503991, 1366.29061126, 1541.13036373,
       1840.28203581, 2332.24861782, 2493.05709766, 2756.64959852,
       2845.85405655, 2889.08956115, 2900.45305889, 2894.26919258,
       2840.00331868, 2972.87057918, 2734.41261131, 2706.91816977,
       2748.45656461, 2728.48445985, 2699.97766246, 2573.64021822,
       2465.86126471, 2389.76210231, 2228.72532938, 2147.04749027,
       2101.5149566 , 2060.59130892, 2073.90160123, 2125.05661853,
       2006.49905922, 1892.43376708, 1792.18694605, 1688.28436526,
       1759.13437455, 1662.84067347, 2211.37879446, 2298.66750686,
       2229.3528378 , 2250.43556987, 1739.81121296, 1711.93766043,
       1658.80982905, 1343.09569093, 1170.87330461,  930.3307274 ,
        881.12593524,  806.05407736,  796.07602554,  770.19910471,
        746.44388457,  872.75531896, 1072.73155416, 1203.88261161,
       1402.5617364 , 1520.92224501, 1899.51673709, 1836.55406856,
       1874.02882644, 1860.42136727, 1843.63833987, 1803.4998732 ,
       1888.2388324 , 1808.61624732, 1669.61176324, 1589.11409167,
       1454.81743823, 1309.27429412, 1217.19395153, 1155.90443861,
       1100.16891616, 1135.66615726, 1090.30057756, 1175.24958262,
       1265.17133627, 1224.91217397, 1174.91998265, 1124.20079064,
       1081.12884006, 1128.58871491, 1168.71694008, 1687.91722313,
       1793.37627801, 1885.84661105, 1821.46713782, 1836.21073473,
       1850.45948483, 1795.06691925, 1850.45948483, 1836.21073473,
       1821.46713782])

4.4 数据规约

4.4.1 属性规约

#-*- coding: utf-8 -*-
# Principal component analysis: inspect components and explained variance.
import pandas as pd

# Parameter initialization
inputfile = '../data/principal_component.xls'
outputfile = '../tmp/dimention_reducted1.xls' # path for the reduced data

data = pd.read_excel(inputfile, header = None) # load data
print(data)
print('------------------------------------------------------')
from sklearn.decomposition import PCA

pca = PCA() # no n_components: keep every component
pca.fit(data)
print(pca.components_) # one principal-axis direction vector per row
print('------------------------------------------------------')
print(pca.explained_variance_ratio_) # variance share of each component
       0     1     2     3     4     5      6     7
0   40.4  24.7   7.2   6.1   8.3   8.7  2.442  20.0
1   25.0  12.7  11.2  11.0  12.9  20.2  3.542   9.1
2   13.2   3.3   3.9   4.3   4.4   5.5  0.578   3.6
3   22.3   6.7   5.6   3.7   6.0   7.4  0.176   7.3
4   34.3  11.8   7.1   7.1   8.0   8.9  1.726  27.5
5   35.6  12.5  16.4  16.7  22.8  29.3  3.017  26.6
6   22.0   7.8   9.9  10.2  12.6  17.6  0.847  10.6
7   48.4  13.4  10.9   9.9  10.9  13.9  1.772  17.8
8   40.6  19.1  19.8  19.0  29.7  39.6  2.449  35.8
9   24.8   8.0   9.8   8.9  11.9  16.2  0.789  13.7
10  12.5   9.7   4.2   4.2   4.6   6.5  0.874   3.9
11   1.8   0.6   0.7   0.7   0.8   1.1  0.056   1.0
12  32.3  13.9   9.4   8.3   9.8  13.3  2.126  17.1
13  38.5   9.1  11.3   9.5  12.2  16.4  1.327  11.6
------------------------------------------------------
[[ 0.56788461  0.2280431   0.23281436  0.22427336  0.3358618   0.43679539
   0.03861081  0.46466998]
 [ 0.64801531  0.24732373 -0.17085432 -0.2089819  -0.36050922 -0.55908747
   0.00186891  0.05910423]
 [-0.45139763  0.23802089 -0.17685792 -0.11843804 -0.05173347 -0.20091919
  -0.00124421  0.80699041]
 [-0.19404741  0.9021939  -0.00730164 -0.01424541  0.03106289  0.12563004
   0.11152105 -0.3448924 ]
 [-0.06133747 -0.03383817  0.12652433  0.64325682 -0.3896425  -0.10681901
   0.63233277  0.04720838]
 [ 0.02579655 -0.06678747  0.12816343 -0.57023937 -0.52642373  0.52280144
   0.31167833  0.0754221 ]
 [-0.03800378  0.09520111  0.15593386  0.34300352 -0.56640021  0.18985251
  -0.69902952  0.04505823]
 [-0.10147399  0.03937889  0.91023327 -0.18760016  0.06193777 -0.34598258
  -0.02090066  0.02137393]]
------------------------------------------------------
[7.74011263e-01 1.56949443e-01 4.27594216e-02 2.40659228e-02
 1.50278048e-03 4.10990447e-04 2.07718405e-04 9.24594471e-05]
import pandas as pd

# Parameter initialization
inputfile = '../data/principal_component.xls'
outputfile = '../tmp/dimention_reducted1.xls' # path for the reduced data

data = pd.read_excel(inputfile, header = None) # load data
print(data)
print('------------------------------------------------------')
from sklearn.decomposition import PCA

pca = PCA(3) # keep only the first 3 principal components
pca.fit(data)
low_d = pca.transform(data) # project the samples onto those components
pd.DataFrame(low_d).to_excel(outputfile) # persist the reduced table
print(low_d) # the dimensionality-reduced data (n samples x 3)
[[  8.19133694  16.90402785   3.90991029]
 [  0.28527403  -6.48074989  -4.62870368]
 [-23.70739074  -2.85245701  -0.4965231 ]
 [-14.43202637   2.29917325  -1.50272151]
 [  5.4304568   10.00704077   9.52086923]
 [ 24.15955898  -9.36428589   0.72657857]
 [ -3.66134607  -7.60198615  -2.36439873]
 [ 13.96761214  13.89123979  -6.44917778]
 [ 40.88093588 -13.25685287   4.16539368]
 [ -1.74887665  -4.23112299  -0.58980995]
 [-21.94321959  -2.36645883   1.33203832]
 [-36.70868069  -6.00536554   3.97183515]
 [  3.28750663   4.86380886   1.00424688]
 [  5.99885871   4.19398863  -8.59953736]]

4.4.2 数值规约

4.5 常用预处理函数

import pandas as pd
import numpy as np

# Two equivalent ways to drop duplicates from a Series
D = pd.Series([1, 1, 2, 3, 5])
a = D.unique() # pandas: unique values in order of first appearance
b = np.unique(D) # numpy: unique values, sorted ascending
print(a,'\n',b)
[1 2 3 5] 
 [1 2 3 5]
  • PCA
from sklearn.decomposition import PCA
import numpy as np

# Fit a full PCA on a random 10x4 matrix and inspect the result.
D = np.random.rand(10, 4) # NOTE(review): unseeded, so output differs per run
pca = PCA()
pca.fit(D)
print(pca.components_) # principal-axis direction vectors, one per row
print('----------------------')
print(pca.explained_variance_ratio_) # per-component share of total variance
[[ 0.93015398  0.05459401  0.35245154 -0.087241  ]
 [ 0.14059859  0.86040809 -0.47129117  0.13347128]
 [-0.05332749  0.09686805 -0.11852097 -0.98677532]
 [-0.33496536  0.49732685  0.79975954 -0.02913565]]
----------------------
[0.38769127 0.30546941 0.23075548 0.07608384]
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值