python数据分析与挖掘实战第四章总结

本博客旨在帮助学生自己巩固所学,若能帮得上他人也是荣幸之至
首先以下是借鉴过的几个github库,非常感谢:
https://github.com/apachecn/python_data_analysis_and_mining_action
https://github.com/keefecn/python_practice_of_data_analysis_and_mining
https://github.com/Stormzudi/Python-Data-Mining
https://github.com/Echo9573/DataAnalysisbyPython

主要参考https://blog.csdn.net/qq_41709378/article/details/107443313

1 拉格朗日插值填补缺失值

牛顿插值见https://blog.csdn.net/wwxy1995/article/details/84440110,二者结果一样的

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File  : 4-1_lagrange_newton_interp.py
# @Author: Stormzudi
# @Date  : 2020/7/22 16:01

# Lagrange interpolation: fill missing sales values.
import pandas as pd  # data-analysis library
from scipy.interpolate import lagrange  # Lagrange polynomial interpolation

inputfile = '../data/catering_sale.xls'  # path to the sales data
outputfile = '../tmp/sales.xls'  # path for the filled output

data = pd.read_excel(inputfile)  # load the data
# Sales outside [400, 5000] are treated as outliers and blanked out so they
# get re-filled by interpolation below.  Use .loc with a (row-mask, column)
# pair instead of the book's chained assignment `data[col][mask] = None`,
# which writes to a temporary copy on modern pandas and never updates `data`.
data.loc[(data[u'销量'] < 400) | (data[u'销量'] > 5000), u'销量'] = None

# Custom column-vector interpolation function.
# s: column vector (Series); n: position to interpolate;
# k: number of neighbours taken on each side (default 5).
def ployinterp_column(s, n, k=5):
    """Estimate s[n] from a Lagrange polynomial fitted on up to k
    non-null neighbours on each side of position n.

    The neighbour window is clamped to [0, len(s)); the original used
    plain range(n-k, n) / range(n+1, n+1+k), which produced negative or
    out-of-range labels for positions near either end of the series and
    raised KeyError there.
    """
    window = list(range(max(n - k, 0), n)) + list(range(n + 1, min(n + 1 + k, len(s))))
    y = s[window]            # neighbouring values (labels == positions for a RangeIndex)
    y = y[y.notnull()]       # drop neighbours that are themselves missing
    return lagrange(y.index, list(y))(n)  # evaluate the fitted polynomial at n

# Walk every cell and interpolate the ones that are still missing.
for i in data.columns:
    for j in range(len(data)):
        if (data[i].isnull())[j]:  # missing -> interpolate
            # .loc writes into `data` itself; the book's `data[i][j] = ...`
            # is chained assignment and may update only a temporary copy.
            data.loc[j, i] = ployinterp_column(data[i], j)

data.to_excel(outputfile)  # persist the filled table

2 规范化

# -*- coding: utf-8 -*-
# Data normalization (rescaling) examples.
import pandas as pd
import numpy as np

datafile = '../data/normalization_data.xls'  # input path
data = pd.read_excel(datafile, header = None)  # load the raw matrix

# The book's version computed these three expressions and discarded the
# results (it was run interactively); as a script they were no-ops.
# Assign and print them so the script actually shows the output.
min_max = (data - data.min())/(data.max() - data.min())   # min-max scaling onto [0, 1]
zscore = (data - data.mean())/data.std()  # z-score: zero mean, unit variance per column
decimal_scaled = data/10**np.ceil(np.log10(data.abs().max()))  # decimal scaling: |x| <= 1

print(min_max)
print(zscore)
print(decimal_scaled)

3 数据离散化

将该列数据用三种方法分成4份
分别是等宽,等频,基于聚类分析

#-*- coding: utf-8 -*-
# Data discretization: split one column into k bins three different ways
# (equal width, equal frequency, k-means clustering).
import pandas as pd

datafile = '../data/discretization_data.xls'  # input path
data = pd.read_excel(datafile)  # load the data
data = data[u'肝气郁结证型系数'].copy()
k = 4

# 1) Equal-width binning; categories are labelled 0, 1, 2, 3.
d1 = pd.cut(data, k, labels=range(k))


# 2) Equal-frequency binning via quantiles.
w = [1.0 * i / k for i in range(k + 1)]
print(data.describe(percentiles=w))
w = data.describe(percentiles=w)[4:4 + k + 1]  # describe() computes the requested quantiles
w[0] = w[0] * (1 - 1e-10)  # nudge the lowest edge down so the minimum lands inside bin 0
d2 = pd.cut(data, w, labels=range(k))

# 3) K-means based binning: cluster the 1-D data, then cut at the midpoints
# between adjacent cluster centres.
from sklearn.cluster import KMeans  # bring in KMeans
# NOTE: the book passed n_jobs=4 here; that argument was deprecated in
# scikit-learn 0.23 and removed in 1.0.  n_init is pinned to keep the old
# default behaviour explicit.
kmodel = KMeans(n_clusters=k, n_init=10)  # build the model
kmodel.fit(data.values.reshape((len(data), 1)))  # fit on the column as an (n, 1) matrix
c = pd.DataFrame(kmodel.cluster_centers_).sort_values(0)  # centres, sorted (order is random)
w = c.rolling(2).mean().iloc[1:]  # midpoints of adjacent centres = inner bin edges
w = [0] + list(w[0]) + [data.max()]  # prepend/append the outer edges
d3 = pd.cut(data, w, labels=range(k))


def cluster_plot(d, k):  # visualize a discretization result
    """Plot each of the k categories in `d` as one horizontal row of points.

    `d` is a categorical series aligned with the module-level `data`;
    returns the pyplot module so the caller can .show() the figure.
    """
    import matplotlib.pyplot as plt
    plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK labels correctly
    plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly

    plt.figure(figsize=(8, 3))
    for cat in range(k):
        members = data[d == cat]
        plt.plot(members, [cat] * len(members), 'o')

    plt.ylim(-0.5, k - 0.5)
    return plt


# Display the three discretization results one after another.
for result in (d1, d2, d3):
    cluster_plot(result, k).show()

展示结果中的一个图
(图:聚类离散化结果展示图,原文图片抓取时丢失)

4 小波变换

咱也不咋理解哈哈

#-*- coding: utf-8 -*-
# Feature analysis via wavelet decomposition.

from scipy.io import loadmat  # .mat is MATLAB's format; loadmat reads it
import pywt  # PyWavelets

# Signal file exported from MATLAB.
inputfile = '../data/leleccum.mat'

mat = loadmat(inputfile)
signal = mat['leleccum'][0]
print(signal)

# wavedec returns level+1 arrays: the approximation coefficients first,
# followed by the detail coefficients for each level.
coeffs = pywt.wavedec(signal, 'bior3.7', level=5)
print(coeffs)
[420.20278994 423.52653517 423.52271225 ... 323.96580997 323.2400761
 323.85476049]
[array([2415.1478541 , 2395.74470824, 2402.22022728, 2408.90987352,
       2402.22022728, 2395.74470824, 2415.1478541 , 2369.53622493,
       1958.0913368 , 1983.87619596, 1901.68851538, 1651.86483216,
       1482.45129628, 1356.98779058, 1257.4459793 , 1265.75505172,
       1363.66712581, 1427.53767222, 1568.87951307, 1893.80694993,
       2295.89161125, 2555.9239482 , 2778.31817145, 2871.0940301 ,
       2954.38189098, 2981.0281365 , 2986.06286012, 3091.56214184,
       3085.0678644 , 2840.05639099, 2782.74679521, 2776.99922688,
       2833.0658032 , 2907.76710805, 2496.58749928, 2443.95791914,
       2338.50723857, 2394.15834442, 2186.86013504, 2142.10730351,
       2066.37469747, 2097.47366057, 2190.20987484, 2024.82470966,
       1999.88792082, 1761.22260043, 2012.8983115 , 1733.14320566,
       1955.69105593, 2296.53399998, 2332.11621828, 2436.91433782,
       2248.43497823, 1928.01215666, 1900.73383661, 1804.08152916,
       1596.93576991, 1375.26325034, 1301.52662997, 1239.15426738,
       1186.59596164, 1319.79503991, 1366.29061126, 1541.13036373,
       1840.28203581, 2332.24861782, 2493.05709766, 2756.64959852,
       2845.85405655, 2889.08956115, 2900.45305889, 2894.26919258,
       2840.00331868, 2972.87057918, 2734.41261131, 2706.91816977,
       2748.45656461, 2728.48445985, 2699.97766246, 2573.64021822,
       2465.86126471, 2389.76210231, 2228.72532938, 2147.04749027,
       2101.5149566 , 2060.59130892, 2073.90160123, 2125.05661853,
       2006.49905922, 1892.43376708, 1792.18694605, 1688.28436526,
       1759.13437455, 1662.84067347, 2211.37879446, 2298.66750686,
       2229.3528378 , 2250.43556987, 1739.81121296, 1711.93766043,
       1658.80982905, 1343.09569093, 1170.87330461,  930.3307274 ,
        881.12593524,  806.05407736,  796.07602554,  770.19910471,
        746.44388457,  872.75531896, 1072.73155416, 1203.88261161,
       1402.5617364 , 1520.92224501, 1899.51673709, 1836.55406856,
       1874.02882644, 1860.42136727, 1843.63833987, 1803.4998732 ,
       1888.2388324 , 1808.61624732, 1669.61176324, 1589.11409167,
       1454.81743823, 1309.27429412, 1217.19395153, 1155.90443861,
       1100.16891616, 1135.66615726, 1090.30057756, 1175.24958262,
       1265.17133627, 1224.91217397, 1174.91998265, 1124.20079064,
       1081.12884006, 1128.58871491, 1168.71694008, 1687.91722313,
       1793.37627801, 1885.84661105, 1821.46713782, 1836.21073473,
       1850.45948483, 1795.06691925, 1850.45948483, 1836.21073473,
       1821.46713782]), array([-5.11468240e+00, -7.78481941e+00, -5.59636066e+00,  5.68434189e-14,
        5.59636066e+00,  7.78481941e+00,  5.11468240e+00, -2.81258232e+00,
        3.26062134e+01,  2.68735619e+01,  1.20971007e+01, -5.07671404e-01,
        7.62522769e+00, -6.84072854e-01, -1.16884926e+01, -1.18088402e+01,
        1.33926665e+01, -2.01235299e+01,  3.48260315e+00, -3.54659040e+00,
        2.01837609e+01,  1.15681152e+01, -1.18638949e+01,  6.84726925e+00,
       -4.11590600e+00, -1.02132760e+01, -1.27666042e+01, -9.20952401e+00,
        2.77461418e+01, -3.98479118e+01,  5.15429218e+01,  6.60716803e+01,
        3.46710408e+01,  5.15409412e+01, -1.97726577e+00, -1.73356501e+01,
       -2.98726832e+01,  6.88709748e+01,  4.13412171e+01, -3.34948636e+00,
       -3.74146943e+00,  1.88164077e+00, -1.79587054e+01, -1.10115156e+01,
        5.35815207e+00, -9.54447993e+00, -3.58666643e+01, -2.27141233e+01,
       -1.10221405e+02, -8.13158319e+01, -7.74112994e+01, -7.01880092e+01,
       -3.44548030e+01, -8.16498691e+00,  2.65900570e+00, -6.23319888e+00,
        4.87340502e+00,  4.68468083e+00,  9.18690091e+00,  1.85430639e+01,
        2.93779255e+00,  2.58783705e+01,  5.40120757e+00,  5.17675770e+01,
        1.61772930e+01,  1.32514251e+01,  6.44356661e+00,  9.97394561e+00,
       -6.32494684e+00, -1.37030499e+01,  7.82329250e+00,  2.65234860e-02,
       -1.50525686e+01, -5.40883815e+00, -3.98406609e+01,  5.40847540e+01,
        4.94428826e+01,  1.69192021e+01,  2.05407784e+01,  6.23907404e+00,
        2.08672203e+00,  1.88346482e+01,  1.65051009e+01, -2.48767801e+01,
        1.21863576e+01,  3.11490193e+01,  6.93708899e+00, -1.01772953e+01,
       -9.74729168e+00,  1.89445313e+01,  7.99277116e+00,  7.44593412e+00,
       -4.47889132e+00, -9.75438570e+01, -9.69116185e+01, -5.69268401e+01,
       -3.98340942e+01, -4.56499591e+01, -1.41379229e+01, -4.08223971e+01,
       -3.17335250e+01, -1.63609602e+00, -4.09533079e+01, -3.34194655e+01,
       -2.24169494e+01, -2.33365528e+01,  1.66316122e+01, -8.71183199e+00,
       -8.09314057e+00, -8.34584196e+00,  3.28930260e+01,  6.08832971e+01,
        1.23425633e+01, -5.44692340e+01, -6.72967848e+01, -2.55981626e+01,
       -1.59700109e+01, -5.97791744e+00, -6.95498159e+00, -1.49507701e+01,
        6.94813379e+01,  2.99780448e+01, -2.88329977e+00,  1.96172085e+01,
        1.07086206e+01,  1.35358732e+00, -9.96476145e+00, -9.87509065e+00,
        3.28294967e-01,  4.38088766e+00,  2.28724457e+00,  7.26065444e+00,
        1.73104487e-01, -9.90509246e-01,  1.12392282e+01,  6.73495793e+00,
       -6.68352711e+00, -2.67788156e+00, -6.92047375e+01, -5.60700568e+01,
       -6.98683373e+01, -6.56027327e+01, -7.28593276e+00, -3.41060513e-13,
        7.28593276e+00,  0.00000000e+00, -7.28593276e+00,  3.41060513e-13,
        7.28593276e+00]), array([ 1....]

进程已结束,退出代码 0

5 主成分分析PCA

#-*- coding: utf-8 -*-
# Dimensionality reduction with principal component analysis.
import pandas as pd
from sklearn.decomposition import PCA

# Paths.
inputfile = '../data/principal_component.xls'
outputfile = '../tmp/dimention_reducted.xls'  # where the reduced data is written

data = pd.read_excel(inputfile, header=None)  # load the feature matrix

# First fit with every component kept, to inspect the explained variance.
pca = PCA()
pca.fit(data)
print(pca.components_)  # principal axes (eigenvectors)
print(pca.explained_variance_ratio_)  # fraction of variance per component
# The printout shows the first three components already cover most of it.
print("-----------4-6.2-----------------")
# Refit keeping only 3 components and project the data onto them.
pca = PCA(3)
pca.fit(data)
low_d = pca.transform(data)
pd.DataFrame(low_d).to_excel(outputfile)  # save the reduced data
print(low_d)
print(pca.inverse_transform(low_d))  # map back into the original space

6 Python 主要数据预处理函数

(图:Python 主要数据预处理函数汇总表,原文图片抓取时丢失)

92讲视频课+16大项目实战+课件源码  为什么学习数据分析?       人工智能、大数据时代有什么技能是可以运用在各种行业的?数据分析就是。      从海量数据中获得别人看不见的信息,创业者可以通过数据分析来优化产品,营销人员可以通过数据分析改进营销策略,产品经理可以通过数据分析洞察用户习惯,金融从业者可以通过数据分析规避投资风险,程序员可以通过数据分析进一步挖掘出数据价值,它和编程一样,本质上也是一个工具,通过数据来对现实事物进行分析和识别的能力。不管你从事什么行业,掌握了数据分析能力,往往在其岗位上更有竞争力。   本课程共包含五大模块: 一、先导篇: 通过分析数据分析师的一天,让学员了解全面了解成为一个数据分析师的所有必修功法,对数据分析师不在迷惑。  二、基础篇: 围绕Python基础语法介绍、数据预处理、数据可视化以及数据分析挖掘......这些核心技能模块展开,帮助你快速而全面的掌握和了解成为一个数据分析师的所有必修功法。 三、数据采集篇: 通过网络爬虫实战解决数据分析的必经之路:数据从何来的问题,讲解常见的爬虫套路并利用三大实战帮助学员扎实数据采集能力,避免没有数据可分析的尴尬。  四、分析工具篇: 讲解数据分析避不开的科学计算库Numpy、数据分析工具Pandas及常见可视化工具Matplotlib。  五、算法篇: 算法是数据分析的精华,课程精选10大算法,包括分类、聚类、预测3大类型,每个算法都从原理和案例两个角度学习,让你不仅能用起来,了解原理,还能知道为什么这么做。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值