python hstack_Python数据分析总结,干货资料分享!

970e26fea9cde5aef29bc47bf82c0158.png

(图片来自网络配图)

Python数据分析学习总结

概述

● 数据分析的含义与目标

方法:统计分析方法

目标:提取有用信息

手段:研究、概括、总结

● Python与数据分析

Python特点:简洁、开发效率高、运算速度慢、胶水特性(集成C语言)

Python数据分析:numpy、scipy、matplotlib、pandas、scikit-learn、keras…

● Python数据分析大家族

numpy:数据结构基础

scipy:强大的科学计算方法(矩阵分析、信号分析、数理分析…)

matplotlib:丰富的可视化套件

pandas:基础数据分析套件

scikit-learn:强大的数据分析建模库

keras:人工神经网络

● Python数据分析环境搭建

平台:Windows、Linux

科学计算工具:Anaconda

Python数据分析基础

● numpy

开源、数据计算扩展;ndarray、多维操作、线性代数

● numpy使用程序

import numpy as np


def main():
    """Build an ndarray from a nested list and print its basic attributes."""
    lst = [[1, 3, 5], [2, 4, 6]]
    print(type(lst))
    np_lst = np.array(lst)
    print(type(np_lst))
    # np.float was removed in NumPy 1.24; the builtin float selects float64.
    np_lst = np.array(lst, dtype=float)
    print(np_lst.shape)     # (rows, columns)
    print(np_lst.ndim)      # number of dimensions
    print(np_lst.dtype)     # element type
    print(np_lst.itemsize)  # bytes per element
    print(np_lst.size)      # total number of elements


if __name__ == "__main__":
    main()

# Output:
# <class 'list'>
# <class 'numpy.ndarray'>
# (2, 3)
# 2
# float64
# 8
# 6

● numpy常用数组

import numpy as np

# Common ways to create arrays: constant-filled arrays and random samples.
print(np.zeros([2, 4]))
print(np.ones([3, 5]))
print(np.random.rand(2, 4))
print(np.random.rand())
print("RandInt:")
print(np.random.randint(1, 10, 3))
print("Randn:")  # standard normal distribution
print(np.random.randn(2, 4))
print("Choice")
print(np.random.choice([10, 20, 30]))
print("Distribute:")  # Beta distribution
print(np.random.beta(1, 10, 100))

# Sample output (random values differ on every run):
# [[ 0.  0.  0.  0.]
#  [ 0.  0.  0.  0.]]
# [[ 1.  1.  1.  1.  1.]
#  [ 1.  1.  1.  1.  1.]
#  [ 1.  1.  1.  1.  1.]]
# [[ 0.80307088  0.25491367  0.54381007  0.10159737]
#  [ 0.71565024  0.62473538  0.66892166  0.41078071]]
# 0.16467244260637237
# RandInt:
# [5 3 2]
# Randn:
# [[-0.51707383 -1.46091351 -0.78197086  0.44640286]
#  [-0.0998081   0.40701679  0.07750661  0.66041753]]
# Choice
# 10
# Distribute:
# [ 0.03897375  0.09804991  0.1617222  ...,  0.12878516  0.11699157  0.05681225]

● numpy常用操作

import numpy as np

# --- Element-wise math -------------------------------------------------
# The printed results below correspond to this nested list.
lst = [[1, 3, 5], [2, 4, 6]]
print("Arange:")
print(np.arange(1, 11))
print("Exp:")
print(np.exp(lst))
print("Exp2:")
print(np.exp2(lst))
print("Sqrt:")
print(np.sqrt(lst))
print("Sin:")
print(np.sin(lst))
print("Log:")
print(np.log(lst))

# Output:
# Arange:
# [ 1  2  3  4  5  6  7  8  9 10]
# Exp:
# [[   2.71828183   20.08553692  148.4131591 ]
#  [   7.3890561    54.59815003  403.42879349]]
# Exp2:
# [[  2.   8.  32.]
#  [  4.  16.  64.]]
# Sqrt:
# [[ 1.          1.73205081  2.23606798]
#  [ 1.41421356  2.          2.44948974]]
# Sin:
# [[ 0.84147098  0.14112001 -0.95892427]
#  [ 0.90929743 -0.7568025  -0.2794155 ]]
# Log:
# [[ 0.          1.09861229  1.60943791]
#  [ 0.69314718  1.38629436  1.79175947]]

# --- Reductions along an axis of a 3 x 2 x 4 array ----------------------
lst = np.array([[[1, 2, 3, 4], [4, 5, 6, 7]],
                [[7, 8, 9, 10], [10, 11, 12, 13]],
                [[14, 15, 16, 17], [18, 19, 20, 11]]])
print(lst.sum(axis=2))
print(lst.sum(axis=1))
print(lst.sum(axis=0))
print("Max:")
print(lst.max(axis=1))
print("Min:")
print(lst.min(axis=0))

# Output:
# [[10 22]
#  [34 46]
#  [62 68]]
# [[ 5  7  9 11]
#  [17 19 21 23]
#  [32 34 36 28]]
# [[22 25 28 31]
#  [32 35 38 31]]
# Max:
# [[ 4  5  6  7]
#  [10 11 12 13]
#  [18 19 20 17]]
# Min:
# [[1 2 3 4]
#  [4 5 6 7]]

# --- Arithmetic, stacking and splitting of 1-D arrays -------------------
lst1 = np.array([10, 20, 30, 40])
lst2 = np.array([4, 3, 2, 1])
print("Add:")
print(lst1 + lst2)
print("Sub:")
print(lst1 - lst2)
print("Mul:")
print(lst1 * lst2)
print("Div:")
print(lst1 / lst2)
print("Square:")
print(lst1 ** 2)
print("Dot:")
print(np.dot(lst1.reshape([2, 2]), lst2.reshape([2, 2])))
print("Concatenate:")
print(np.concatenate((lst1, lst2), axis=0))
print("vstack:")
print(np.vstack((lst1, lst2)))
print("hstack:")
print(np.hstack((lst1, lst2)))
print("Split:")
print(np.split(lst1, 2))
print(np.split(lst1, 4))
print("Copy:")
print(np.copy(lst1))

# Output:
# Add:
# [14 23 32 41]
# Sub:
# [ 6 17 28 39]
# Mul:
# [40 60 60 40]
# Div:
# [  2.5          6.66666667  15.          40.        ]
# Square:
# [ 100  400  900 1600]
# Dot:
# [[ 80  50]
#  [200 130]]
# Concatenate:
# [10 20 30 40  4  3  2  1]
# vstack:
# [[10 20 30 40]
#  [ 4  3  2  1]]
# hstack:
# [10 20 30 40  4  3  2  1]
# Split:
# [array([10, 20]), array([30, 40])]
# [array([10]), array([20]), array([30]), array([40])]
# Copy:
# [10 20 30 40]

● 线性方程组

import numpy as np
from numpy.linalg import det, eig, inv


def main():
    """Print the identity matrix and the inverse, transpose, determinant
    and eigen-decomposition of a 2x2 matrix."""
    print(np.eye(3))
    lst = np.array([[1, 2], [3, 4]])
    print("Inv:")
    print(inv(lst))
    print("T:")
    print(lst.transpose())
    print("Det:")
    print(det(lst))
    print("Eig:")
    print(eig(lst))  # (eigenvalues, eigenvectors)


if __name__ == "__main__":
    main()

# Output:
# [[ 1.  0.  0.]
#  [ 0.  1.  0.]
#  [ 0.  0.  1.]]
# Inv:
# [[-2.   1. ]
#  [ 1.5 -0.5]]
# T:
# [[1 3]
#  [2 4]]
# Det:
# -2.0
# Eig:
# (array([-0.37228132,  5.37228132]), array([[-0.82456484, -0.41597356],
#        [ 0.56576746, -0.90937671]]))

● numpy其他方面应用

import numpy as np
from numpy.linalg import *  # kept from the original listing; not used below


def main():
    """Show three other numpy features: FFT, correlation coefficients
    and one-dimensional polynomials."""
    print("FFT:")
    print(np.fft.fft(np.array([1, 1, 1, 1, 1, 1, 1, 1])))
    print("Coef:")
    print(np.corrcoef([1, 0, 1], [0, 2, 1]))
    print("Poly:")
    print(np.poly1d([2, 1, 3]))  # coefficients, highest power first


if __name__ == "__main__":
    main()

# Output:
# FFT:
# [ 8.+0.j  0.+0.j  0.+0.j  0.+0.j  0.+0.j  0.+0.j  0.+0.j  0.+0.j]
# Coef:
# [[ 1.        -0.8660254]
#  [-0.8660254  1.       ]]
# Poly:
#    2
# 2 x + 1 x + 3

● matplotlib

● 概述

matplotlib是关键的绘图库。

● 实现

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm


def main():
    """Draw a cos/sin line chart, then one figure holding several sample
    plot types (scatter, bar, pie, polar, heat map, contour, 3D)."""
    # --- Line chart (its own figure) ---
    x = np.linspace(-np.pi, np.pi, 256, endpoint=True)
    c, s = np.cos(x), np.sin(x)
    plt.figure(1)
    plt.plot(x, c, color="blue", linewidth=1.0, linestyle="-", label="COS", alpha=0.5)
    plt.plot(x, s, "r*", label="SIN")
    plt.title("COS & SIN")
    ax = plt.gca()
    # Hide the top/right spines and move the other two through the origin.
    ax.spines["right"].set_color("none")
    ax.spines["top"].set_color("none")
    ax.spines["left"].set_position(("data", 0))
    ax.spines["bottom"].set_position(("data", 0))
    ax.xaxis.set_ticks_position("bottom")
    ax.yaxis.set_ticks_position("left")
    plt.show()

    # --- Multi-panel figure ---
    # The figure is shown once, at the end. The original called plt.show()
    # after every panel; once a blocking show() returns the window is gone
    # and later pyplot calls would draw into a new, unrelated figure.
    fig = plt.figure()

    # Scatter (row 1, column 1 of a 3x3 grid)
    ax = fig.add_subplot(3, 3, 1)
    n = 128
    X = np.random.normal(0, 1, n)
    Y = np.random.normal(0, 1, n)
    T = np.arctan2(Y, X)  # colour each point by its angle
    ax.scatter(X, Y, s=75, c=T, alpha=0.5)
    plt.xlim(-1.5, 1.5)
    plt.xticks([])
    plt.ylim(-1.5, 1.5)
    plt.yticks([])
    plt.axis()
    plt.title("scatter")
    plt.xlabel("x")
    plt.ylabel("y")

    # Bar
    fig.add_subplot(332)
    n = 10
    X = np.arange(n)
    Y1 = (1 - X / float(n)) * np.random.uniform(0.5, 1.0, n)
    Y2 = (1 - X / float(n)) * np.random.uniform(0.5, 1.0, n)
    plt.bar(X, +Y1, facecolor='#9999ff', edgecolor='white')
    plt.bar(X, -Y2, facecolor='#9999ff', edgecolor='white')
    # Annotate every bar with its value.
    for x, y in zip(X, Y1):
        plt.text(x + 0.4, y + 0.05, '%.2f' % y, ha='center', va='bottom')
    for x, y in zip(X, Y2):
        plt.text(x + 0.4, -y - 0.05, '%.2f' % y, ha='center', va='bottom')

    # Pie
    fig.add_subplot(333)
    n = 20
    Z = np.ones(n)
    Z[-1] *= 2  # make the last slice twice as large
    plt.pie(Z, explode=Z * .05,
            colors=['%s' % (i / float(n)) for i in range(n)],  # grey levels
            labels=['%.2f' % (i / float(n)) for i in range(n)])
    plt.gca().set_aspect('equal')
    plt.xticks([])
    plt.yticks([])

    # Polar (polar=True so theta/radii are drawn on polar axes)
    fig.add_subplot(334, polar=True)
    n = 20
    theta = np.arange(0.0, 2 * np.pi, 2 * np.pi / n)
    radii = 10 * np.random.rand(n)
    plt.plot(theta, radii)

    # Heat map
    fig.add_subplot(335)
    data = np.random.rand(3, 3)
    cmap = cm.Blues
    plt.imshow(data, interpolation='nearest', cmap=cmap, aspect='auto', vmin=0, vmax=1)

    # Filled contour ("hot map"); 313 takes the whole bottom row.
    fig.add_subplot(313)

    def f(x, y):
        return (1 - x / 2 + x ** 5 + y ** 3) * np.exp(-x ** 2 - y ** 2)

    n = 256
    x = np.linspace(-3, 3, n)
    y = np.linspace(-3, 3, n)
    X, Y = np.meshgrid(x, y)
    plt.contourf(X, Y, f(X, Y), 8, alpha=.75, cmap=plt.cm.hot)

    # 3D
    ax = fig.add_subplot(336, projection="3d")
    ax.scatter(1, 1, 3, s=100)

    plt.show()


if __name__ == "__main__":
    main()

● scipy

● 简介

数值计算库

● 积分

# Program:
import numpy as np
from scipy.integrate import quad, dblquad, nquad


def main():
    """Evaluate a single, a double and an n-fold integral with scipy."""
    # Single integral of exp(-x) over [0, inf).
    print(quad(lambda x: np.exp(-x), 0, np.inf))
    # Double integral: outer variable x in [0, inf), inner t in [1, inf).
    print(dblquad(lambda t, x: np.exp(-x * t) / t ** 3, 0, np.inf,
                  lambda x: 1, lambda x: np.inf))

    def f(x, y):
        return x * y

    def bound_y():
        return [0, 0.5]

    def bound_x(y):
        # The x range depends on the current value of y.
        return [0, 1 - 2 * y]

    print(nquad(f, [bound_x, bound_y]))


if __name__ == "__main__":
    main()

# Output, each line is (value, estimated absolute error):
# (1.0000000000000002, 5.842607038578007e-11)
# (0.3333333333366853, 1.3888461883425516e-08)
# (0.010416666666666668, 4.101620128472366e-16)

● 优化器

import numpy as np
from scipy.optimize import minimize


def main():
    """Minimise the Rosenbrock function with the Nelder-Mead simplex method."""
    def rosen(x):
        return sum(100.0 * (x[1:] - x[:-1] ** 2.0) ** 2.0 + (1 - x[:-1]) ** 2.0)

    x0 = np.array([1.3, 0.7, 0.8, 1.9, 1.2])
    # "xatol" replaces the deprecated Nelder-Mead option "xtol".
    res = minimize(rosen, x0, method="nelder-mead",
                   options={"xatol": 1e-8, "disp": True})
    print("ROSE MINI:", res)


if __name__ == "__main__":
    main()

# Output recorded with the original listing (iteration counts and the
# layout of the result object vary between SciPy versions):
# Optimization terminated successfully.
#          Current function value: 0.000000
#          Iterations: 339
#          Function evaluations: 571
# ROSE MINI:  final_simplex: (array([[ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ],
#        [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ],
#        [ 1.        ,  1.        ,  1.        ,  1.00000001,  1.00000001],
#        [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ],
#        [ 1.        ,  1.        ,  1.        ,  1.        ,  1.        ],
#        [ 1.        ,  1.        ,  1.        ,  1.        ,  0.99999999]]), array([  4.86115343e-17,   7.65182843e-17,   8.11395684e-17,
#          8.63263255e-17,   8.64080682e-17,   2.17927418e-16]))
#            fun: 4.8611534334221152e-17
#        message: 'Optimization terminated successfully.'
#           nfev: 571
#            nit: 339
#         status: 0
#        success: True
#              x: array([ 1.,  1.,  1.,  1.,  1.])

● 插值

import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d  # "interp-one-d", not "interpld"
from scipy.optimize import root


def main():
    """Find a root of x + 2*cos(x), then cubic-interpolate a sampled sine
    wave and plot the samples against the interpolated curve."""
    def fun(x):
        return x + 2 * np.cos(x)

    sol = root(fun, 0.1)  # 0.1 is the initial guess
    print("ROOT:", sol.x, sol.fun)

    # Interpolation: 10 samples of one sine period, resampled at 50 points.
    x = np.linspace(0, 1, 10)
    y = np.sin(2 * np.pi * x)
    li = interp1d(x, y, kind="cubic")
    x_new = np.linspace(0, 1, 50)
    y_new = li(x_new)

    plt.figure()
    plt.plot(x, y, "r")
    plt.plot(x_new, y_new, "k")
    plt.show()
    print(y_new)


if __name__ == "__main__":
    main()

● 线性计算与矩阵分解

# Program:
import numpy as np
from scipy import linalg as lg


def main():
    """Print basic linear-algebra results and matrix decompositions of a
    2x2 matrix using scipy.linalg."""
    arr = np.array([[1, 2], [3, 4]])
    print("Det:", lg.det(arr))
    print("Inv:", lg.inv(arr))
    b = np.array([6, 14])
    print("Sol:", lg.solve(arr, b))  # solves arr @ x = b
    print("Eig:", lg.eig(arr))
    print("LU:", lg.lu(arr))
    print("QR:", lg.qr(arr))
    print("SVD:", lg.svd(arr))
    print("Schur:", lg.schur(arr))


if __name__ == "__main__":
    main()

# Output:
# Det: -2.0
# Inv: [[-2.   1. ]
#  [ 1.5 -0.5]]
# Sol: [ 2.  2.]
# Eig: (array([-0.37228132+0.j,  5.37228132+0.j]), array([[-0.82456484, -0.41597356],
#        [ 0.56576746, -0.90937671]]))
# LU: (array([[ 0.,  1.],
#        [ 1.,  0.]]), array([[ 1.        ,  0.        ],
#        [ 0.33333333,  1.        ]]), array([[ 3.        ,  4.        ],
#        [ 0.        ,  0.66666667]]))
# QR: (array([[-0.31622777, -0.9486833 ],
#        [-0.9486833 ,  0.31622777]]), array([[-3.16227766, -4.42718872],
#        [ 0.        , -0.63245553]]))
# SVD: (array([[-0.40455358, -0.9145143 ],
#        [-0.9145143 ,  0.40455358]]), array([ 5.4649857 ,  0.36596619]), array([[-0.57604844, -0.81741556],
#        [ 0.81741556, -0.57604844]]))
# Schur: (array([[-0.37228132, -1.        ],
#        [ 0.        ,  5.37228132]]), array([[-0.82456484, -0.56576746],
#        [ 0.56576746, -0.82456484]]))

● pandas

● 简介

数据分析库

● 基础数据分析技术

import numpy as np
import pandas as pd


def main():
    """Walk through basic pandas usage: data structures, inspection,
    selection, assignment, missing values, and concat/merge/groupby."""
    # Data structure
    s = pd.Series([i * 2 for i in range(1, 11)])
    print(type(s))
    dates = pd.date_range("20170301", periods=8)
    df = pd.DataFrame(np.random.randn(8, 5), index=dates, columns=list("ABCDE"))
    print(df)

    # Basic
    print(df.head(3))
    print(df.tail(3))
    print(df.index)
    print(df.values)
    print(df.T)
    # DataFrame.sort(columns=...) was removed from pandas; use sort_values.
    print(df.sort_values(by="C"))
    print(df.sort_index(axis=1, ascending=False))
    print(df.describe())

    # Select
    print(type(df["A"]))
    print(df[:3])
    print(df["20170301":"20170304"])
    print(df.loc[dates[0]])
    print(df.loc["20170301":"20170304", ["B", "D"]])
    print(df.iloc[1:2, 2:4])
    print(df.iloc[1, 4])
    # One combined mask instead of chained indexing, which applies the
    # second mask to an already-filtered frame and triggers a warning.
    print(df[(df.B > 0) & (df.A < 0)])
    print(df[df > 0])
    print(df[df["E"].isin([1, 2])])

    # Set
    s1 = pd.Series(list(range(10, 18)), index=pd.date_range("20170301", periods=8))
    df["F"] = s1
    print(df)
    df.at[dates[0], "A"] = 0
    print(df)
    df.iat[1, 1] = 1
    df.loc[:, "D"] = np.array([4] * len(df))
    df2 = df.copy()
    # Negate every positive entry. The listing had "= df2", which is a
    # no-op; the minus sign appears to have been lost.
    df2[df2 > 0] = -df2
    print(df2)

    # Missing value
    df1 = df.reindex(index=dates[:4], columns=list("ABCD") + ["G"])
    df1.loc[dates[0]:dates[1], "G"] = 1
    print(df1)
    print(df1.dropna())
    print(df1.fillna(value=2))

    # Concat / merge / groupby
    pieces = [df[:3], df[-3:]]
    print(pd.concat(pieces))
    left = pd.DataFrame({"key": ["x", "y"], "value": [1, 2]})
    right = pd.DataFrame({"key": ["x", "z"], "value": [3, 4]})
    print("LEFT:", left)
    print("RIGHT:", right)
    print(pd.merge(left, right, on="key", how="left"))
    df3 = pd.DataFrame({"A": ["a", "b", "c", "b"], "B": list(range(4))})
    print(df3.groupby("A").sum())


if __name__ == "__main__":
    main()

● 时间、绘图

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


def main():
    """Build a second-resolution date range, then plot the cumulative sum
    of a random daily time series."""
    # Time series; lowercase "s" is the current alias for seconds
    # (uppercase "S" is deprecated in recent pandas).
    t_exam = pd.date_range("20170301", periods=10, freq="s")
    print(t_exam)

    # Graph
    ts = pd.Series(np.random.randn(1000),
                   index=pd.date_range("20170301", periods=1000))
    ts = ts.cumsum()
    ts.plot()
    plt.show()


if __name__ == "__main__":
    main()

● scikit-learn

● 简介

数据挖掘建模、机器学习

● 机器学习与决策树

机器学习:因子–>结果

结果:

不带标记–>无监督学习(聚类);带标记–>监督学习

有限离散–>分类;连续–>回归

决策树:监督学习;树形结构

● Iris数据集

● 花萼长度

● 花萼宽度

● 花瓣长度

● 花瓣宽度

● 种类:Iris Setosa(山鸢尾)、Iris Versicolour(杂色鸢尾)、Iris Virginica(维吉尼亚鸢尾)

● 实现

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
# sklearn.cross_validation was removed; train_test_split now lives here.
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import metrics


def main():
    """Train a decision tree on the Iris data set and print its accuracy
    and confusion matrix on a held-out 20% split."""
    # Pre-processing
    iris = load_iris()
    print(iris)
    print(len(iris["data"]))
    train_data, test_data, train_target, test_target = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=1)

    # Model
    clf = tree.DecisionTreeClassifier(criterion="entropy")
    clf.fit(train_data, train_target)
    y_pred = clf.predict(test_data)

    # Verify
    print(metrics.accuracy_score(y_true=test_target, y_pred=y_pred))
    print(metrics.confusion_matrix(y_true=test_target, y_pred=y_pred))


if __name__ == "__main__":
    main()

● keras

● 简介

人工神经网络

● 简单神经网络实现

Keras安装步骤:Anaconda CMD;conda install mingw libpython;pip install keras;pip install np_utils

● 实例

注意:需要修改 C:/user/username/.keras/keras.json,具体改后内容如下:{"backend": "theano", "image_data_format": "th", "epsilon": 1e-07, "floatx": "float32"}。

import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD
from sklearn.datasets import load_iris
from sklearn.preprocessing import LabelBinarizer
# sklearn.cross_validation was removed; train_test_split now lives here.
from sklearn.model_selection import train_test_split


def main():
    """Train a small fully connected network on the Iris data set and
    print the predicted class index for each held-out sample."""
    iris = load_iris()
    print(iris["target"])
    train_data, test_data, train_target, test_target = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=1)

    # Fit the one-hot encoder once on all labels so train and test share
    # the same column order (the listing fitted a new encoder per split).
    binarizer = LabelBinarizer().fit(iris["target"])
    labels_train = binarizer.transform(train_target)
    labels_test = binarizer.transform(test_target)  # not used further below

    model = Sequential(
        [
            Dense(5, input_dim=4),
            Activation("relu"),
            Dense(3),
            Activation("sigmoid"),
        ]
    )

    # Optimizer
    sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(optimizer=sgd, loss="categorical_crossentropy")
    # "epochs" replaces the deprecated "nb_epoch" argument.
    model.fit(train_data, labels_train, epochs=200, batch_size=40)
    # predict_classes() was removed from Keras; take the arg-max instead.
    print(np.argmax(model.predict(test_data), axis=1))
    # Weights can be persisted and restored with
    # model.save_weights("D:/w") and model.load_weights("D:/w").


if __name__ == "__main__":
    main()

(欢迎私信小编有干货分享哦!)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值