pandas基础及机器学习中简单应用


一、pandas是什么?

pandas 是基于NumPy 的一种工具,该工具是为了解决数据分析任务而创建的。
Series:一维数组,与Numpy中的一维array类似。二者与Python基本的数据结构List也很相近,其区别是:List中的元素可以是不同的数据类型,而array和Series中通常只存储相同的数据类型,这样可以更有效地使用内存,提高运算效率。
Time-Series:以时间为索引的Series。
DataFrame:二维的表格型数据结构。很多功能与R中的data.frame类似。可以将DataFrame理解为Series的容器。以下的内容主要以DataFrame为主。
Panel:三维的数组,可以理解为DataFrame的容器(注意:Panel 自 pandas 0.20 起已被弃用,并在 0.25 版本中移除;新代码请改用带 MultiIndex 的 DataFrame)。

二、使用步骤

1.引入库

代码如下(示例):

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session so the tutorial output
# stays uncluttered. This also hides deprecation notices from pandas/seaborn.
warnings.filterwarnings('ignore')
import  ssl
# NOTE(review): this swaps the default HTTPS context factory for the private,
# unverified one, i.e. TLS certificate verification is disabled process-wide.
# Presumably a workaround so the sns.load_dataset(...) downloads further down
# succeed on machines with broken CA bundles — confirm, and do not copy this
# into anything other than a throwaway tutorial script.
ssl._create_default_https_context = ssl._create_unverified_context

2.基本操作

代码如下(示例):

import pandas as pd
import numpy as np
from pandas import Series

# A Series built from a plain list gets the default integer index.
data = pd.Series([1, 5, 3, 4, 5, 6])
print(data)

# The same constructor with explicit string labels.
data = pd.Series(data=[1, 5, 3, 4], index=list("abcd"))
print(data)

# Build a Series from a one-dimensional NumPy array.
x = np.arange(5)
print(pd.Series(x))

# Build a Series from a dict: keys become the index, values become the data.
population_dict = {
    "BeiJing": 2154,
    "ShangHai": 2424,
    "ShengZheng": 1303,
    "HangZhou": 981,
}
population = pd.Series(population_dict)
print(population)

# With an explicit index only those labels are kept; a label that is not a
# key of the dict ("c") shows up as NaN.
population = pd.Series(population_dict, index=["BeiJing", "ShangHai", "c"])
print(population)

# A scalar is repeated for every index label.
print(pd.Series(5, index=[100, 200, 300]))

Pandas DataFrame对象 带标签数据的多维数组:

#通用结构 pd.DataFrame(data,index=index,columns=columns)
#1通过Series对象创建
import pandas as pd
import numpy as np
# 1. From a single Series: the dict key becomes the column name.
population_dict = {
    "BeiJing": 2154,
    "ShangHai": 2424,
    "ShengZheng": 1303,
    "HangZhou": 981,
}
population = pd.Series(population_dict)
print(pd.DataFrame({"population": population}))

# 2. From a dict of Series (a scalar such as "China" fills the whole column).
GDP_dict = {
    "BeiJing": 30320,
    "ShangHai": 32680,
    "ShengZheng": 112303,
    "HangZhou": 12981,
}
GDP = pd.Series(GDP_dict)
print(
    pd.DataFrame(
        {
            "population": population,
            "GDP": GDP,
            "country": "China",
        }
    )
)

# 3. From a list of dicts: each dict is one row.
data = [dict(a=i, b=2 * i) for i in range(3)]
print(data)
print(pd.DataFrame(data))

# Keys missing from a row are filled with NaN.
data = [dict(a=1, b=1), dict(b=3, c=1)]
print(pd.DataFrame(data))

# 4. From a two-dimensional NumPy array, with explicit row/column labels.
data = np.random.randint(10, size=(3, 2))
print(pd.DataFrame(data, index=list("abc"), columns=["foo", "bar"]))

3.DataFrame性质

#DataFrame性质
import pandas as pd

import numpy as np

# 1. Attributes
population_dict = {
    "BeiJing": 2154,
    "ShangHai": 2424,
    "ShengZheng": 1303,
    "HangZhou": 981,
}
population = pd.Series(population_dict)
GDP_dict = {
    "BeiJing": 30320,
    "ShangHai": 32680,
    "ShengZheng": 112303,
    "HangZhou": 12981,
}
GDP = pd.Series(GDP_dict)
data = pd.DataFrame({"population": population,
                     "GDP": GDP,
                     "country": "China"})
print(data)
print(data.values)   # (1) the data as a NumPy array
print(data.index)    # (2) row labels
print(data.columns)  # (3) column labels
print(data.shape)    # (4) (rows, columns) -> (4, 3)
# The next statement had been fused into the comment of the previous line,
# so the size was never printed.
print(data.size)     # (5) total number of cells -> 12
print(data.dtypes)   # (6) dtype of every column

# 2. Indexing
# Selecting columns
print(data["population"])  # the "population" column
print(data["GDP"])         # dict-style access to the GDP column
print(data.GDP)            # attribute-style access to the same column

# Selecting rows
print(data.loc["BeiJing"])  # by label
# By position. The original `data.iloc[0,2]` returned a single cell, which
# contradicts this "selecting rows" section; a list of positions returns the
# rows themselves (here rows 0 and 2).
print(data.iloc[[0, 2]])

# Selecting a single value
print(data.loc["BeiJing", "GDP"])
print(data.iloc[0, 1])
print(data.values[0][1])

# Indexing a Series
print(data.GDP)
print(GDP)
print(GDP["BeiJing"])
# 3. Slicing
datas = pd.date_range(start="2020-01-01", periods=6)
print(datas)
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=datas,
    columns=list("abcd"),
)
print(df)

# Row slices: plain [] with date strings, .loc by label, .iloc by position.
print(df["2020-01-01":"2020-01-03"])
print(df.loc["2020-01-01":"2020-01-03"])
print(df.iloc[:3])

# Column slices.
print(df.loc[:, "a":"c"])
print(df.iloc[:, :3])

# Mixed selections
# Slice rows and columns at the same time.
print(df.loc["2020-01-02":"2020-01-03", "c":"d"])
print(df.iloc[1:3, 2:])

# Slice the rows, pick individual columns.
print(df.loc["2020-01-02":"2020-01-03", ["c", "d"]])
print(df.iloc[3:, [0, 2]])

# Pick individual rows and individual columns.
print(df.loc[["2020-01-02", "2020-01-03"], ["c", "d"]])

# 4. Boolean indexing
print(df > 0)

# isin(): membership test on a column
df2 = df.copy()
df2["e"] = ["one", "two", "three", "four", "five", "six"]  # add a column
print(df2)
ind = df2["e"].isin(["two", "four"])  # mask over column "e"
print(ind)

# 5. Assignment
# Add a new column from a Series that carries the same date index.
s1 = pd.Series(
    [1, 2, 3, 4, 5, 6],
    index=pd.date_range("20200101", periods=6),
)
print(s1)
df["e"] = s1
print(df)

# Overwrite existing values: one cell by label, one cell by position,
# then a whole column.
df.loc["2020-01-01", "a"] = 0
df.iloc[0, 1] = 0
df["d"] = np.array([5] * len(df))
print(df)

# Replace the row and column labels with consecutive integers.
df.index = list(range(df.shape[0]))
print(df)
df.columns = list(range(df.shape[1]))
print(df)

4.应用-数值运算及统计分析

# 1. Inspecting data

dates = pd.date_range(start='2019-01-01', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=["A", "B", "C", "D"])
print(df)
print(df.head())   # leading rows, 5 by default
# This call had been fused into the comment of the previous line and was
# therefore never executed.
print(df.head(2))  # only the first 2 rows
print(df.tail())   # trailing rows, 5 by default
df.iloc[0, 3] = np.nan
print(df)
# info() writes its summary itself and returns None; wrapping it in print()
# only appended a stray "None" line.
df.info()          # overall summary
# 2. NumPy-style element-wise operations also work on pandas objects
x = pd.DataFrame(np.arange(4).reshape(1, -1))
print(x)
y = pd.DataFrame(np.arange(4, 8).reshape(1, -1))
print(y)
print(x.mul(y))

# Matrix-style operations
np.random.seed(42)
x = pd.DataFrame(np.random.randint(10, size=(30, 30)))
print(x)
print(x.T)

y = pd.DataFrame(np.random.randint(10, size=(30, 30)))
print(x @ y)
# pandas is slower than NumPy here: pandas focuses on data handling,
# NumPy on raw computation.

# Broadcasting: divide every row by the first row.
print(x.div(x.iloc[0]))

# pandas-specific behaviour
# Index alignment
A = pd.DataFrame(np.random.randint(0, 20, size=(2, 2)), columns=["A", "B"])
B = pd.DataFrame(np.random.randint(0, 10, size=(3, 3)), columns=["A", "B", "C"])
print(A)
print(B)
print(A.add(B))                # aligned on labels; cells without a partner become NaN
print(A.add(B, fill_value=0))  # treat a missing partner as 0 instead

# 2. Statistics

# Counting how often each value occurs
# Counter is imported here because the file-level import only appears further
# down; without it this snippet fails with NameError when run top to bottom.
from collections import Counter

y = np.random.randint(3, size=20)
print(y)
print(np.unique(y))   # the distinct values
print(Counter(y))     # value -> number of occurrences
y1 = pd.DataFrame(y, columns=["A"])
print(y1)
print(np.unique(y1))
print(y1["A"].value_counts())  # the pandas way of counting occurrences


import pandas as pd

import numpy as np
import timeit
from collections import  Counter


# Derive a new column and sort by it
population_dict = {
    "BeiJing": 2154,
    "ShangHai": 2424,
    "ShengZheng": 1303,
    "HangZhou": 981,
}
population = pd.Series(population_dict)
GDP_dict = {
    "BeiJing": 30320,
    "ShangHai": 32680,
    "ShengZheng": 112303,
    "HangZhou": 12981,
}
GDP = pd.Series(GDP_dict)
city_info = pd.DataFrame(
    {
        "population": population,
        "GDP": GDP,
        "country": "China",
    }
)
city_info["per_GDP"] = city_info["GDP"].div(city_info["population"])
print(city_info)

# Ascending order
print(city_info.sort_values(by="per_GDP"))
# Descending order
print(city_info.sort_values(by="per_GDP", ascending=False))

# Sorting along an axis by its labels
data = pd.DataFrame(
    np.random.randint(20, size=(3, 4)),
    index=[2, 1, 0],
    columns=["D", "B", "A", "C"],
)
print(data)
print(data.sort_index())                 # order the rows by row label
print(data.sort_index(axis="columns"))   # order the columns by column label

# Summary statistics
df = pd.DataFrame(
    np.random.normal(loc=2, scale=4, size=(6, 4)),
    columns=["A", "B", "C", "D"],
)
print(df)
print(df.count())               # number of non-missing values per column
print(df.sum())                 # column totals
print(df.sum(axis="columns"))   # row totals
# max, min, var, std and median are called the same way.
# mode: most frequent value(s); note this is taken on `data`, not `df`.
print(data.mode())
# describe() reports count, mean, std, min, quartiles and max in one go.
print(df.describe())

5.利用pandas求相关性系数和协方差

# Pairwise correlation between all columns of df.
print(df.corr())
# Correlation of every column with column "A".
print(df.corrwith(df["A"]))

# Custom per-column / per-row computations with apply()
print(df.apply(np.cumsum))                  # running total down each column
print(df.apply(np.cumsum, axis="columns"))  # running total across each row
print(df.sum())                             # per-column totals
print(df.apply(lambda col: col.max() - col.min()))  # range (max - min) per column
def my_describe(x):
    """Summarise one Series as a labelled Series of statistics.

    Returns a Series indexed by "Count", "mean", "max", "idxmin" and "std",
    holding the corresponding ``x.count()``, ``x.mean()``, ``x.max()``,
    ``x.idxmin()`` and ``x.std()`` results.
    """
    labels = ["Count", "mean", "max", "idxmin", "std"]
    stats = [x.count(), x.mean(), x.max(), x.idxmin(), x.std()]
    return pd.Series(stats, index=labels)
# Run my_describe on df through DataFrame.apply (default axis) and print the result.
print(df.apply(my_describe))

6.缺失值处理

import pandas as pd

import numpy as np
import timeit
from collections import  Counter

# 1. Detecting missing values
data = pd.DataFrame(
    np.array(
        [
            [1, np.nan, 2],
            [np.nan, 3, 4],
            [5, 6, 7],
        ]
    ),
    columns=["A", "B", "C"],
)
# np.nan is a special floating-point value. Mixing in None or strings instead
# would turn a column into the more expensive object dtype.
print(data)
print(data.dtypes)
print(data.isna())   # True where a value is missing
print(data.notna())  # True where a value is present

# 2. Dropping missing values
print(data.dropna())                # drop every row that contains a NaN
print(data.dropna(axis="columns"))  # drop every column that contains a NaN
data["C"] = np.nan
print(data)
print(data.dropna(axis="columns", how="all"))  # drop only all-NaN columns
data.loc[3] = np.nan
print(data)
print(data.dropna(how="all"))                  # drop only all-NaN rows

# 3. Filling missing values
print(data.fillna(value=5))
# Fill with the mean of all values that are present.
fill = data.stack().mean()
print(fill)
print(data.fillna(value=fill))
#合并数据
def make_df(cols, ind):
    """Build a small demo DataFrame.

    Every cell is the string ``str(column) + str(row_label)``; ``cols``
    supplies the column names and ``ind`` the row labels.
    """
    table = {}
    for col in cols:
        table[col] = [str(col) + str(row) for row in ind]
    return pd.DataFrame(table, index=ind)
print(make_df("ABC", range(3)))

# Vertical concatenation
df_1 = make_df("AB", [1, 2])
df_2 = make_df("AB", [1, 4])
print(df_1)
print(df_2)
# Both frames contain the row label 1; ignore_index=True renumbers the rows.
print(pd.concat([df_1, df_2], ignore_index=True))

# Horizontal concatenation
df_1 = make_df("CD", [1, 2])
df_2 = make_df("AB", [1, 2])
print(df_1)
print(df_2)
print(pd.concat([df_2, df_1], axis=1))

# Aligned combination with merge(): joins on the shared column "B".
df_9 = make_df("AB", [1, 2])
df_10 = make_df("BC", [1, 2])
print(df_9)
print(df_10)
print(df_9.merge(df_10))

7.应用举例

# Combine per-city information held in two separate tables
population_dict = {
    "city": ("BeiJing", "HangZhou", "ShenZhen"),
    "pop": (2154, 981, 1303),
}
population = pd.DataFrame(population_dict)
print(population)

GDP_dict = {
    "city": ("BeiJing", "ShangHai", "ShenZhen"),
    "GDP": (30320, 32680, 13468),
}
GDP = pd.DataFrame(GDP_dict)
print(GDP)

# Default merge: keep only the cities present in both tables.
city_info = population.merge(GDP)
print(city_info)

# Outer merge: keep every city; values missing from one table become NaN.
city_info = population.merge(GDP, how="outer")
print(city_info)
# Grouping and pivot tables
df = pd.DataFrame(
    {
        "key": ["A", "B", "C", "C", "B", "A"],
        "data1": range(6),
        "data2": np.random.randint(0, 10, size=6),
    }
)
print(df)

# Grouping
print(df.groupby("key"))  # lazy: nothing is computed until an aggregation runs
print(df.groupby("key").sum())
print(df.groupby("key").mean())
print(df.groupby("key").var())
for i in df.groupby("key"):
    print(i)

# Select one column of the grouped data.
print(df.groupby("key")["data2"].sum())

# Iterate over (key, sub-frame) pairs.
for data, group in df.groupby("key"):
    print("{0:5} shape={1}".format(data, group.shape))

# Call methods on the grouped data.
print(df.groupby("key")["data1"].describe())
print(df.groupby("key").agg(["min", "median", "max"]))
#过滤
def filter_fun(x):
    """Return whether the standard deviation of ``x["data2"]`` exceeds 3."""
    spread = x["data2"].std()
    return spread > 3
# Per-group standard deviation of data2: the quantity filter_fun tests.
print(df.groupby("key")["data2"].std())
# Keep only the rows of groups for which filter_fun returns True.
print(df.groupby("key").filter(filter_fun))

# Transform: subtract each group's own mean from its values.
print(df.groupby("key").transform(lambda col: col - col.mean()))
#行星
import seaborn as sns
# Load the "planets" example dataset via seaborn and take a first look.
planets = sns.load_dataset("planets")
print(planets.shape)
print(planets.head())
print(planets.describe())

# Turn each year into a decade label such as "2000s".
decade = (10 * (planets["year"] // 10)).astype(str) + "s"
decade.name = "decade"
print(decade.head())

# Totals per method and decade.
print(planets.groupby(["method", decade]).sum())
# The same for the "number" column only, with decades spread over the
# columns and empty combinations shown as 0.
print(
    planets.groupby(["method", decade])[["number"]]
    .sum()
    .unstack()
    .fillna(0)
)

# Titanic passenger data analysis

titanic = sns.load_dataset("titanic")
print(titanic.head())
print(titanic.describe())

# Mean of "survived" per sex: first as a DataFrame, then as a Series.
print(titanic.groupby("sex")[["survived"]].mean())
print(titanic.groupby("sex")["survived"].mean())
# Mean of "survived" per sex and class, with the classes spread over columns.
print(
    titanic.groupby(["sex", "class"])[["survived"]]
    .aggregate("mean")
    .unstack()
)

# Pivot tables
print(titanic.pivot_table("survived", index="sex", columns="class"))
print(
    titanic.pivot_table(
        "survived",
        index="sex",
        columns="class",
        aggfunc="mean",
        margins=True,
    )
)
print(
    titanic.pivot_table(
        index="sex",
        columns="class",
        aggfunc={"survived": "sum", "fare": "mean"},
    )
)
#其它 1向量化字符串操作 2处理时间序列 3多级索引:用于多维数组



总结

以上就是今天要讲的内容,本文仅仅简单介绍了pandas的使用和简单应用,而pandas提供了大量能使我们快速便捷地处理数据的函数和方法。
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值