2020-11-21 数据科学库(4) pandas：series和dataframe的使用

最新推荐文章于 2022-10-10 20:40:35 发布

weixin_51182518

最新推荐文章于 2022-10-10 20:40:35 发布

阅读量167

点赞数

文章标签： pandas python 数据分析

本文链接：https://blog.csdn.net/weixin_51182518/article/details/109892885

版权

为什么要学习pandas

与numpy处理数值不同，pandas还可以处理其他类型的数据

high-performance
easy to use data structures
data analysis tool

series 一维，带标签的数组

dataframe 二维

series创建

import pandas as pd



# Build a Series from a plain list; pandas assigns the default 0..n-1 index.
t1 = pd.Series([1, 2, 31, 12, 12, 3])
# Supply explicit labels through ``index``.
t2 = pd.Series([1, 23, 2, 2, 1], index=list("abces"))
print(t2)

# Build a Series from a dictionary: keys become labels, values become data.
# The mapping is named ``person`` so the builtin ``dict`` is not shadowed.
person = {"name": "james", "age": 30, "tel": "10086"}
t3 = pd.Series(person)
print(t3)

# If an explicit index is passed together with a dict, labels without a
# matching key are filled with NaN; once NaN appears in integer data the
# dtype becomes float.
print(t3["age"])  # select by label
# Select by position with .iloc: plain integer keys on a label-indexed Series
# rely on a deprecated positional fallback.
print(t3.iloc[2])
print("*************************")

# Select several non-adjacent rows by passing a list.
print(t3.iloc[[1, 2]])
print("*************************")
print(t3[["age", "tel"]])
# NOTE: current pandas raises KeyError for labels that do not exist;
# use t3.reindex([...]) to get NaN for missing labels instead.

print("*************************")
print(t3.index)
print(type(t3.index))  # an Index object
print(list(t3.index))  # ['name', 'age', 'tel'] -- the Index is iterable
print("*************************")
print(t3.values)  # the underlying NumPy array
print(type(t3.values))

print("*************************")
# A Series is essentially an array with labels attached.

s = pd.Series(range(5))
print(s.where(s > 0))  # keep values where the condition holds; the rest become NaN
print(s.where(s > 1, 10))  # values failing the condition are replaced by 10
print(s.mask(s > 0))  # values satisfying the condition become NaN

# Many NumPy methods can be applied to a Series as well.

# numpy很多方法都可以运用在series中

2、pandas读取外部数据

import pandas as pd


# Read a CSV file into a DataFrame ("path" stands in for the real file path).
df=pd.read_csv("path")

# pd.read_clipboard()  # read whatever was copied to the clipboard
# Read from a MySQL database:
# pd.read_sql(sql_sentence, connection)

3、 pandas读取mongodb文件

import pandas as pd
from pymongo import MongoClient


# Open a client with pymongo's default connection settings and fetch every
# document stored in the ``nba`` collection of the ``db111`` database.
client = MongoClient()
db = client["db111"]
collection = db["nba"]
data = list(collection.find())

# Wrap the first document in a Series and show it.
t1 = pd.Series(data[0])
print(t1)

result

在这里插入图片描述

4、dataframe的创建

import pandas as pd
import numpy as np

# A DataFrame is a labelled two-dimensional array: build a 3x4 grid holding
# 0..11 and name both its rows and its columns.
grid = np.arange(12).reshape(3, 4)
row_labels = list("abc")
column_labels = list("wxyz")
t1 = pd.DataFrame(data=grid, index=row_labels, columns=column_labels)
print(t1)
# index   -> row labels, running down the frame (axis=0)
# columns -> column labels, running across the frame (axis=1)

在这里插入图片描述

5、dataframe传入字典

import pandas as pd

# Build a DataFrame from a dict of equal-length lists: every key becomes a
# column label and every list supplies that column's values, so each printed
# row is one record.
# The mapping is named ``players`` so the builtin ``dict`` is not shadowed.
players = {
    "name": ["james", "kobe", "jordan"],
    "age": [36, 42, 51],
    "tel": ["10086", "110", "120"],
}
t1 = pd.DataFrame(players)
print(t1)

在这里插入图片描述

6、pandas读取mongodb文件

import pandas as pd
from pymongo import MongoClient


# Fetch every document of the ``nba`` collection in the ``db111`` database.
client = MongoClient()
db = client["db111"]
collection = db["nba"]
data = list(collection.find())

# Hand the list of documents to DataFrame and print the resulting table.
frame = pd.DataFrame(data)
print(frame)

在这里插入图片描述

7、dataframe描述信息

import pandas as pd
from pymongo import MongoClient


# Fetch every document of the ``nba`` collection in the ``db111`` database.
client = MongoClient()
db = client["db111"]
collection = db["nba"]
data = list(collection.find())

# Keep only the fields of interest from each document, in this order.
wanted_fields = ("name", "age", "team", "address")
data_list = [{field: doc[field] for field in wanted_fields} for doc in data]

df = pd.DataFrame(data_list)
print(df)

在这里插入图片描述

8、dataframe的使用方法

# Quick reference of common DataFrame attributes and methods.
df.shape # (number of rows, number of columns)

df.dtypes  # data type of each column

df.ndim   # number of dimensions

df.index  # row labels

df.columns # column labels

df.values # the data as a two-dimensional array

df.head(3)  # show the first three rows

df.tail(3)  # show the last three rows

df.info()  # overview of the frame

df.describe() # quick summary statistics (count, max, min, ...)

9、排序

# Sort the rows by a column; the empty string is presumably a placeholder --
# pass the real column name to ``by``.
df=df.sort_values(by="")

10、pandas的索引

#pandas取行取列的注意事项：

#- 方括号写数字表示取行，对行进行操作

#- 字符串，表示的取列索引，对列进行操作

df[:20] # first 20 rows

df[:20]["Row_Labels"]  # the "Row_Labels" column of the first 20 rows

df.loc # label-based indexing

df.iloc # position-based indexing

t.loc["A","W"] # the value at row "A", column "W"

t.loc["A",["W","Z"]] # row "A", columns "W" and "Z"; the result is a Series

t.loc[["A","C"],["W","Z"]] #a行、c行与w列、z列的交汇处

t.loc["A":"C",["W","Z"]] #a行到c行（含c行）的w列和z列

11、pandas.loc & pandas.iloc

import pandas as pd
import numpy as np

# Demonstrate label-based (.loc) and position-based (.iloc) selection and
# assignment on a 3x4 frame holding 0..11.
grid = np.arange(12).reshape(3, 4)
t1 = pd.DataFrame(grid, index=list("abc"), columns=list("wxyz"))
divider = "********************************"

print(t1)
print(divider)
# Rows at positions 1-2, columns at positions 2 and 3.
print(t1.iloc[1:3, [2, 3]])

# Assign one cell by its row/column labels.
t1.loc["a", "y"] = 100
print(divider)
print(t1)
print(divider)
# Rows at positions 1-2, columns at positions 1-2 (end positions are exclusive).
print(t1.iloc[1:3, 1:3])
print(divider)

# Assign a rectangular region by position: row 1, columns 0-1.
t1.iloc[1:2, 0:2] = 200
print(t1)

在这里插入图片描述

可以将np.nan赋值进dataframe

12、布尔索引

df[df["Count_AnimalName"]>800]

df[(df["Count"]>800)&(df["Count"]<1000)]

df["info"].str.split("/").tolist() #将info列按字符串以"/"切割，再转成一个列表（大列表嵌套小列表）。

13、缺失数据的处理

判断是否为nan

pd.isnull(df)

pd.notnull(df)

处理方式

删除nan所在的行列

df.dropna(axis=0,how="any",inplace=False)

any只要有一个为nan就删除
all 全部为nan才删除
inplace:是否进行原地修改
填充数据

t.fillna(t.mean())

t2["age"]=t2["age"].fillna(t2["age"].mean())
print(t2)

t.fillna(t.median())

t.fillna(0)

处理为0的数据

t[t==0]=np.nan

14、pandas 常用统计方法

# Mean of one column; the empty string is presumably a placeholder for the
# real column name.
df[""].mean()

# Distinct values of the "director" column.
df["director"].unique()

# Each "actors" cell is a comma-separated string of names. Split every cell
# and convert to a list, giving a list of lists (one inner list per row).
temp = df["actors"].str.split(",").tolist()

# Flatten the nested lists into one list of actor names.
actors_list = [actor for row in temp for actor in row]

# Number of distinct actors.
actors_num = len(set(actors_list))