pandas

pandas

01 pandas之Series的索引和值

pandas可以处理非数值类型的数据
series 带标签的数组

import pandas as pd

# A Series is a one-dimensional array whose elements carry labels (the index).
pd.Series([1, 2, 31, 12, 3, 4])
# out:
# 0     1
# 1     2
# 2    31
# 3    12
# 4     3
# 5     4
# dtype: int64
t = pd.Series([1, 2, 31, 12, 3, 4])
type(t)
# out: pandas.core.series.Series

# Explicit index: the labels a..e are paired with the values in order.
t2 = pd.Series([1, 23, 2, 2, 1], index=list("abcde"))
t2
# out:
# a     1
# b    23
# c     2
# d     2
# e     1
# dtype: int64

# Building a Series from a dict: the keys become the index labels.
temp_dict = {"name": "xiaohong", "age": 30, "tel": 10086}
t3 = pd.Series(temp_dict)
t3
# out:
# name    xiaohong
# age           30
# tel        10086
# dtype: object

# Change the dtype; astype returns a new Series and leaves t2 untouched.
t2.astype(float)
# out:
# a     1.0
# b    23.0
# c     2.0
# d     2.0
# e     1.0
# dtype: float64

t3["age"]  # select by label
# Positional access goes through .iloc: plain integer keys on a
# string-labelled Series rely on a deprecated positional fallback.
t3.iloc[1]  # select by position
t3.iloc[:2]  # first two elements by position
t3.iloc[[1, 2]]  # several positions at once
t3[["age", "name"]]  # several labels at once
t[t > 10]  # boolean indexing
# out:
# 2    31
# 3    12
# dtype: int64

02 pandas 读取外部数据

import pandas as pd

# Load a CSV file from disk into a DataFrame and show it.
df = pd.read_csv(
    "E:/yanyi/python_study/14100_HM数据科学库课件/14100_HM数据科学库课件/day04/dogNames2.csv"
)
print(df)

from pymongo import MongoClient

# Fetch every document of the "tv1" collection in the "douban" database.
client = MongoClient()
collection = client["douban"]["tv1"]
data = list(collection.find())

# Wrap the first document in a Series, then print the full document list.
t1 = pd.Series(data[0])
print(data)

03 pandas Dataframe

通过字典创建dataframe

import pandas as pd
import numpy as np

# A DataFrame from a 3x4 array gets default integer row and column labels.
pd.DataFrame(np.arange(12).reshape(3, 4))

# The same data with explicit row labels (index) and column labels.
t1 = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=list("abc"),
    columns=list("WXYZ"),
)
t1

# Dict of equal-length lists: each key becomes one column.
d1 = {
    "name": ["xiaoming", "xiaogang"],
    "age": [20, 32],
    "tel": [10086, 10010],
}
pd.DataFrame(d1)
t1 = pd.DataFrame(d1)
type(t1)

# List of dicts: each dict becomes one row; keys a row lacks are filled with NaN.
d2 = [
    {"name": "xiaohong", "age": 32, "tel": 10010},
    {"name": "xiaogang", "tel": 10000},
    {"name": "xiaowang", "age": 32},
]
t2 = pd.DataFrame(d2)
t2

从MongoDB里面读取数据

from pymongo import MongoClient
import pandas as pd

# MongoClient must be instantiated; subscripting the class itself fails.
client = MongoClient()
collection = client["douban"]["tv1"]

# Load every document of the collection straight into a DataFrame.
data = list(collection.find())
df = pd.DataFrame(data)
print(df)

# Keep only the fields of interest, flattening the nested "rating" document.
data = collection.find()
data_list = [
    {
        "info": doc["info"],
        "rating_count": doc["rating"]["count"],
        # Stored under its own key so it no longer overwrites the count.
        "rating_value": doc["rating"]["value"],
        "country": doc["tv_category"],
        "actors": doc["actors"],
    }
    for doc in data
]
df = pd.DataFrame(data_list)
print(df)

# Show the first row(s).
print(df.head(1))
# Show the last row(s).
print(df.tail(1))

# Overview of the frame: info() prints its summary itself and returns None
# (so this also prints "None"); describe() gives summary statistics.
print(df.info())
print(df.describe())
import pandas as pd

# Which names occur most often? Sort the frame by the count column in
# descending order and show the five largest entries.
df = pd.read_csv("./dogNames2.csv")
df = df.sort_values(by="Count_AnimalName", ascending=False)
top_five = df.head(5)
print(top_five)

04 取行取列索引

# Notes on selecting rows and columns with plain [] in pandas:
# a numeric slice inside [] selects rows;
# a string inside [] selects a column.
print(df[:20])# first 20 rows
print(df[:20]["Raw_labels"])# the "Raw_labels" column of the first 20 rows
print(type(df["Raw_labels"]))# type of the object returned for a single column

# Selecting rows and columns together: df.loc uses labels, df.iloc uses positions.
t3 = pd.DataFrame(np.arange(12).reshape(3, 4), index=list("abc"), columns=list("WXYZ"))
t3.loc["a", "Z"]  # a single cell
# out: 3
t3.loc["a", :]  # one row
t3.loc[:, "Y"]  # one column
t3.loc[["a", "c"]]  # several rows
t3.iloc[1]  # second row by position
t3.iloc[1, :]  # same row, written with an explicit column slice
t3.iloc[:, 2]  # third column by position
t3.iloc[[0, 2], [2, 1]]  # rows 0 and 2, columns 2 and 1 (in that order)

# NaN cannot be stored in an integer column. Older pandas upcast to float
# automatically on assignment; newer releases deprecate that silent upcast,
# so the frame is cast explicitly first.
t3 = t3.astype(float)
# Slices go directly inside the indexer; wrapping them in a list is a syntax error.
t3.iloc[1:, :2] = np.nan

05 布尔索引

# Each comparison needs its own parentheses because & binds tighter than <.
df[(800 < df["Count_AnimalName"]) & (df["Count_AnimalName"] < 1000)]
# & means "and", | means "or"

# Split a string column on "/" and collect the pieces as a list of lists.
df["info"].str.split("/").tolist()

# Handling missing data
# Test element-wise whether values are NaN.
pd.isnull(t3)
pd.notnull(t3)
t3[pd.notnull(t3["W"])]  # keep only the rows whose "W" is not NaN
t3.dropna(axis=0)
t3.dropna(axis=0, how="any")  # the default: drop a row if any value is NaN
t3.dropna(axis=0, how="all")  # drop a row only if every value is NaN
t3.dropna(axis=0, how="any", inplace=True)  # modify t3 in place
# Fill NaN with the column mean; numeric_only skips the string column,
# which would otherwise make mean() fail.
t2.fillna(t2.mean(numeric_only=True))
t2["age"] = t2["age"].fillna(t2["age"].mean())
# Treat zeros as missing; cast to float first because NaN cannot be stored
# in an integer Series.
t = t.astype(float)
t[t == 0] = np.nan

06 练习

import pandas as pd
from matplotlib import pyplot as plt

file_path = './IMDB-Movie-Data.csv'
df = pd.read_csv(file_path)
print(df.head(1))
print(df.info())

# Distribution of the runtime column, drawn as a histogram.
# NOTE(review): the column label is kept exactly as written in the original
# snippet; confirm it matches the CSV header (it may contain a space).
runtime_data = df["Runtime(Minutes)"].values

max_runtime = runtime_data.max()
min_runtime = runtime_data.min()

# Bin edges: start at 1.6 and advance in steps of 0.5 until past the maximum.
# NOTE(review): start and step are kept from the original; they look tuned
# for a rating-like scale rather than minutes. Confirm the intended bins.
num_bin_list = [1.6]
i = 1.6
while i <= max_runtime:
    i += 0.5
    num_bin_list.append(i)

# Figure size
plt.figure(figsize=(20, 8), dpi=80)
plt.hist(runtime_data, num_bin_list)

# Tick positions from the minimum upward in steps of 0.5. The step must be
# added; multiplying by 0.5 shrinks i, so the loop would never end.
_x = [min_runtime]
i = min_runtime
while i <= max_runtime + 0.5:
    i = i + 0.5
    _x.append(i)

plt.xticks(_x)
plt.show()
# Average movie rating, number of directors, number of actors.
import pandas as pd

# Same data file as the histogram exercise ("IMDB", not "IMOB").
file_path = "IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)

# Average rating over all movies.
print(df["Rating"].mean())

# Number of distinct directors.
print(len(df["Director"].unique()))

# Number of distinct actors: each row holds a comma-separated list of names.
temp_actors_list = df["Actors"].str.split(",").tolist()
# Strip surrounding whitespace so that "A" and " A" count as the same person.
actors_list = [name.strip() for names in temp_actors_list for name in names]
actors_num = len(set(actors_list))
print(actors_num)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值