1.Pandas简介
Pandas是一个Python包,提供快速、灵活和富有表现力的数据结构,使关联或标记数据的使用既简单又直观。
它旨在成为Python中进行实际,真实世界数据分析的基础高级构建块。此次外还有更广泛的目标,即称为任何语言中最强大,最灵活的开源数据分析/操作工具。
适合许多不同类型的数据
- 具有异构类型列的表格数据,如SQL表或Excel表
- 有序和无序的时间序列数据
- 具有行和列标签的任意矩阵数据
- 任何其他形式的观察/统计数据集。实际上不需要将数据标记为放置在pandas数据结构中
2.Pandas中的数据结构
Pandas 中除了 Panel 数据结构,还引入了两种新的数据结构一-Series和 DataFrame ,这两种数据结构都建立在 NumPy 的基础之上。
( 1 ) Series :一维数组系列,也称序列,与 Numpy中的一维 array 类似。 二者与Python 基本的数据结构 list 也很相近。
( 2 ) DataFrame :二维的表格型数据结构。可以将DataFrame 理解为 Series 的容器。以下的内容主要以DataFrame为主。
( 3) Panel:三维 数 组,可以理解为 DataFrame 的 容器。
3.数据结构
3.1 Series 一维数组 <==> array,list
# -*- coding: UTF-8 -*-
'''
@Author :Jason
Pandas 的数据结构:Pandas 主要有 Series(一维数组),DataFrame(二维数组),Panel(三维数组),Panel4D(四维数组),PanelND(更多维数组)
等数据结构。其中 Series 和 DataFrame 应用的最为广泛
'''
import pandas as pd
import numpy as np
#创建Series数据类型的三种方法
def createSeries():
#1.列表创建
list1 = [1,2,3,4,5]
s1 = pd.Series(list1) #如果不指定索引,默认从0开始
print(s1)
'''
0 1
1 2
2 3
3 4
dtype: int64
'''
#2.从加入所以创建Series
list2 = np.random.randn(5) #随机五位小数
indexList = ["a","b","c","d","e"]
s2 = pd.Series(list2,index=indexList)
print(s2)
'''
a 3.198744
b 0.214999
c -0.272966
d 0.089220
e -0.895139
dtype: float64
'''
#3.从字典创建
dict1 = {"a":1,"b":2,"c":3,"d":4,"e":5}
s3 = pd.Series(dict1)
print(s3)
'''
a 1
b 2
c 3
d 4
e 5
dtype: int64
'''
def seriesBaseOperate():
list1 = [1,2,3,4,5]
s1 = pd.Series(list1)
s1.index = ["A","B","C","D","E"] #将下标12345,改为ABCDE
print(s1)
'''
A 1
B 2
C 3
D 4
E 5
dtype: int64
'''
list2 = np.random.randn(5)
indexList = ["a", "b", "c", "d", "e"]
s2 = pd.Series(list2,index=indexList)
print(s2)
'''
a -1.273575
b -1.074655
c -0.772257
d 0.694503
e 1.254038
dtype: float64
'''
d = {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5}
s3 = pd.Series(d)
s4 = s3.append(s1) # 将s1拼接到s3,注意不能拼接单个元素
print(s4)
'''
a 1
b 2
c 3
d 4
e 5
A 1
B 2
C 3
D 4
E 5
dtype: int64
'''
s4 = s4.drop("e") # 删除索引为e的值
s4["A"] = "JASON" # 修改索引A的值
print(s4["A"]) # 查值
print(s4[0:3]) # 切片
'''
JASON
a 1
b 2
c 3
dtype: object
'''
#Series运算操作
def operatingSeries():
list1 = [1,2,3,4,5]
s1 = pd.Series(list1)
n = np.random.randn(5)
indexList = ["a","b","c","d","e"]
indexList1 = [1,2,3,4,5]
s2 = pd.Series(n,index = indexList1)
print(s1.add(s2)) #Series 的加法运算是按照索引计算,如果索引不同则填充为 `NaN`(空值)
'''
0 NaN
1 1.392250
2 2.856717
3 4.037887
4 3.496885
5 NaN
dtype: float64
'''
print(s1.sub(s2))
'''
0 NaN
1 2.607750
2 3.143283
3 3.962113
4 6.503115
5 NaN
dtype: float64
'''
print(s1.mul(s2))
'''
0 NaN
1 -1.215499
2 -0.429849
3 0.151548
4 -7.515576
5 NaN
dtype: float64
'''
print(s1.div(s2))
'''
0 NaN
1 -3.290829
2 -20.937609
3 105.577265
4 -3.326425
5 NaN
dtype: float64
'''
print(s1.median()) #中位数 3.0
print(s1.max()) # 5
print(s1.sum()) # 15
if __name__ == "__main__":
# createSeries()
# seriesBaseOperate()
operatingSeries()
3.2 DataFrame 二维数组 <===> ndarray
# -*- coding: UTF-8 -*-
'''
二维数组DataFrame <==> ndarray
'''
import pandas as pd
import numpy as np
from pandas import Series
class DF(object):
def __init__(self):
dates = pd.date_range("today", periods=6) # 定义时间序列作为index
num_arr = np.random.randn(6, 4) # 传入nunpy的随机小数数组
colnum = ["A", "B", "C", "D"]
self.df1 = pd.DataFrame(data=num_arr, index=dates, columns=colnum)
self.dataInfo = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
self.df2 = pd.DataFrame(data=self.dataInfo, index=labels)
def createDataFrame(self):
print(self.df1)
'''
A B C D
2018-12-05 19:26:33.079050 1.238997 1.178291 0.682551 0.083252
2018-12-06 19:26:33.079050 -0.682866 0.168864 -0.702818 -1.183783
2018-12-07 19:26:33.079050 -0.042540 1.595481 -0.157319 -1.531944
2018-12-08 19:26:33.079050 -1.397062 -0.853874 -0.365774 -0.865814
2018-12-09 19:26:33.079050 0.997983 -0.871088 0.856143 -0.322108
2018-12-10 19:26:33.079050 0.134739 -0.886856 -0.731986 -0.975596
'''
print(self.df2)
'''
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
d dog NaN 3 yes
e dog 5.0 2 no
f cat 2.0 3 no
g snake 4.5 1 no
h cat NaN 1 yes
i dog 7.0 2 no
'''
print(self.df2.shape) # 属性(10,4) #10行4列
def dataFrame(self):
'''
dataFrmme的各种方法
:return: None
'''
# print(self.df1)
# print(self.df2.head()) #1.head(n) 查看前几个值,默认前5
'''
animal age visits priority
a cat 2.5 1 yes
b cat 3.0 3 yes
c snake 0.5 2 no
d dog NaN 3 yes
e dog 5.0 2 no
'''
# print(self.df2.tail(3)) #查看后3个的值
'''
animal age visits priority
h cat NaN 1 yes
i dog 7.0 2 no
j dog 3.0 1 no
'''
# print(self.df2.columns) #2.查看列名
'''Index(['animal', 'age', 'visits', 'priority'], dtype='object')'''
# print(self.df2.values) #查看值
'''
[['cat' 2.5 1 'yes']
['cat' 3.0 3 'yes']
['snake' 0.5 2 'no']
['dog' nan 3 'yes']
['dog' 5.0 2 'no']
['cat' 2.0 3 'no']
['snake' 4.5 1 'no']
['cat' nan 1 'yes']
['dog' 7.0 2 'no']
['dog' 3.0 1 'no']]
'''
# print(self.df2.index) #查看索引
'''
Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')
'''
# print(self.df2.describe()) #3.查看数据统计
'''
age visits
count 8.000000 10.000000
mean 3.437500 1.900000
std 2.007797 0.875595
min 0.500000 1.000000
25% 2.375000 1.000000
50% 3.000000 2.000000
'''
# print(self.df2.T)#4.转置操作
'''
a b c d e f g h i j
animal cat cat snake dog dog cat snake cat dog dog
age 2.5 3 0.5 NaN 5 2 4.5 NaN 7 3
visits 1 3 2 3 2 3 1 1 2 1
priority yes yes no yes no no no yes no no
'''
# print(self.df2['age']) #5.通过标签查询
'''
a 2.5
b 3.0
c 0.5
d NaN
e 5.0
f 2.0
g 4.5
h NaN
i 7.0
j 3.0
Name: age, dtype: float64
'''
# print(self.df2.iloc[1:3]) #6.查询2,3行
'''
animal age visits priority
b cat 3.0 3 yes
c snake 0.5 2 no
'''
# print(self.df2.iat[1,0]) #7.按照坐标查询 2行1列的数据
'''
cat
'''
# print(self.df2.loc["f","age"])#8.按照标签和索引 f行标签为age的数据
'''
2.0
'''
#9.添加列数据(先创建一个Series,然后添加)
df3 = self.df2
num = Series([0,1,2,3,4,5,6,7,8,9],index=df3.index)
df3['No.'] = num #添加以No.为列名的新数据
# print(df3)
'''
animal age visits priority No.
a cat 2.5 1 yes 0
b cat 3.0 3 yes 1
c snake 0.5 2 no 2
d dog NaN 3 yes 3
e dog 5.0 2 no 4
f cat 2.0 3 no 5
g snake 4.5 1 no 6
h cat NaN 1 yes 7
i dog 7.0 2 no 8
j dog 3.0 1 no 9
'''
#10.删除数据
df22 = self.df2
df23 = df22.drop("age",axis=1) #delete table where yid = "age"
# print(df23)
'''
nimal visits priority No.
a cat 1 yes 0
b cat 3 yes 1
c snake 2 no 2
d dog 3 yes 3
e dog 2 no 4
f cat 3 no 5
g snake 1 no 6
h cat 1 yes 7
i dog 2 no 8
j dog 1 no 9
'''
df24 = df22.drop(["age","No."],axis=1)#相当于delete table where yid = "age" or yid = "No."
# print(df24)
'''
animal visits priority
a cat 1 yes
b cat 3 yes
c snake 2 no
d dog 3 yes
e dog 2 no
f cat 3 no
g snake 1 no
h cat 1 yes
i dog 2 no
j dog 1 no
'''
df25 = df22.dropna(how="any") #how参数any,只要有缺失值就删除
# print(df25)
'''
animal age visits priority No.
a cat 2.5 1 yes 0
b cat 3.0 3 yes 1
c snake 0.5 2 no 2
e dog 5.0 2 no 4
f cat 2.0 3 no 5
g snake 4.5 1 no 6
i dog 7.0 2 no 8
j dog 3.0 1 no 9
'''
#11.缺失值处理(填充)
df4 = df3.fillna(value=3) #值为Nan地方填充为3,可以df3对比,返回值,不能直接改变df3
# print(df4)
'''
animal age visits priority No.
a cat 2.5 1 yes 0
b cat 3.0 3 yes 1
c snake 0.5 2 no 2
d dog 3.0 3 yes 3
e dog 5.0 2 no 4
f cat 2.0 3 no 5
g snake 4.5 1 no 6
h cat 3.0 1 yes 7
'''
#12.条件查找
# print(self.df2[self.df2["age"] < 3]) #注意Nan不参与计算
'''
animal age visits priority No.
a cat 2.5 1 yes 0
c snake 0.5 2 no 2
f cat 2.0 3 no 5
'''
# print(self.df2[(self.df2["animal"] == "cat") & (self.df2["age"]> 2)])
'''
animal age visits priority No.
a cat 2.5 1 yes 0
b cat 3.0 3 yes 1
'''
# print(self.df2[self.df2["animal"].isin(["cat","dog"])]) #animal索引中包含cat,dog的
'''
animal age visits priority No.
a cat 2.5 1 yes 0
b cat 3.0 3 yes 1
d dog NaN 3 yes 3
e dog 5.0 2 no 4
f cat 2.0 3 no 5
h cat NaN 1 yes 7
i dog 7.0 2 no 8
j dog 3.0 1 no 9
'''
#13.行列索引切片
# print(self.df2.iloc[2:4,1:3]) #2-4行,1-3列的数据
'''
age visits
c 0.5 2
d NaN 3
'''
#14.排序操作
# print(self.df2.sort_values(by=["age","visits"],ascending=[False,True]))
#根据age降序,visits升序排列,根据sql经验会报错,这里竟然没
'''
animal age visits priority No.
i dog 7.0 2 no 8
e dog 5.0 2 no 4
g snake 4.5 1 no 6
j dog 3.0 1 no 9
b cat 3.0 3 yes 1
a cat 2.5 1 yes 0
f cat 2.0 3 no 5
c snake 0.5 2 no 2
h cat NaN 1 yes 7
d dog NaN 3 yes 3
'''
#15.DataFrame 多值替换
# print(self.df2["priority"].map({"yes":1,"no":0})) #将yes替换为1,no替换为0
'''
a 1
b 1
c 2
d 1
e 2
f 2
g 2
h 1
i 2
j 2
Name: priority, dtype: int64
'''
#16.分组操作
print(self.df2.groupby("animal").sum())
'''
age visits No.
animal
cat 7.5 8 13
dog 15.0 8 24
snake 5.0 3 8
'''
if __name__ == "__main__":
df = DF()
# df.createDataFrame()
df.dataFrame()
参照:《基于Python的大数据分析基础及实战》