Pandas的常用数据类型
Series一维,带标签的数组
DataFrame二维,Series容器
一、Series
Series对象本质上是有两个数组组成,一个数组构成对象的键(index),一个数组构成对象的值(values)
import string
import pandas as pd
import numpy as np
# 创建Series
t1 = pd.Series(np.arange(5),index=list("abcde"))
print(t1)
"""
索引可以指定,默认为012...
a 0
b 1
c 2
d 3
e 4
dtype: int64
"""
print(type(t1)) #
# 使用字典创建Series
a = {string.ascii_uppercase[i]:i for i in range(5)}
# 创建Series
print(pd.Series(a))
"""
A 0
B 1
C 2
D 3
E 4
dtype: int64
"""
print(pd.Series(a,index=list("CDEFG")))
"""
C 2.0
D 3.0
E 4.0
F NaN
G NaN
dtype: float64
"""
# 切片
print(t1[0:4:2])
"""
a 0
c 2
dtype: int64
"""
print(t1[[2,3,4]])
"""
c 2
d 3
e 4
dtype: int64
"""
print(t1[t1>2])
"""
d 3
e 4
dtype: int64
"""
print(t1["b"]) # 1
print(t1[["a","e","f"]])
"""
a 0.0
e 4.0
f NaN
dtype: float64
"""
# 索引和值
print(t1.index) # Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
print(type(t1.index)) #
print(t1.values) # [0 1 2 3 4]
print(type(t1.values)) #
二、DataFrame
创建DataFrame
# 创建DataFrame对象
t1 = pd.DataFrame(np.arange(12).reshape(3,4))
print(t1)
"""
DataFrame对象既有行索引,又有列索引
行索引,表明不同行,横向索引,叫index,0轴,axis=0
列索引,表名不同列,纵向索引,叫columns,1轴,axis=1
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
"""
t2 = pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("EFGH"))
print(t2)
"""
E F G H
a 0 1 2 3
b 4 5 6 7
c 8 9 10 11
"""
# 将字典转换成dataframe
temp_dict = [{"name":"zhangsan","age":15,"tel":10086},
{"name":"lisi","age":15},
{"name":"wangwu","tel":10086}
]
t3 = pd.DataFrame(temp_dict)
print(t3)
"""
age name tel
0 15.0 zhangsan 10086.0
1 15.0 lisi NaN
2 NaN wangwu 10086.0
"""
获取DataFrame的基本信息
# 获取DataFrame的基本信息
# 行数,列数
print(t1.shape)
# 列数据类型
print(t1.dtypes)
# 数据维度
print(t1.ndim) # 2
# 行索引
print(t1.index) # RangeIndex(start=0, stop=3, step=1)
# 列索引
print(t2.columns) # Index(['E', 'F', 'G', 'H'], dtype='object')
# 对象值
print(t1.values)
"""
[[ 0 1 2 3]
[ 4 5 6 7]
[ 8 9 10 11]]
"""
# 显示头几行,默认是5
print(t1.head(2))
# 显示末尾几行
print(t1.tail(2))
# 相关信息概览:行数,列数,列索引,咧非空值个数,行列类型,内存占用
print(t1.info())
"""
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
0 3 non-null int64
1 3 non-null int64
2 3 non-null int64
3 3 non-null int64
dtypes: int64(4)
memory usage: 176.0 bytes
None
"""
# 快速综合统计结果:计数,均值,标准差,最大值,1/4值,最小值
print(t2.describe())
"""
是根据列来计算的
E F G H
count 3.0 3.0 3.0 3.0
mean 4.0 5.0 6.0 7.0
std 4.0 4.0 4.0 4.0
min 0.0 1.0 2.0 3.0
25% 2.0 3.0 4.0 5.0
50% 4.0 5.0 6.0 7.0
75% 6.0 7.0 8.0 9.0
max 8.0 9.0 10.0 11.0
"""
加载csv数据
import numpy as np
import pandas as pd
# 加载csv数据
t = pd.read_csv("./dogNames2.csv")
# 按照字段进行排序,ascending desc/asc
t2 = t.sort_values("Count_AnimalName",ascending=False).head(10)
print(t2)
获取行列数据
import string
import numpy as np
import pandas as pd
# loc和iloc方法
# df.loc:通过标签获取行数据
# df.iloc:通过位置获取行数据
t1 = pd.DataFrame(np.arange(12).reshape(3,4),index=list("abc"),columns=list("EFGH"))
# 获取行列交叉部分
print(t1.loc["a","E"]) # 0
# 获取行与多列交叉部分
print(t1.loc["a",["E","F"]])
"""
E 0
F 1
"""
# 获取行与多列交叉部分
print(t1.loc["a","E":"G"])
"""
E 0
F 1
G 2
"""
# 获取行与连续多列交叉部分
print(t1.loc["a":"c","G"])
"""
注意:loc里的:是包括最后的那个的
a 2
b 6
c 10
"""
# iloc和loc是一样的,只不过采用的是索引来进行的操作
布尔索引
import numpy as np
import pandas as pd
# pandas中的布尔索引
t1 = pd.read_csv("./dogNames2.csv")
# 找出其中名字使用次数超过800的狗
print(t1[t1["Count_AnimalName"]>800])
"""
Row_Labels Count_AnimalName
1156 BELLA 1195
2660 CHARLIE 856
3251 COCO 852
9140 MAX 1153
12368 ROCKY 823
"""
# 找出狗名字符串长度超过4的狗
print(t1[t1["Row_Labels"].str.len()>4].head(3))
"""
Row_Labels Count_AnimalName
2 40804 1
3 90201 1
4 90203 1
"""
# 多条件,要使用()分割,&或|做连接符
print(t1[(t1["Row_Labels"].str.len()>4)&(t1["Row_Labels"].str.len()<6)].head(3))
字符串方法
处理缺失数据
import pandas as pd
import numpy as np
temp_dict = [{"name":"zhangsan","age":0,"tel":10086},
{"name":"lisi","age":15},
{"name":"wangwu","tel":10010}]
t1 = pd.DataFrame(temp_dict)
print(t1)
"""
age name tel
0 15.0 zhangsan 10086.0
1 15.0 lisi NaN
2 NaN wangwu 10010.0
"""
## 缺失数据的处理
# - 处理方式1:删除NaN所在的行列dropna (axis=0, how='any', inplace=False)
t2 = t1.dropna(axis=0, how='any', inplace=False)
print(t2)
"""
age name tel
0 15.0 zhangsan 10086.0
"""
# - 处理方式2:填充数据,t.fillna(t.mean()),t.fiallna(t.median()),t.fillna(0)
t3 = t1.fillna(t1.mean())
print(t3)
"""
age name tel
0 15.0 zhangsan 10086.0
1 15.0 lisi 10048.0
2 15.0 wangwu 10010.0
"""
### 处理为0的数据:将0改为nan,然后使用上面的方法进行填充
t1[t1==0] = np.nan
print(t1)
### 查看是否为nan,返回布尔索引
print(pd.isnull(t1))
"""
age name tel
0 True False False
1 False False True
2 True False False
"""
print(pd.notnull(t1))
"""
age name tel
0 False True True
1 True True False
2 False True True
"""