python数据分析之pandas
上一篇: python数据分析之numpy
1. 为什么要学习 pandas ?
上一篇已经学习了numpy,它能够帮助我们处理数值型数据;但实际的数据除了数值之外,还有字符串、时间序列等类型,而pandas正是为处理这类带标签的表格型数据而设计的,所以还需要学习pandas。况且多学一点也没啥坏处。
2. 什么是 pandas ?
pandas is an open source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.
3. pandas 的常用数据类型 ?
pandas的常用数据类型
- Series 一维,带标签的数组
- DataFrame 二维,Series容器
3.1 pandas之Series
主要内容有:Series创建、切片和索引、索引和值
本小节的代码:
# coding=utf-8
"""
Common pandas data types:
    Series     one-dimensional labelled array
    DataFrame  two-dimensional container of Series
"""
import pandas as pd

# 1. Creating a Series
t0 = pd.Series([1, 21, 34, 14, 5])
print(t0)
print(type(t0))
# Pass index= to supply explicit labels instead of the default 0..n-1.
t1 = pd.Series([1, 6, 3, 2, 7], index=list("abcde"))
print(t1)
# Create from a dict: the keys become the index labels.
temp_dict = {"name": "xiaoxiong", "age": 30, "tel": 10086}
t2 = pd.Series(temp_dict)
print(t2)
print(t2.dtype)
print(t1.astype(float))  # astype returns a converted copy
print(t1)  # t1 itself still has its original dtype

# 2. Slicing and indexing a Series
print(t2["age"])  # select by label
# Positional access goes through .iloc: integer keys passed to [] on a
# Series whose index is made of strings are deprecated as positions.
print(t2.iloc[1])  # select by position
print(t2.iloc[:2])  # first two entries
print(t2.iloc[[1, 2]])  # several positions at once
print(t2[["age", "tel"]])  # several labels at once
print(t0[t0 > 10])  # boolean mask: values greater than 10

# 3. Index and values of a Series
# Working with the index
print(t2.index)
for i in t2.index:
    print(i)
print(type(t2.index))
print(len(t2.index))
print(list(t2.index)[:2])
# Working with the values
print(t2.values)
print(t2.values[:2])
print(list(t2.values)[:2])
print(type(t0.values))
print(t2.index)
3.2 pandas 之读取外部数据
pandas 可以读取 csv、txt、sql 等多种来源的数据,这里就不多说了,用到的时候可以自行百度。
# coding=utf-8
import pandas as pd

# Load the CSV file next to the script into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer='./books.csv')
print(df)
# Per the original notes, many kinds of sources can be read, for example:
#   txt
#   csv
#   sql
#   mongodb
#   ...
3.3 pandas 之 DataFrame
3.3.1 DataFrame创建
下面的用法都可以对照 DataFrame 的源码来理解;其实很多时候遇到不懂的地方,直接查看源码就可以了。
# coding=utf-8
import pandas as pd
import numpy as np

# A 3x4 DataFrame of the numbers 0..11 with default row/column labels.
data = pd.DataFrame(data=np.arange(12).reshape(3, 4))
print(data)

# The same values, this time with explicit row labels (a-c) and
# column labels (w-z).
t = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
    index=["a", "b", "c"],
    columns=["w", "x", "y", "z"],
)
print(t)

# Build a DataFrame from a dict of lists: each key becomes a column.
print(100 * "*")
temp_dict = {
    "name": ["xiaoxiong", "xiaogan"],
    "age": [30, 24],
    "tel": [10086, 10010],
}
t1 = pd.DataFrame(data=temp_dict)
print(t1)

# Build a DataFrame from a list of dicts: each dict becomes a row.
# The last record has no "tel" key, so that cell ends up as NaN.
print(100 * "*")
d2 = [
    {"name": "xiaoxiong", "age": 30, "tel": 10086},
    {"name": "xiaoxiong", "age": 30, "tel": 10000},
    {"name": "xiaoxiong", "age": 30},
]
print(d2)
t2 = pd.DataFrame(data=d2)
print(t2)
3.3.2 DataFrame的基础属性和常用方法
# coding=utf-8
import pandas as pd
import numpy as np

# A 3x4 DataFrame of the numbers 0..11 with default row/column labels.
data = pd.DataFrame(data=np.arange(12).reshape(3, 4))
print(data)

# The same values with explicit row labels (a-c) and column labels (w-z).
t = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
    index=["a", "b", "c"],
    columns=["w", "x", "y", "z"],
)
print(t)

# Build a DataFrame from a dict of lists: each key becomes a column.
print(100 * "*")
temp_dict = {
    "name": ["xiaoxiong", "xiaogan"],
    "age": [30, 24],
    "tel": [10086, 10010],
}
t1 = pd.DataFrame(data=temp_dict)
print(t1)

# Build a DataFrame from a list of dicts: each dict becomes a row.
# The last record has no "tel" key, so that cell ends up as NaN.
print(100 * "*")
d2 = [
    {"name": "xiaoxiong", "age": 30, "tel": 10086},
    {"name": "xiaoxiong", "age": 30, "tel": 10000},
    {"name": "xiaoxiong", "age": 30},
]
print(d2)
t2 = pd.DataFrame(data=d2)
print(t2)

# Basic attributes of a DataFrame
print(100 * "*")
print(t2.shape)    # (number of rows, number of columns)
print(t2.ndim)     # number of dimensions
print(t2.index)    # row labels
print(t2.columns)  # column labels
print(t2.dtypes)   # dtype of every column
# Example output. "tel" is float64 rather than int64 because the missing
# value in the third row is stored as NaN. The exact dtype shown for the
# column index may vary between pandas versions.
#   (3, 3)
#   2
#   RangeIndex(start=0, stop=3, step=1)
#   Index(['name', 'age', 'tel'], dtype='object')
#   name     object
#   age       int64
#   tel     float64
#   dtype: object
# Common DataFrame methods, demonstrated on the books dataset.
print("*" * 100)
df = pd.read_csv('./books.csv')
print(df.head(3))  # first 3 rows
# Sample output recorded by the author:
#      id  ...                                    small_image_url
#   0   1  ...  https://images.gr-assets.com/books/1447303603s...
#   1   2  ...  https://images.gr-assets.com/books/1474154022s...
#   2   3  ...  https://images.gr-assets.com/books/1361039443s...
#   [3 rows x 23 columns]
print(df.tail(3))  # last 3 rows
# Sample output recorded by the author:
#            id  ...                                    small_image_url
#   9997   9998  ...  https://images.gr-assets.com/books/1455373531s...
#   9998   9999  ...  https://images.gr-assets.com/books/1279214118s...
#   9999  10000  ...  https://images.gr-assets.com/books/1403194704s...
#   [3 rows x 23 columns]
# info() writes its summary to stdout itself and returns None, so it is
# called directly; print(df.info) would only show the bound method.
df.info()
print(df.describe())  # summary statistics
# Sample output recorded by the author:
#                   id       book_id  ...     ratings_4     ratings_5
#   count  10000.00000  1.000000e+04  ...  1.000000e+04  1.000000e+04
#   mean    5000.50000  5.264697e+06  ...  1.996570e+04  2.378981e+04
#   std     2886.89568  7.575462e+06  ...  5.144736e+04  7.976889e+04
#   min        1.00000  1.000000e+00  ...  7.500000e+02  7.540000e+02
#   25%     2500.75000  4.627575e+04  ...  5.405750e+03  5.334000e+03
#   50%     5000.50000  3.949655e+05  ...  8.269500e+03  8.836000e+03
#   75%     7500.25000  9.382225e+06  ...  1.602350e+04  1.730450e+04
#   max    10000.00000  3.328864e+07  ...  1.481305e+06  3.011543e+06
#   [8 rows x 16 columns]

# Sorting a DataFrame is done with df.sort_values().

# Notes on selecting rows or columns with square brackets:
# - a slice inside the brackets selects rows
# - a string inside the brackets selects a column
print("*" * 100)
print(df[:20])  # first 20 rows
print(df["books_count"])  # one column
print(type(df["books_count"]))
# pandas .loc
# df.loc selects data by label.
print("*" * 100)
t3 = pd.DataFrame(np.arange(12).reshape(3, 4), index=list("abc"), columns=list("wxyz"))
print(t3)
print(t3.loc["a", "z"])  # a single cell
print(type(t3.loc["a", "z"]))
print(t3.loc["a"])
# Selecting rows
print(t3.loc["a", :])
print(t3.loc[["a", "c"]])  # equivalent to t3.loc[["a", "c"], :]
# Selecting columns
print(t3.loc[:, "y"])
print(t3.loc[["a", "b"], ["y", "w"]])

# pandas .iloc
# df.iloc selects data by integer position.
print("*" * 100)
print(t3.iloc[1])
print(t3.iloc[1:])
print(t3.iloc[:, 2])
print(t3.iloc[[1, 2], [1, 2]])
print(t3.iloc[1:, 2:])
# Assigning through .iloc modifies t3 in place.
t3.iloc[1:, 2:] = 30
print(t3)
# NaN is a float and cannot be stored in an integer column. Cast the two
# affected columns explicitly instead of relying on silent upcasting
# during assignment, which pandas has deprecated.
t3 = t3.astype({"y": float, "z": float})
t3.iloc[1:, 2:] = np.nan
print(t3)
# pandas boolean indexing
print(100 * "*")
# Keep only the rows whose "books_count" value is greater than 300.
print(df.loc[df["books_count"] > 300])
# Handling missing values in pandas
#
# Detecting NaN: pd.isnull(df), pd.notnull(df)
# Option 1: drop the rows/columns that contain NaN:
#     dropna(axis=0, how='any', inplace=False)
# Option 2: fill the gaps:
#     t.fillna(t.mean()), t.fillna(t.median()), t.fillna(0)
# Zeros can be turned into missing values with: t[t == 0] = np.nan
# Not every zero needs this treatment. It matters because NaN is left out
# when computing a mean, whereas 0 is counted.
print("*" * 100)
# pd.isnull(t3) is the inverse check of pd.notnull used below.
print(t3[pd.notnull(t3["w"])])  # rows whose "w" value is not NaN
print(t3)
print(t3.dropna(axis=0, how="all", inplace=False))  # drop rows that are entirely NaN
print(t3.dropna(axis=0, how="any", inplace=False))  # drop rows with at least one NaN
print("*" * 100)
print(t2)
print(t2.fillna(0))
print(t2.fillna(10010))
# t2 also holds the string column "name"; restrict the mean to numeric
# columns, otherwise DataFrame.mean raises TypeError on pandas >= 2.0.
print(t2.fillna(t2.mean(numeric_only=True)))
# Fill column by column, writing the result back into t2.
t2["age"] = t2["age"].fillna(t2["age"].mean())
print(t2)
t2["tel"] = t2["tel"].fillna(t2["tel"].mean())
print(t2)
3.3.3 缺失数据的处理
对于NaN的数据,在numpy中我们是如何处理的?
在pandas中我们处理起来非常容易
判断数据是否为NaN:pd.isnull(df),pd.notnull(df)
- 处理方式1:删除NaN所在的行列,dropna(axis=0, how='any', inplace=False)
- 处理方式2:填充数据,t.fillna(t.mean()),t.fillna(t.median()),t.fillna(0)
处理为0的数据:t[t==0]=np.nan
当然并不是每次为0的数据都需要处理
计算平均值等情况,nan是不参与计算的,但是0会
代码:
# coding=utf-8
import pandas as pd
import numpy as np
# 创建DataFrame
data = pd.DataFrame(np.arange(12).reshape(3, 4))
print(data)
# 创建DataFrame,并为其添加索引
t = pd.DataFrame(np.arange(12).reshape(3, 4), index=list("abc"), columns=list("wxyz"))
print(t)
# 通过字典创建DataFrme
print("*"*100)
temp_dict = {
"name": ["xiaoxiong", "xiaogan"], "age": [30, 24], "tel": [10086, 10010]}
t1 = pd.DataFrame(temp_dict)
print(t1)
# 通过列表创建DataFrme
print("*"*100)
d2=[{
"name": "xiaoxiong", "age": 30, "tel": 10086},{
"name": "xiaoxiong", "age": 30, "tel": 10000},{
"name": "xiaoxiong", "age": 30}]
print(d2)
t2=pd.DataFrame(d2)
print(t2)
# 常用基础属性
print("*"*100)
print(t2.shape)
print(t2.ndim)
print(t2.index)
print(t2.columns)
print(t2.dtypes)
'''
(3, 3)
2
RangeIndex(start=0, stop=3, step=1)
Index(['name', 'age', 'tel'], dtype='object')
name object
age int64
tel int64
dtype: object
'''
# 常用方法
print("*"*100)
df=pd.read_csv('./books.csv')
print(df.head(3))
'''
id ... small_image_url
0 1 ... https://images.gr-assets.com/books/1447303603s...
1 2 ... https://images.gr-assets.com/books/1474154022s...
2 3 ... https://images.gr-assets.com/books/1361039443s...
[3 rows x 23 columns]
'''
print(df.tail(3))
'''
id ... small_image_url
9997 9998 ... https://images.gr-assets.com/books/1455373531s...
9998 9999 ... https://images.gr-assets.com/books/1279214118s...
9999 10000 ... https://images.gr-assets.com/books/1403194704s...
[3 rows x 23 columns]
'''
print(df.info)
print(df.describe())
'''
id book_id ... ratings_4 ratings_5
count 10000.00000 1.0