目录
一、样本数据student.csv
NO | Name | class | age | score01 | score02 | address |
1 | student01 | 1 | 25 | 70 | 84 | 北京 |
2 | student02 | 2 | 19 | 71 | 85 | |
3 | student03 | 3 | 20 | 72 | 86 | 天津 |
4 | student04 | 1 | 21 | 73 | 87 | |
5 | student05 | 2 | 25 | 67 | 88 | |
6 | student06 | 3 | 68 | 89 | 河北 | |
7 | student07 | 1 | 73 | 90 | ||
8 | student08 | 2 | 16 | 74 | 91 | |
9 | student09 | 3 | 19 | 75 | 92 | 山东 |
10 | student10 | 3 | 76 | 93 |
二、Pandas读取文件
import pandas as pd
# 读取CSV文件,读取其他格式文件都是read_开头的方法
csv_data = pd.read_csv("student.csv",encoding="gbk")
# Pandas读取数据类型是DataFrame
print(type(csv_data))
# 运行结果如下:
# <class 'pandas.core.frame.DataFrame'>
# 字段数据类型有
print(csv_data.dtypes)
# 运行结果如下:
# NO int64
# Name object
# class int64
# age float64
# score01 int64
# score02 int64
# address object
# dtype: object
# 查看read_csv()方法
print(help(pd.read_csv))
三、Pandas 查看数据基本信息
# head(3)查看数据的前3行(默认值是5,不传值默认展示前5行)
print(csv_data.head(3))
# 运行结果如下:
# NO Name class age score01 score02 address
# 0 1 student01 1 25.0 70 84 北京
# 1 2 student02 2 19.0 71 85 NaN
# 2 3 student03 3 20.0 72 86 天津
# 查看数据维度信息
print(csv_data.shape)
# 运行结果如下:
# (10, 7)
# tail(3)查看数据的倒数3条(默认值是5,不传值默认展示后5行)
print(csv_data.tail(3))
# 运行结果如下:
# NO Name class age score01 score02 address
# 7 8 student08 2 16.0 74 91 NaN
# 8 9 student09 3 19.0 75 92 山东
# 9 10 student10 3 NaN 76 93 NaN
# 获取数据列的标题
print(csv_data.columns)
# 运行结果如下:
# Index(['NO', 'Name', 'class', 'age', 'score01', 'score02', 'address'], dtype='object')
# 获取指定一条数据
print(csv_data.loc[0])
# 运行结果如下
# NO 1
# Name student01
# class 1
# age 25
# score01 70
# score02 84
# address 北京
# Name: 0, dtype: object
# 返回切片值
print(csv_data.loc[2:5])
# 运行结果如下:
# NO Name class age score01 score02 address
# 2 3 student03 3 20.0 72 86 天津
# 3 4 student04 1 21.0 73 87 NaN
# 4 5 student05 2 25.0 67 88 NaN
# 5 6 student06 3 NaN 68 89 河北
# 返回列值
print(csv_data[['NO', 'Name']])
# 运行结果如下:
# NO Name
# 0 1 student01
# 1 2 student02
# 2 3 student03
# 3 4 student04
# 4 5 student05
# 5 6 student06
# 6 7 student07
# 7 8 student08
# 8 9 student09
# 9 10 student10
四、Pandas 基本运算
# 加减乘除
print(csv_data['score01'])
# 运行结果如下:
# 0 70
# 1 71
# 2 72
# 3 73
# 4 67
# 5 68
# 6 73
# 7 74
# 8 75
# 9 76
# Name: score01, dtype: int64
# 数据中每个元素与2相除,其他同理是针对数据中的元素进行加减乘除运算
print(csv_data['score01']/2)
# 运行结果如下:
# 0 35.0
# 1 35.5
# 2 36.0
# 3 36.5
# 4 33.5
# 5 34.0
# 6 36.5
# 7 37.0
# 8 37.5
# 9 38.0
# Name: score01, dtype: float64
# max\min\mean等函数
print(csv_data['score01'].max())
# 运行结果如下
# 76
# set_values()第一个参数是排序列、第二列是为True改变原数组,False不改变原数组,第三列为False降序为True升序
csv_data.sort_values("score01", inplace=True, ascending=False )
print(csv_data)
# 运行结果如下:
# NO Name class age score01 score02 address
# 9 10 student10 3 NaN 76 93 NaN
# 8 9 student09 3 19.0 75 92 山东
# 7 8 student08 2 16.0 74 91 NaN
# 3 4 student04 1 21.0 73 87 NaN
# 6 7 student07 1 NaN 73 90 NaN
# 2 3 student03 3 20.0 72 86 天津
# 1 2 student02 2 19.0 71 85 NaN
# 0 1 student01 1 25.0 70 84 北京
# 5 6 student06 3 NaN 68 89 河北
# 4 5 student05 2 25.0 67 88 NaN
# 查找缺失值
age = csv_data['age']
age_is_null = pd.isnull(age)
age_null_true = age[age_is_null]
age_null_count = len(age_null_true)
print(age_null_count)
# 运行结果如下:
# 3
五、pvoit_table函数
# 计算age均值
# 直接计算因为有NaN空值
age_mean = sum(csv_data['age'])/len(csv_data['age'])
print(age_mean)
# 运行结果如下
# nan NaN与任何数据计算结果都是NaN
# 去除空值进行计算
age_is_not_null = csv_data['age'][csv_data['age'].isnull()==False]
age_mean = sum(age_is_not_null)/len(age_is_not_null)
print(age_mean)
# 运行结果如下:
# 20.714285714285715
# pandas自带mean方法计算逻辑是计算非空值均值
print(csv_data['age'].mean())
# 运行结果如下:
# 20.714285714285715
# 根据一个指标去分析数据
# 计算每个班级的平均分
# 班级集合
stu_cl = set(csv_data['class'])
print(stu_cl)
# 运行结果如下:
# {1, 2, 3}
score_cl = {}
for cl in stu_cl:
this_class_stu = csv_data[csv_data['class']==cl]
score_cl[cl] = this_class_stu['score01'].mean()
print(score_cl)
# 运行结果如下:
# {1: 72.0, 2: 70.66666666666667, 3: 72.75}
# 利用pivot计算每个班的平均分
score_cl_pivot = csv_data.pivot_table(index="class", values="score01")
print(score_cl_pivot)
# 运行结果如下:
# class score01
# 1 72.000000
# 2 70.666667
# 3 72.750000
六、删除确实值
# 删除缺失值
# 按列删除有空值的列
temp = csv_data.dropna(axis=1)
print(temp)
# 运行结果如下:
# NO Name class score01 score02
# 9 10 student10 3 76 93
# 8 9 student09 3 75 92
# 7 8 student08 2 74 91
# 3 4 student04 1 73 87
# 6 7 student07 1 73 90
# 2 3 student03 3 72 86
# 1 2 student02 2 71 85
# 0 1 student01 1 70 84
# 5 6 student06 3 68 89
# 4 5 student05 2 67 88
# 按行删除有空值的行
temp = csv_data.dropna(axis=0, subset=['address'])
print(temp)
# 运行结果如下:
# NO Name class age score01 score02 address
# 8 9 student09 3 19.0 75 92 山东
# 2 3 student03 3 20.0 72 86 天津
# 0 1 student01 1 25.0 70 84 北京
# 5 6 student06 3 NaN 68 89 河北
# 数据定位
print(csv_data.loc[3, "Name"])
# 运行结果如下:
# student04
print(csv_data.loc[3,["Name","score01"]])
# 运行结果如下
# Name student04
# score01 73
# Name: 3, dtype: object
七、修改排序后索引
# 排序,根据Age排序,展示索引是排序前的索引
print(csv_data.sort_values('age',ascending=False))
# 运行结果如下:
# NO Name class age score01 score02 address
# 0 1 student01 1 25.0 70 84 北京
# 4 5 student05 2 25.0 67 88 NaN
# 3 4 student04 1 21.0 73 87 NaN
# 2 3 student03 3 20.0 72 86 天津
# 8 9 student09 3 19.0 75 92 山东
# 1 2 student02 2 19.0 71 85 NaN
# 7 8 student08 2 16.0 74 91 NaN
# 9 10 student10 3 NaN 76 93 NaN
# 6 7 student07 1 NaN 73 90 NaN
# 5 6 student06 3 NaN 68 89 河北
# 修改索引,删除旧索引建立新索引
print(csv_data.sort_values('age',ascending=False).reset_index(drop=True))
# 运行结果如下:
# NO Name class age score01 score02 address
# 0 1 student01 1 25.0 70 84 北京
# 1 5 student05 2 25.0 67 88 NaN
# 2 4 student04 1 21.0 73 87 NaN
# 3 3 student03 3 20.0 72 86 天津
# 4 9 student09 3 19.0 75 92 山东
# 5 2 student02 2 19.0 71 85 NaN
# 6 8 student08 2 16.0 74 91 NaN
# 7 10 student10 3 NaN 76 93 NaN
# 8 7 student07 1 NaN 73 90 NaN
# 9 6 student06 3 NaN 68 89 河北
八、DataFrame的apply函数
# apply函数,对DataFrame进行列操作或者行操作
def get_avg_score(row):
mean_score = (row["score01"]+row["score02"])/2
return mean_score
# 对行进行操作
print(csv_data.apply(get_avg_score,axis=1).reset_index(drop=True))
# 运行结果如下:
# 0 84.5
# 1 83.5
# 2 82.5
# 3 80.0
# 4 81.5
# 5 79.0
# 6 78.0
# 7 77.0
# 8 78.5
# 9 77.5
# dtype: float64
# 对列数据进行操作
def column_null_count(column):
column_null = pd.isnull(column)
null = column[column_null]
return len(null)
print(csv_data.apply(column_null_count))
# 运行结果如下:
# NO 0
# Name 0
# class 0
# age 3
# score01 0
# score02 0
# address 6
# dtype: int64
九、DataFrame的子结构series
# series
# 从DataFrame中获取series
series_data = csv_data['Name']
print(type(series_data))
# 运行结果如下:
# <class 'pandas.core.series.Series'>
print(series_data.loc[2:4])
# 运行结果如下:
# 2 student03
# 1 student02
# 0 student01
# 5 student06
# 4 student05
# Name: Name, dtype: object
# 自己创建
from pandas import Series
print(csv_data)
# 运行结果如下:
# NO Name class age score01 score02 address
# 9 10 student10 3 NaN 76 93 NaN
# 8 9 student09 3 19.0 75 92 山东
# 7 8 student08 2 16.0 74 91 NaN
# 3 4 student04 1 21.0 73 87 NaN
# 6 7 student07 1 NaN 73 90 NaN
# 2 3 student03 3 20.0 72 86 天津
# 1 2 student02 2 19.0 71 85 NaN
# 0 1 student01 1 25.0 70 84 北京
# 5 6 student06 3 NaN 68 89 河北
# 4 5 student05 2 25.0 67 88 NaN
new_series = Series(csv_data["score01"].values, index=csv_data["Name"].values)
print(new_series)
# 运行结果如下:
# student10 76
# student09 75
# student08 74
# student04 73
# student07 73
# student03 72
# student02 71
# student01 70
# student06 68
# student05 67
# dtype: int64
stu_name = new_series.index.tolist()
new_index = sorted(stu_name)
print(new_index)
# 运行结果如下:
# ['student01', 'student02', 'student03', 'student04', 'student05', 'student06', 'student07', 'student08', 'student09', 'student10']
print(new_series.reindex(new_index))
# 运行结果如下:
# student01 70
# student02 71
# student03 72
# student04 73
# student05 67
# student06 68
# student07 73
# student08 74
# student09 75
# student10 76
# dtype: int64
# 索引排序
print(new_series.sort_index())
# 运行结果如下:
# student01 70
# student02 71
# student03 72
# student04 73
# student05 67
# student06 68
# student07 73
# student08 74
# student09 75
# student10 76
# dtype: int64
# 值排序
print(new_series.sort_values())
# 运行结果如下:
# student05 67
# student06 68
# student01 70
# student02 71
# student03 72
# student04 73
# student07 73
# student08 74
# student09 75
# student10 76
# dtype: int64