3.python数据分析处理库pandas（学习笔记）

最新推荐文章于 2024-09-13 11:18:00 发布

科研废喵爱吃鱼

最新推荐文章于 2024-09-13 11:18:00 发布

阅读量307

点赞数 1

文章标签：机器学习

本文链接：https://blog.csdn.net/qq_33929660/article/details/107204909

版权

1.数据的读取

import pandas 
# Load the table. A relative path is enough because the CSV is expected to sit
# in the same folder as this script.
food_info = pandas.read_csv("food_info.csv")
# read_csv returns a DataFrame: a table with ordered columns that may each hold
# a different value type, indexed by both rows and columns.
print(type(food_info))
# A DataFrame has no `dtype` attribute (only a Series does); `dtypes` reports
# the type of every column.
print(food_info.dtypes)
# help() prints the documentation itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
help(pandas.read_csv)
# The three calls below only display something in a notebook/REPL, where the
# value of the last expression is echoed.
food_info.head()  # first rows; five by default
food_info.head(3)  # first three rows
food_info.tail(4)  # last four rows, kept in their original order
print(food_info.columns)  # column labels
print(food_info.shape)  # (number of rows, number of columns)

2.索引数据

# Label-based lookup: the row whose index label is 0, with all of its columns.
print(food_info.loc[0])
# Common pandas dtypes: object -- string values; float -- float values;
# int -- integer values; datetime -- time values; bool -- boolean values.
food_info.loc[3:6]  # rows labelled 3 through 6; .loc slices include both ends
ndb_col = food_info["NDB_No"]  # a single column, selected by name
# Several columns at once: index with a list of column names.
columns = ["Zinc_(mg)", "Copper_(mg)"]
# The list is bound to lowercase `columns`; names are case-sensitive, so the
# lookup has to use exactly that spelling.
zinc_copper = food_info[columns]
print(zinc_copper)

3.索引特定的数据（例如单位为"g"的数据）

# Column labels as a plain Python list.
col_names = food_info.columns.tolist()
print(col_names)
# Keep only the labels that end with the "(g)" unit suffix.
gram_columns = [name for name in col_names if name.endswith("(g)")]
# Sub-table made of just those columns.
gram_df = food_info[gram_columns]
print(gram_df.head(3))  # first three rows

4.对pandas数据进行计算

print(food_info["Iron_(mg)"])
# Arithmetic with a scalar is applied to every element of the column.
div_1000 = food_info["Iron_(mg)"] / 1000
print(div_1000)
# Two columns of the same length are combined element by element.
# The calorie column is spelled "Energ_Kcal" everywhere else in this file;
# "Energy_Kcal" is presumably not a column of the CSV -- confirm against the
# file header.
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]

5.新加入一列数据（前提行数相同）

div_1000 = food_info["Iron_(mg)"] / 1000
# Shape before the new column is added (ASCII parentheses are required; the
# full-width "（" is a syntax error in Python).
print(food_info.shape)
# Assigning to a new key appends div_1000 to the table as column "Iron_(g)".
food_info["Iron_(g)"] = div_1000
# Shape after the assignment, for comparison with the one printed above.
print(food_info.shape)

6.常用函数

# Maximum lookup and normalization
calories = food_info["Energ_Kcal"]
max_calories = calories.max()  # largest value in the "Energ_Kcal" column
normalized_calories = calories / max_calories  # scale by the column maximum
protein = food_info["Protein_(g)"]
normalized_protein = protein / protein.max()  # same scaling for protein
# Sorting
# Ascending order is the default; inplace=True reorders food_info itself.
food_info.sort_values(by="Sodium_(mg)", inplace=True)
# ascending=False sorts from largest to smallest instead.
food_info.sort_values(by="Sodium_(mg)", ascending=False, inplace=True)

7.数据预处理实例

import pandas as pd
import numpy as np
titanic_survival = pd.read_csv("titanic_train.csv")
titanic_survival.head()

age = titanic_survival["Age"]
print(age.loc[0:10])  # labels 0 through 10; .loc slices include both ends
# Missing values -- detection
# pd.isnull gives a boolean mask: True where the value is missing.
age_is_null = pd.isnull(age)
print(age_is_null)
# Boolean indexing keeps only the entries whose mask value is True.
age_null_true = age[age_is_null]
print(age_null_true)
# Number of missing ages = length of that selection.
age_null_count = len(age_null_true)
# ASCII parentheses are required here; the full-width "（ ）" pair is a syntax
# error in Python.
print(age_null_count)

# Missing values -- removal. Summing a column that still contains NaN yields
# NaN, so the missing ages are filtered out before averaging.
# `~age_is_null` inverts the mask built earlier and keeps only the rows that
# do have an age.
good_ages = titanic_survival["Age"][~age_is_null]
correct_mean_age = sum(good_ages) / len(good_ages)  # mean of the known ages
print(correct_mean_age)
# Question 1: mean fare for each of the three passenger classes
passenger_classes = [1, 2, 3]
fare_by_class = {}  # maps class number -> mean fare
for this_class in passenger_classes:
    # Rows of the passengers who travelled in this class.
    pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]
    pclass_fares = pclass_rows["Fare"]
    # The mean is taken from `pclass_fares` (the name defined on the previous
    # line) and stored under the `this_class` loop variable.
    fare_for_class = pclass_fares.mean()
    fare_by_class[this_class] = fare_for_class
print(fare_by_class)
# Recorded output: 1: 84.154     2: 20.662    3: 13.675
# Shortcut for this kind of per-group statistic: pivot_table groups the rows
# by `index` (the passenger class) and aggregates the `values` column with
# `aggfunc`; here that is the mean of "Survived" for each class.
# The aggregation is named by string ("mean") instead of passing np.mean,
# which recent pandas releases flag with a FutureWarning.
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc="mean")
# ASCII parentheses are required; the full-width "（ ）" pair is a syntax error.
print(passenger_survival)
# NOTE(review): the figures previously recorded under this table
# (38.233441 / 29.877630 / 25.140620) are on the scale of ages, not of the
# mean of a survival flag (presumably 0/1), so they are listed under the age
# table below instead -- re-run to confirm.
# Question 2: mean age for each of the three classes
passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age", aggfunc="mean")
print(passenger_age)
# Recorded output (presumably for this table -- see the note above):
# 1  38.233441   2  29.877630   3  25.140620
# Question 3: how the port of embarkation relates to fares and survivors.
# Rows are grouped by "Embarked"; both "Fare" and "Survived" are summed per
# port. The aggregation is named by string ("sum") instead of passing np.sum,
# which recent pandas releases flag with a FutureWarning.
port_starts = titanic_survival.pivot_table(index="Embarked", values=["Fare", "Survived"], aggfunc="sum")
print(port_starts)
# Recorded output
# Embarked       Fare           Survived
#   C            10072.26        93
#   Q            1022.25         30
#   S            17439.39        217
# Question 4: dropping missing values in one call with .dropna()
# axis=0 drops rows; `subset` limits the check to the "Age" and "Sex" columns.
new_titanic_survival = titanic_survival.dropna(subset=["Age", "Sex"], axis=0)
# Question 5: reading one cell -- give .loc the row label and the column name
raw_index_83_age = titanic_survival.loc[83, "Age"]  # row 83, column "Age"
# NOTE(review): the variable name mentions 1000 but the lookup reads row 83.
row_index_1000_pclass = titanic_survival.loc[83, "Pclass"]  # row 83, "Pclass"
print(raw_index_83_age)
print(row_index_1000_pclass)
# Question 6: sort by age, then renumber the index
# Sorting in descending order keeps each row's original index label.
new_titanic_survival = titanic_survival.sort_values(by="Age", ascending=False)
print(new_titanic_survival)
# drop=True discards the old labels and numbers the rows afresh.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print(titanic_reindexed.loc[0:10])

8.自定义函数
当pandas内置的函数满足不了需求时，可以编写代码，拼接起来，即自定义函数。

#Q1：This function returns the hundredth item from a series
def hundredth_row(column):
    """Return the hundredth item (position 99) of a Series.

    The lookup is positional (``.iloc``), so it returns the hundredth element
    regardless of the index labels. A label-based ``.loc[99]`` only agrees
    with that when the index is the default 0..n-1 range.

    Args:
        column: A pandas Series with at least 100 elements.

    Returns:
        The element at position 99.

    Raises:
        IndexError: If the Series has fewer than 100 elements.
    """
    return column.iloc[99]
# Return the hundredth item from each column: DataFrame.apply calls the
# function once per column and collects the results.
# The result gets its own name so that it does not overwrite the
# `hundredth_row` function, which would make this line fail when run again.
hundredth_values = titanic_survival.apply(hundredth_row)
print(hundredth_values)
#Q2:每列的缺失值个数
def not_null_count(column):
    """Return how many entries of ``column`` are missing.

    Despite the name, this counts the null values, not the non-null ones:
    the column is filtered down to the entries flagged by ``pd.isnull`` and
    the length of that selection is returned.

    Args:
        column: A pandas Series.

    Returns:
        The number of missing entries, as an int.
    """
    return len(column[pd.isnull(column)])
# Run the counter once per column (axis=0 is the column-wise default).
column_null_count = titanic_survival.apply(not_null_count, axis=0)
print(column_null_count)
# The output is the number of missing values in each column.

9.Series结构
pandas里的主要数据结构就是DataFrame结构（相当于矩阵），它是由一系列Series组成的（相当于一行或者一列）

import pandas as pd
# NOTE(review): file name corrected from 'fandango_score_compaiison.csv',
# which looks like a misspelling of "comparison" -- confirm against the name
# of the file on disk.
fandango = pd.read_csv('fandango_score_comparison.csv')
# Selecting a single column of a DataFrame yields a Series.
series_film = fandango['FILM']
print(type(series_film))
print(series_film[0:5])  # positional slice: the first five entries