python之Pandas

最新推荐文章于 2024-05-17 14:53:34 发布

阿磊的MD和CFD记录簿

最新推荐文章于 2024-05-17 14:53:34 发布

阅读量572

点赞数 1

分类专栏： Python杂记 文章标签： python 数据分析 pandas

本文链接：https://blog.csdn.net/weixin_45896923/article/details/122123316

版权

Python杂记专栏收录该内容

7 篇文章 2 订阅

订阅专栏

python之Pandas

在这里插入图片描述

# -*- coding: utf-8 -*-
"""
Created on Wed Dec 22 20:40:41 2021

@author: 阿磊很努力

summary:Pandas study
"""
import pandas as pd

#=======================Read File=======================#
## Read a CSV file into a DataFrame.
df = pd.read_csv(r'C:\Users\Lenlovo\Desktop\pandas\pokemon_data.csv')
b = df.head(3)   # first three rows
c = df.tail(3)   # last three rows
print(b)
print(c)

## Read an Excel workbook.
df_xlsx = pd.read_excel('pokemon_data.xlsx')
e = df_xlsx.head(3)
print(e)

## Read a tab-separated text file; read_csv handles it once the
## separator is set to a tab.
df_txt = pd.read_csv('pokemon_data.txt', sep='\t')
print(df_txt.head(3))

#=================Reading Data in Pandas=================#
## Read the headers: shows which columns the frame has.
print(df.columns)

## Read a single column.
print(df['Name'])
# Several columns at once would be selected with a list of labels:
#   df[['Name', 'Type 1', 'HP']]
# Attribute access is an alternative spelling for the same column.
print(df.Name)

# First five entries of the Name column (positions 0-4; position 5 is
# excluded).
print(df['Name'].head(5))

## Read rows.
print(df.head(3))

print(df.iloc[1])       # one row by position: the 2nd row
print(df.iloc[1:5])     # positions 1-4, i.e. the 2nd through 5th rows

## Read one specific cell.
print(df.iloc[3, 1])    # 4th row, 2nd column


#====================Iterate over every row====================#
# iterrows() yields (index label, row) pairs; print the label and then
# the row, each starting on its own line.
for idx, record in df.iterrows():
    print(idx, record, sep='\n')

## Select rows by condition: a boolean mask keeps the rows whose
## 'Type 1' equals 'Fire'. As a bare expression the result is only shown
## in an interactive console; run as a script it is discarded.
df[df['Type 1'] == 'Fire']

#============Sorting/Describing Data==========#
# The statements below are bare expressions: they display a result in an
# interactive console and are discarded when run as a script. None of
# them modifies df.
df.describe()                                # summary statistics such as mean and standard deviation
df.sort_values(by='Name')                    # sort by Name, ascending by default
df.sort_values(by='Name', ascending=False)   # same sort, descending
# Multi-key sort: 'Type 1' ascending, then 'Speed' descending within
# each type.
df.sort_values(by=['Type 1', 'Speed'], ascending=[True, False])

#====================Making Changes to Data====================#
## Add a 'Total' column: the sum of the six stat columns, spelled out by
## name.
df['Total'] = (
    df['HP'] + df['Attack'] + df['Defense']
    + df['Sp. Atk'] + df['Sp. Def'] + df['Speed']
)
df.head(5)                 # bare expression: only displayed interactively

## Drop a specific column again.
df = df.drop(columns=['Total'])
df.head(5)

## Shorter way to add the same column: sum the columns at positions 4-9
## across each row (axis=1).
# NOTE(review): assumes positions 4-9 are exactly the six stat columns
# named above - confirm against the CSV layout before reusing.
df['Total'] = df.iloc[:, 4:10].sum(axis=1)

## Move 'Total' so it sits right after the fourth column.
cols = list(df.columns)
# 'Total' was just appended, so it is the last label. Slice the remaining
# labels with 4:-1 instead of a hard-coded end position so that no column
# is dropped when the frame has a different number of columns.
df = df[cols[0:4] + [cols[-1]] + cols[4:-1]]

#====================Saving our Data (Exporting into Desired Format)====================#
## Write the frame out in three formats. index=False leaves the row
## index out of every output file.
df.to_csv(
    r'C:\Users\Lenlovo\Desktop\pandas\modified.csv',
    index=False,
)
df.to_excel(
    r'C:\Users\Lenlovo\Desktop\pandas\modified.xlsx',
    index=False,
)
# Tab-separated text is produced by to_csv with a tab separator.
df.to_csv(
    r'C:\Users\Lenlovo\Desktop\pandas\modified.txt',
    sep='\t',
    index=False,
)

#====================Filtering Data====================#
# Bare expressions in this section only display a result in an
# interactive console; run as a script they are evaluated and discarded.
df.loc[df['Type 1'] == 'Grass']                                   # rows whose Type 1 is Grass
df.loc[(df['Type 1'] == 'Grass') & (df['Type 2'] == 'Poison')]    # two conditions combined with &
new_df = df.loc[(df['Type 1'] == 'Grass') & (df['Type 2'] == 'Poison') & (df['HP'] > 70)]
# index=False keeps the row labels out of the file. In memory, new_df
# still carries the labels of the rows it was filtered from, so they are
# renumbered below.
new_df.to_csv(r'C:\Users\Lenlovo\Desktop\pandas\Filtered.csv', index=False)

## Renumber the index of the filtered frame (three variants).
new_df = new_df.reset_index()               # default: the old index is kept as a new 'index' column
new_df = new_df.reset_index(drop=True)      # drop=True: the old index is discarded
# In-place variant: with inplace=True the method returns None, so its
# result must NOT be assigned back (doing so would replace new_df with
# None).
new_df.reset_index(drop=True, inplace=True)

## Filter on a substring.
df.loc[df['Name'].str.contains('Mega')]     # names containing 'Mega'
df.loc[~df['Name'].str.contains('Mega')]    # names NOT containing 'Mega'

## Regex filters.
import re
# regex=True treats the pattern as a regular expression; re.I makes the
# match case-insensitive.
df.loc[df['Type 1'].str.contains('fire|grass', flags=re.I, regex=True)]
# '^' anchors at the start: names beginning with 'pi' followed by a letter.
df.loc[df['Name'].str.contains(r'^pi[a-z]', flags=re.I, regex=True)]

#====================Conditional Changes====================#
## Overwrite values in the rows that satisfy a condition.
df.loc[df['Type 1'] == 'Fire', 'Type 1'] = 'Flamer'    # rename Type 1 'Fire' to 'Flamer'
# The rename above leaves no 'Fire' rows, so the renamed label has to be
# matched here; filtering on 'Fire' would select nothing. The flag is
# set to the boolean True rather than the string 'True'.
# NOTE(review): presumably 'Legendary' is a boolean column - confirm.
df.loc[df['Type 1'] == 'Flamer', 'Legendary'] = True
# The next assignment writes text placeholders into 'Generation' and
# 'Legendary'. Cast both columns to object first so the assignment does
# not depend on implicit dtype upcasting, which newer pandas versions
# warn about.
df = df.astype({'Generation': object, 'Legendary': object})
# Rows with Total > 500: set 'Generation' to 'Test 1' and 'Legendary' to
# 'Test 2' in one assignment.
df.loc[df['Total'] > 500, ['Generation', 'Legendary']] = ['Test 1', 'Test 2']

#====================Aggregate Statistics====================#
# Bare expressions in this section only display a result in an
# interactive console; run as a script they are evaluated and discarded.

## Average per group, then sort.
# numeric_only=True restricts the aggregation to numeric columns. The
# frame also holds text columns (e.g. 'Name'), which cannot be averaged,
# and recent pandas versions raise on them instead of skipping them.
df.groupby(['Type 1']).mean(numeric_only=True)                                       # mean of each numeric column per Type 1
df.groupby(['Type 1']).mean(numeric_only=True).sort_values('HP', ascending=False)    # same, ordered by HP descending
df.groupby(['Type 1']).sum(numeric_only=True)

# Non-null count of every column per Type 1 group: one number per
# column, so the output is largely repetitive.
df.groupby(['Type 1']).count()

## Count rows per group through a helper column.
df['count'] = 1                                      # append a column that is 1 in every row
# Select the helper column before counting so only that column is
# aggregated.
df.groupby(['Type 1'])['count'].count()              # rows per Type 1
df.groupby(['Type 1', 'Type 2'])['count'].count()    # rows per (Type 1, Type 2) combination
# groupby(...).size() yields the same numbers without needing the helper
# column.

#====================Working with large amounts of data====================#
## Handling a file that is too large to load comfortably: read it in chunks.
df = pd.read_csv('modified.csv')             # loads the whole file at once

# With chunksize, read_csv yields DataFrames of at most that many rows.
# The loop variable is deliberately not called df so the full frame
# loaded above is not overwritten by the last chunk.
for chunk in pd.read_csv('modified.csv', chunksize=5):
    print('chunk DF')
    print(chunk)


# Aggregate each chunk on its own and stitch the partial results together
# once at the end. Collecting into a list avoids both re-copying the
# accumulated frame on every iteration and seeding the result with an
# empty frame (which would add an all-NaN 'Type 1' column).
chunk_counts = []
for chunk in pd.read_csv('modified.csv', chunksize=5):
    results = chunk.groupby(['Type 1']).count()
    chunk_counts.append(results)

# new_df holds one row per (chunk, Type 1) pair, so a type that occurs in
# several chunks appears several times;
# new_df.groupby(level=0).sum() would fold them into whole-file counts.
new_df = pd.concat(chunk_counts)