pandas常用
数据文件的读写
import pandas as pd
df1 = pd.read_excel(r'F:\a_houdashi_work\bbb_python学习\python高薪全能班\pandas\课程代码\stock_data.xlsx',"Sheet1")
#1,读取excel
df2 = pd.read_excel('F:/a_houdashi_work/bbb_python学习/python高薪全能班/pandas/课程代码/stock_data.xlsx',"Sheet1")
#2,读取处理异常值
def convert_price(t):
if t == 'n.a.':
return 250
return t
df3 = pd.read_excel(r'F:\a_houdashi_work\bbb_python学习\python高薪全能班\pandas\课程代码\stock_data.xlsx',
"Sheet1",converters={
'price':convert_price
})
df2.dtypes
df3.dtypes
#3,写出数据到excel
df2.to_excel(r'F:\a_houdashi_work\bbb_python学习\python高薪全能班\pandas\课程代码\w1.xlsx',
sheet_name='weather',index=False)
df3.to_excel(r'F:\a_houdashi_work\bbb_python学习\python高薪全能班\pandas\课程代码\w2.xlsx',
sheet_name='weather',index=False,startrow=3,startcol=4)
#多4,个写入
with pd.ExcelWriter(r'F:\a_houdashi_work\bbb_python学习\python高薪全能班\pandas\课程代码\combine.xlsx') as writer:
df1.to_excel(writer,sheet_name='df1',index=False)
df2.to_excel(writer,sheet_name='df2')
缺失数据的补偿方法
import pandas as pd #1,导入数据,处理字段为时间类型 df4 = pd.read_csv(r'F:\a_houdashi_work\bbb_python学习\python高薪全能班\pandas\课程代码\weather_data.csv') df4 = pd.read_csv(r'F:\a_houdashi_work\bbb_python学习\python高薪全能班\pandas\课程代码\weather_data.csv' ,parse_dates=['day']) df4.head(10) df4.info() type(df4.day) type(df4.day[0]) #2,手动补缺 df4.iloc[0,1] df4.iloc[0,0]=44 #3,使用fillna填充空值 new_df4 = df4.fillna(0) new_df4 = df4.fillna( { 'temperature' :250, 'event':'No Event', 'windspeed':'77' } ) #ffill填充 new_df4 = df4.fillna(method='ffill') new_df4 = df4.fillna(method='bfill') new_df4 = df4.fillna(method='bfill',axis='columns') new_df4 = df4.fillna(method='bfill',limit=2) #空值往下填充几个 #4,使用finterpolate填充空值,拟合过渡器 newdf = df4.interpolate() newdf = df4.interpolate(method="time") #5,删除法 dropna() new_df = df4.dropna() new_df = df4.dropna(thresh=3) #只要有3个字段有数据就留下
替换
import pandas as pd
import numpy as np
#1,导入数据,处理字段为时间类型
df5 = pd.read_csv(r'F:\a_houdashi_work\bbb_python学习\python高薪全能班\pandas\课程代码\weather_data.csv'
,parse_dates=['day'])
#1,单值替换
new_df5 = df5.replace(-99999,value=np.NAN)
new_df5.head(10)
df5.iloc[4,2]=-8888
new_df5 = df5.replace(to_replace=[-99999,-8888],value=np.NAN)
new_df5.head()
#2,替换专门列
new_df5 = df5.replace(
{
'event':'0'
},np.NAN
)
type(df5.event[0])
new_df5.head()
#3,使用map表替换
newdf = df5.replace(
{
-99999:np.nan,
'Sunny':'Sunny222'
}
)
newdf
#4,正则替换法
newdf = df5.replace(
{
'temperature':'[A-Za-z]'
},'',regex=True
)
newdf
newdf.iloc[3,1] ='abcd'
newdf2 = newdf.replace(
{
'temperature':'[A-Za-z]'
},'',regex=True
)
newdf2
#5,列表替换
df = pd.DataFrame(
{
'score':['a','b','c','d']
}
)
df = df.replace(['a','b','c','d'],[5,4,3,2])