Python 数据清洗常用示例
数据展示
![在这里插入图片描述](https://img-blog.csdnimg.cn/5fee39e5b1d14e9889a560006783a243.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NDA1NTY2NA==,size_16,color_FFFFFF,t_70)
练习代码
import pandas as pd
import numpy as np
import os
from datetime import datetime
# Load the raw score sheet, reading the .xlsx file through the openpyxl engine.
source_path = r"C:\Users\10746\Desktop\aa\111.xlsx"
data = pd.read_excel(source_path, engine='openpyxl')

# Lift pandas' display limits so printed frames show every row and column.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)
print(data.head())

# Keep only the first row for each (班级, 姓名) pair.
data = data.drop_duplicates(subset=['班级', '姓名'], keep='first')
print(data.describe())

# Scores above 100 or below 0 are replaced with NaN so that they show up
# in the per-column missing-value counts printed below.
score_cols = ['语文', '数学', '英语']
scores = data[score_cols]
data[(scores > 100) | (scores < 0)] = np.nan
print(data.isnull().sum())
# Fill each missing score with the mean score of the row's (班级, 年级) group.
# transform('mean') returns the group mean aligned to every original row, so
# no temporary MultiIndex is needed to line the means up with the data.
score_cols = ['语文', '数学', '英语']
group_means = data.groupby(['班级', '年级'])[score_cols].transform('mean')
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form is unreliable on recent pandas.
data[score_cols] = data[score_cols].fillna(group_means)
# Renumber the rows 0..n-1, discarding the old index labels.
data.reset_index(drop=True, inplace=True)
# Order rows by 年级 ascending, then 班级 descending within each 年级.
data.sort_values(by=['年级', '班级'], ascending=[True, False], inplace=True)
# Positional sequence number (0-based) in the sorted order. len(data) counts
# rows, so a missing value in any column cannot cause a length mismatch.
data['rank'] = list(range(len(data)))
# Truncate 语文 scores to integers. NOTE: astype(int) raises if any value is
# still NaN at this point.
data['语文'] = data['语文'].astype(int)
# Date-conversion walkthrough on the 日期 column; every step is vectorized.
# 1. datetime -> 'YYYY/MM/DD' string.
data['rq'] = data['日期'].dt.strftime('%Y/%m/%d')
# 2. string -> datetime; the format must match the string produced in step 1.
data['rq'] = pd.to_datetime(data['rq'], format='%Y/%m/%d')
# 3. datetime -> whole seconds since 1970-01-01. Computing the difference
#    directly keeps the result independent of the machine's local timezone.
data['rq'] = (data['rq'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
# 4. seconds -> datetime.
data['rq'] = pd.to_datetime(data['rq'], unit='s')
# 5. Shift every date forward by one day.
data['rq'] = data['rq'] + pd.Timedelta(days=1)

# Year of the original date.
data['y'] = data['日期'].dt.year
# Year of the shifted date as a label, e.g. '2021年'.
data['rrq'] = data['rq'].dt.strftime('%Y') + '年'
# Write the cleaned table to Excel, removing any previous export first.
output_path = r'C:\Users\10746\Desktop\aa\1.xlsx'
if os.path.exists(output_path):
    os.remove(output_path)
data.to_excel(output_path)
结果展示
![在这里插入图片描述](https://img-blog.csdnimg.cn/4f65ea9f76db4c95b17a86c95773d8ef.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NDA1NTY2NA==,size_16,color_FFFFFF,t_70)