python--data.dropna

读取csv文件:data = pd.read_csv(r'G:\IOtest_1.csv')

1、删除全为空值的行或列

      data=data.dropna(axis=0,how='all')   #行

      data=data.dropna(axis=1,how='all')   #列

2、删除含有空值的行或列

     data=data.dropna(axis=0,how='any')   #行

     data=data.dropna(axis=1,how='any')   #列

  • 1
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
# -*- coding: utf-8 -*-
import pickle
import time

import pandas as pd


def current_time():
    """Return the current local time formatted as ``'%Y-%m-%d %X'``."""
    return time.strftime('%Y-%m-%d %X', time.localtime())


class DataPre:
    """Preprocess a train/test pair of CSV files and pickle the result.

    The constructor reads both CSV files, separates the ``TARGET`` column
    from the training features and removes unusable columns and rows.
    ``propressData()`` then runs the pipeline (column typing, missing-value
    filling, scaling, label encoding) and pickles the result to
    ``self.fname``; ``load_data()`` restores a previously pickled result
    into the instance.
    """

    def __init__(self, train_data, test_data, fname='D:\\data\\processed_data'):
        """Load and clean the raw train/test tables.

        :param train_data: path or file-like object of the training CSV; it
            must contain the columns ``SK_ID_CURR`` (used as index),
            ``TARGET``, ``Unnamed: 0``, ``previousSCOFR`` and
            ``previousHomewares``.
        :param test_data: path or file-like object of the test CSV with the
            same columns except ``TARGET``.
        :param fname: file the processed data is pickled to / loaded from.
            Defaults to the path that used to be hard-coded.
        :raises KeyError: if one of the required columns is missing.
        """
        train = pd.read_csv(train_data, index_col=['SK_ID_CURR'])
        test = pd.read_csv(test_data, index_col=['SK_ID_CURR'])

        # Split the target off before any feature cleaning.
        labels = train.pop('TARGET')

        # 'Unnamed: 0' is presumably a leftover index column from an earlier
        # to_csv() call (TODO confirm); the original author notes that the
        # two 'previous*' features contain only NaN.
        unused = ['Unnamed: 0', 'previousSCOFR', 'previousHomewares']
        train = train.drop(columns=unused)
        test = test.drop(columns=unused)

        # Drop training rows that are entirely NaN, and drop the matching
        # labels with them so features and labels stay aligned. A positional
        # mask is used so duplicate index values cannot cause mismatches.
        keep = train.notna().any(axis=1).to_numpy()
        train = train.loc[keep]
        labels = labels.loc[keep]
        train = train.dropna(axis=1, how='all')

        test = test.dropna(axis=0, how='all')
        test = test.dropna(axis=1, how='all')

        self.train_datas = train
        self.test_datas = test
        self.trainlables = labels

        print(len(self.train_datas.dtypes))
        print(len(self.test_datas.dtypes))
        self.fname = fname

    def encode(self, features):
        """Label-encode every column in ``features`` in both tables.

        The encoder is fitted on the union of the train and test values so
        that a category appearing only in the test set does not raise
        ``ValueError`` during ``transform``. When the test set has no unseen
        categories the resulting codes equal those of a train-only fit.

        :param features: iterable of column names to encode.
        """
        # Imported here because this is the only method that needs
        # scikit-learn; the rest of the class works without it installed.
        from sklearn import preprocessing

        print("----- Begin run encode at %s -------" % current_time())
        for col in features:
            train_col = self.train_datas[col]
            test_col = self.test_datas[col]
            encoder = preprocessing.LabelEncoder()
            encoder.fit(pd.concat([train_col, test_col]))
            self.train_datas[col] = encoder.transform(train_col)
            self.test_datas[col] = encoder.transform(test_col)
        print("----- end run encode at %s -------" % current_time())

    def getColType(self):
        """Restrict both tables to their shared columns and classify them.

        Both ``self.train_datas`` and ``self.test_datas`` are replaced by
        copies containing only the columns present in both, in the column
        order of the training table (deterministic, unlike a set
        intersection).

        :return: ``(objectLists, numberLists)`` - names of the non-numeric
            columns and of the numeric columns, judged by the dtype in the
            training table.
        """
        test_cols = set(self.test_datas.columns)
        cols = [col for col in self.train_datas.columns if col in test_cols]

        # .copy() makes the tables independent frames, so later column
        # assignments do not act on a slice of the old data.
        self.train_datas = self.train_datas[cols].copy()
        self.test_datas = self.test_datas[cols].copy()
        print(len(self.train_datas.columns))
        print(len(cols))

        objectLists = []
        numberLists = []
        for col in cols:
            # is_numeric_dtype also classifies the dedicated string dtype as
            # non-numeric, which a plain `dtype == 'object'` test would miss.
            if pd.api.types.is_numeric_dtype(self.train_datas[col].dtype):
                numberLists.append(col)
            else:
                objectLists.append(col)
        return objectLists, numberLists

    def writeFile(self, dictdata, filename):
        """Write ``str(dictdata)`` to ``filename``, overwriting the file."""
        with open(filename, 'w') as files:
            files.write(str(dictdata))

    def fillna(self, numeric_feature, string_feature):
        """Fill missing values in both tables.

        String columns get the literal ``'miss'``; numeric columns get the
        mean of the corresponding *training* column (also used for the test
        table).

        :param numeric_feature: names of the numeric columns.
        :param string_feature: names of the string columns.
        """
        # Assign the filled column back instead of calling
        # df[col].fillna(..., inplace=True): the chained in-place form is
        # deprecated and has no effect under pandas Copy-on-Write.
        for col in string_feature:
            self.train_datas[col] = self.train_datas[col].fillna('miss')
            self.test_datas[col] = self.test_datas[col].fillna('miss')
        for col in numeric_feature:
            value = self.train_datas[col].mean()
            self.train_datas[col] = self.train_datas[col].fillna(value)
            self.test_datas[col] = self.test_datas[col].fillna(value)

    def scaled(self, types):
        """Standardize the given columns with training-set statistics.

        Each column becomes ``(x - mean) / std`` where mean and std come
        from the training column after its NaN values were replaced by 0.
        Columns whose std is 0 or undefined (e.g. a single row) are left
        untouched.

        :param types: list of numeric column names to scale.
        """
        print("----- Begin run scaled at %s -------" % current_time())
        train_part = self.train_datas[types].fillna(0)
        test_part = self.test_datas[types].fillna(0)
        for col in types:
            mean = train_part[col].mean()
            std = train_part[col].std()
            # Skip constant columns and a NaN std, which would otherwise
            # turn the whole column into NaN.
            if pd.notna(std) and std != 0:
                self.train_datas[col] = (train_part[col] - mean) / std
                self.test_datas[col] = (test_part[col] - mean) / std
        print("----- End run scaled at %s -------" % current_time())

    def save_data(self):
        """Pickle ``[train_datas, test_datas, trainlables]`` to ``self.fname``."""
        print("----- Begin run save_data at %s -------" % current_time())
        with open(self.fname, 'wb') as file:
            pickle.dump(
                [self.train_datas, self.test_datas, self.trainlables], file)
        print("----- End run save_data at %s -------" % current_time())

    def load_data(self):
        """Restore train table, test table and labels from ``self.fname``."""
        print("----- Begin run _load_data at %s -------" % current_time())
        # NOTE: pickle.load executes arbitrary code from the file; only load
        # files this pipeline wrote itself, never untrusted input.
        with open(self.fname, 'rb') as file:
            self.train_datas, self.test_datas, self.trainlables = \
                pickle.load(file)
        print("----- End run _load_data at %s -------" % current_time())

    def propressData(self):
        """Run the whole pipeline: type columns, fill, scale, encode, save."""
        # Use this instance, not a module-level global named `datapre`.
        TobjectList, TnnumberList = self.getColType()
        self.fillna(TnnumberList, TobjectList)
        self.scaled(TnnumberList)
        self.encode(TobjectList)
        self.save_data()


if __name__ == '__main__':
    datapre = DataPre("D:\\data\\train_data.csv", "D:\\data\\test_data.csv")
    datapre.propressData()

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值