df 根据列排序后(sort_values),index 索引不会改变;需要用 df = df.reset_index(drop=True)(或 df.index = range(len(df)))重建索引,之后才能用 df["column_name"][0] 取到排序后的第一行;也可以直接用 df["column_name"].iloc[0] 按位置取值。
将某列值为通过的index取出,然后取该值对应的样本
# Select the rows whose check_flag equals '通过', then keep the columns at
# positions 2 and 3.
# The original passed index *labels* to the positional `iloc`, which only
# works when the index happens to be the default 0..n-1 range; selecting the
# rows with the boolean mask is correct for any index.
passed_mask = data['check_flag'] == '通过'
index = data.index[passed_mask].tolist()  # index labels of the matching rows
data_sucess = data.loc[passed_mask].iloc[:, 2:4]
data_sucess.head()
将某列值为通过的,编码为新列label,值为1
# Write 1 into the "label" column for every row whose check_flag is '通过';
# rows that do not match are left untouched by this assignment.
passed_rows = data['check_flag'] == '通过'
data.loc[passed_rows, "label"] = 1
炸裂函数(explode):把某一列中列表类型的单元格按元素展开成多行
# Demo frame: column 'A' mixes lists, a scalar string and an empty list;
# column 'B' is the scalar 1 broadcast to every row.
df = pd.DataFrame(
    {
        'A': [[1, 2, 3], 'foo', [], [3, 4]],
        'B': 1,
    }
)
# explode() returns a new frame with one row per list element of 'A'
# (the empty list becomes a single NaN row); df itself is not modified.
df.explode(column='A')
1.df中的数据以分隔符拆分
# Split the value in the first column of every row on ":" and collect the
# pieces, producing one list of fields per row.
# Iterating the column directly replaces the per-row `data.iloc[i][0]`
# lookup, which is slow and relies on the deprecated positional fallback of
# `Series[0]`.
# NOTE(review): this reads the first column by position — confirm that is the
# column holding the ":"-separated strings.
tmp = [cell.split(":") for cell in data.iloc[:, 0]]
tmp
2.tmp(list类型)变为df,合并df,加上df列名
# Turn the list of split fields into a frame: one row per entry of tmp,
# one column per field.
data_output_split = pd.DataFrame(tmp)
# Put the two frames side by side. concat aligns rows on the index.
# NOTE(review): presumably data_y carries the default 0..n-1 index so the rows
# line up — confirm.
df = pd.concat(objs=[data_y, data_output_split], axis=1)
df.columns = [
    'test_y',
    'group',
    'sort_y',
    'ipo',
    'normal',
    'none',
]
3.查询df某一列中含有某个值的样本
# Rows whose 'group' value equals the string '2' (not the integer 2).
df.loc[df['group'] == '2']
4.pandas读取csv,读取excel
import pandas as pd
# Read the Excel workbook into `data` (first sheet by default).
data = pd.read_excel('./data/province.xlsx')
# Read the CSV file into `data_train`.
data_train = pd.read_csv("./data/new_fivestar_rank_eval_neg_merge.csv")
5.df某一列数据加前后缀
# Create column 'ipo_test' by prepending the prefix '1:' to each 'ipo' value.
# NOTE(review): assumes 'ipo' holds strings — confirm; a suffix works the same
# way with the operands swapped.
data_train['ipo_test'] = '1:' + data_train['ipo']
6.df剔除某一列的缺失值
# Remove, in place, every row whose 'ipo_reg_compute' value is missing.
data.dropna(subset=['ipo_reg_compute'],inplace=True)
# Delete the rows whose 'table' value is 'sc'.
# drop() returns a new frame, so the original statement (which discarded the
# result) removed nothing; keeping the non-matching rows with a boolean mask
# and assigning back actually deletes them, and cannot remove unrelated rows
# that merely share an index label with a matching row.
df = df[df['table'] != 'sc']
7.df转换某一列的数据类型
# Replace the column with an integer-typed copy of itself.
# NOTE(review): astype("int") raises if the column still contains NaN —
# presumably missing values were dropped beforehand; confirm.
data_train['ipo_reg_compute']=data_train['ipo_reg_compute'].astype("int")
8.df查看是否有缺失值,简单描述统计
# Inspect a single column: info() prints its dtype and non-null count (so
# missing values show up as a count below the row total), describe() returns
# the summary statistics.
merge_reg_compute = data_train["merge_reg_compute"]
merge_reg_compute.info()
merge_reg_compute.describe()
9.df看某一列各个类别的计数
# Count how many rows fall into each distinct 'ipo_industry' value, most
# frequent first; missing values are excluded by default.
data_train['ipo_industry'].value_counts()
10.df某一列分类变量变成哑变量
# One-hot encode 'ipo_province': the result has one indicator column per
# distinct value and one row per row of data_train.
ipo_province_onehot = pd.get_dummies(data_train['ipo_province'])
11.df某一列具体有几个类别,保存到list
# Distinct 'ipo_industry' values in order of first appearance.
# NOTE: unique() returns an array, not a Python list; append .tolist() if an
# actual list is required.
industry_cate_ipo = data_train['ipo_industry'].unique()
12.ROC曲线
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
def plot_roc(labels, predict_prob):
    """Draw the ROC curve for a set of predictions on the current figure.

    Args:
        labels: True labels, forwarded to ``roc_curve`` as its first argument.
        predict_prob: Predicted scores or probabilities, forwarded to
            ``roc_curve`` as its second argument.

    The curve is drawn in blue with the AUC (4 decimals) in the legend, and a
    red dashed diagonal from (0, 0) to (1, 1) is added for reference. The
    figure is not displayed here; call ``plt.show()`` afterwards.
    """
    false_positive_rate, true_positive_rate, _ = roc_curve(labels, predict_prob)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.title('ROC')
    plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.4f' % roc_auc)
    plt.legend(loc='lower right')
    # Diagonal reference line through (0, 0) and (1, 1).
    plt.plot([0, 1], [0, 1], 'r--')
    plt.ylabel('TPR')
    plt.xlabel('FPR')


# Example usage: plot the curve for the test labels and predictions, then
# display the figure.
plot_roc(y_test, preds)
plt.show()
13.原df加新的某一列数据
# Build one "<industry_newcate_ipo>-<industry_newcate_normal>" string per row
# and store it as a new column.
# Iterating the two columns together replaces the row-by-row
# `data_train.iloc[i][...]` lookups: those are extremely slow on large frames
# (hence the old progress print every 100000 rows, no longer needed) and
# materialising each row as a Series can upcast values, e.g. turning an
# integer category into "3.0".
tmpnew = [
    str(ipo_cate) + "-" + str(normal_cate)
    for ipo_cate, normal_cate in zip(
        data_train["industry_newcate_ipo"],
        data_train["industry_newcate_normal"],
    )
]
print(len(tmpnew))
data_train['industry_newcate_ipo-normal'] = tmpnew
data_train.head()
14.array按某一列值 实现整体排序
# Reorder the rows of the 2-D array by ascending value of its last column,
# comparing that column as integers.
# `np.int` was removed in NumPy 1.24 (it was only an alias of the builtin
# `int`), so the builtin is used; `[:, -1]` selects the last column directly.
last_column_as_int = data_new_array[:, -1].astype(int)
data_new_array_sort = data_new_array[np.argsort(last_column_as_int), :]