python pandas中 inplace 参数理解
![](https://img-blog.csdnimg.cn/20200914194137264.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzQ1NTMxNTk0,size_16,color_FFFFFF,t_70#pic_center)
join 和 merge
就是SQL 的那种感觉
import pandas as pd

# Two small frames whose indexes overlap only at 'c' — used below to
# illustrate index-based vs column-based combination.
df3 = pd.DataFrame(
    {'Red': [1, 3, 5], 'Green': [5, 0, 3]},
    index=list('abc'),
)
df4 = pd.DataFrame(
    {'Blue': [1, 9, 8], 'Yellow': [6, 6, 7]},
    index=list('cde'),
)
print(df3)
print(df4)
merge
# merge focuses on combining COLUMNS (SQL-style join).
# By default it joins on every column name the two frames share,
# with how='inner'.
# NOTE: the garbled doubled glyphs from the original extraction
# ('⼥女女', '学院⽼老老师', '⻢马克思原理理') are restored to proper characters.
df1 = pd.DataFrame(
    {
        '名字': list('ABCDE'),
        '性别': ['男', '女', '男', '男', '女'],
        '职称': ['副教授', '讲师', '助教', '教授', '助教'],
    },
    index=range(1001, 1006),
)
df1.columns.name = '学院老师'
df1.index.name = '编号'
print(df1)

df2 = pd.DataFrame(
    {
        '名字': list('ABDAX'),
        '课程': ['C++', '计算机导论', '汇编', '数据结构', '马克思原理'],
        '职称': ['副教授', '讲师', '教授', '副教授', '讲师'],
    },
    index=[1001, 1002, 1004, 1001, 3001],
)
df2.columns.name = '课程'
df2.index.name = '编号'
print(df2)

# Shared columns are 名字 and 职称, so only rows where BOTH match
# in the two frames are returned (inner join).
print(pd.merge(df1, df2))
4.3.1 多项式特征
# Keep the target plus the 4 predictors we will expand polynomially.
poly_features = app_train[['TARGET','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH']]
from sklearn.preprocessing import PolynomialFeatures  # polynomial expansion
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22.
# SimpleImputer is the drop-in replacement; alias it so the existing
# `Imputer(strategy='median')` call sites keep working unchanged.
from sklearn.impute import SimpleImputer as Imputer  # missing-value imputation
缺失值填充
# --- Missing-value imputation + degree-3 polynomial feature expansion ---
imputer = Imputer(strategy='median')  # fill NaNs with each column's median
poly_target = poly_features['TARGET']  # keep the label aside

# Drop the label before transforming so only the 4 predictors are expanded.
# Reassignment instead of inplace=True: poly_features is a slice of
# app_train, and mutating a slice in place raises SettingWithCopyWarning.
poly_features = poly_features.drop(columns=['TARGET'])

poly_features = imputer.fit_transform(poly_features)

# 4 input columns -> 35 output columns (bias + all terms up to degree 3).
poly_transformer = PolynomialFeatures(degree=3)
poly_transformer.fit(poly_features)
poly_features = poly_transformer.transform(poly_features)
poly_features.shape  # (n_rows, 35)

# Names of the generated features. The transformer was fit on the 4
# predictor columns only, so TARGET must NOT be listed here (the original
# passed 5 names for a 4-feature transformer, which is inconsistent).
poly_transformer.get_feature_names(input_features=['EXT_SOURCE_1',
                                                   'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH'])[:20]

# Wrap in a DataFrame so the expanded features are easy to inspect.
poly_features = pd.DataFrame(
    poly_features,
    columns=poly_transformer.get_feature_names(
        input_features=['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH']))
poly_features.head()

# SK_ID_CURR is the application id (e.g. 100002, 100003); re-attach it so
# the expanded features can be merged back onto the main table.
poly_features['SK_ID_CURR'] = app_train['SK_ID_CURR']
app_train_poly = app_train.merge(poly_features, on='SK_ID_CURR', how='left')
app_train_poly.head()
**4.3.2 创建特征**
# Work on a copy so the hand-crafted ("domain") features never touch app_train.
app_train_domain = app_train.copy()

# Domain-knowledge ratio features, each NEW_COLUMN = numerator / denominator:
#   CREDIT_INCOME_PERCENT  — loan amount (AMT_CREDIT) vs total income
#   ANNUITY_INCOME_PERCENT — loan annuity (AMT_ANNUITY) vs total income
#   CREDIT_TERM            — annuity vs loan amount (repayment schedule)
#   DAYS_EMPLOYED_PERCENT  — time employed (DAYS_EMPLOYED) vs age (DAYS_BIRTH)
_ratio_specs = [
    ('CREDIT_INCOME_PERCENT',  'AMT_CREDIT',    'AMT_INCOME_TOTAL'),
    ('ANNUITY_INCOME_PERCENT', 'AMT_ANNUITY',   'AMT_INCOME_TOTAL'),
    ('CREDIT_TERM',            'AMT_ANNUITY',   'AMT_CREDIT'),
    ('DAYS_EMPLOYED_PERCENT',  'DAYS_EMPLOYED', 'DAYS_BIRTH'),
]
for _new_col, _num_col, _den_col in _ratio_specs:
    app_train_domain[_new_col] = app_train_domain[_num_col] / app_train_domain[_den_col]
# Compare each engineered feature's distribution for the two target classes;
# the more the two KDE curves differ, the more useful the feature is likely
# to be for the model.
# NOTE: the loop body below had lost its indentation in the original text
# (a syntax error as written); the intended structure is restored here.
plt.figure(figsize = (12, 20))
for i, feature in enumerate(['CREDIT_INCOME_PERCENT', 'ANNUITY_INCOME_PERCENT', 'CREDIT_TERM', 'DAYS_EMPLOYED_PERCENT']):
    plt.subplot(4, 1, i + 1)
    # KDE for repaid (TARGET == 0) vs defaulted (TARGET == 1) applications.
    sns.kdeplot(app_train_domain.loc[app_train_domain['TARGET'] == 0, feature], label = 'target == 0')
    sns.kdeplot(app_train_domain.loc[app_train_domain['TARGET'] == 1, feature], label = 'target == 1')
    plt.title('Distribution of %s by Target Value' % feature)
    plt.xlabel('%s' % feature)
    plt.ylabel('Density')

plt.tight_layout(h_pad = 2.5)
plt.show()
# The stronger the difference between the two curves, the more the feature
# helps the model.
数据预处理:
# --- Preprocessing: split, impute, standardize (transformers fit on train only) ---
from sklearn.preprocessing import StandardScaler , MinMaxScaler
from sklearn.model_selection import train_test_split

label = app_train['TARGET']
train = app_train.drop(columns = ['TARGET'])  # drop the label, keep features

# 80/20 split. Imputer and scaler are fit on the training part only,
# so no statistics leak from the held-out set.
train, test, y_train, y_test = train_test_split(
    train, label, test_size = 0.2, random_state = 0)

features = list(train.columns)

imputer = Imputer(strategy='median')
std = StandardScaler()

# Median imputation (fit_transform on train == fit then transform).
train = imputer.fit_transform(train)
test = imputer.transform(test)

# Zero-mean / unit-variance scaling.
train = std.fit_transform(train)
test = std.transform(test)
test.shape
4.5 基础模型:逻辑回归和随机森林
率先使用逻辑回归(LR)建模,看一下准确度;不够好就上集成模型(随机森林),再不行就尝试 XGBoost、GBDT 或神经网络。
from sklearn.linear_model import LogisticRegression
# Strongly regularized baseline: C is the INVERSE regularization strength,
# so C=0.0001 applies heavy L2 regularization.
log_reg = LogisticRegression(C=0.0001)
# train / y_train come from the preprocessing split above.
log_reg.fit(train,y_train)
一般逻辑回归的这个参数 C 就在 0.1、0.01、0.0001 里面去选择。
预测结果,并返回评估指标
# Probability of the positive class (TARGET == 1) for each test row;
# column 1 of predict_proba's (n, 2) output.
predictions = log_reg.predict_proba(test)[:,1]
predictions[0]   # inspect the first prediction
predictions[:5]  # inspect the first five predictions
评估指标(ROC AUC):
# Score with ROC AUC (not raw accuracy) — robust to the class imbalance
# of this dataset.
from sklearn.metrics import roc_auc_score
test_auc = roc_auc_score(y_test,predictions)
test_auc
对比随机森林:
from sklearn.ensemble import RandomForestClassifier
# 1000 trees is typically enough; adding more barely changes the ensemble.
# n_jobs=-1 uses every available CPU core.
random_forest = RandomForestClassifier(n_estimators=1000,random_state=10,n_jobs=-1)
random_forest.fit(train,y_train)
# Positive-class probabilities, scored with the same ROC AUC metric as LR.
predictions = random_forest.predict_proba(test)[:,1]
test_auc = roc_auc_score(y_test,predictions)
test_auc