一、任务2.1
1、任务内容
编写代码回答下面的问题:
-
字段x1至x8为用户相关的属性,为匿名处理字段。添加代码对这些数据字段的取值分析,哪些字段为数值类型?哪些字段为类别类型?
-
对于数值类型的字段,考虑绘制在标签分组下的箱线图。
-
从common_ts中提取小时,绘制每小时下标签分布的变化。
-
对udmap进行onehot,统计每个key对应的标签均值,绘制直方图。
2、具体实践
①查看x1到x8属性:
# Inspect the dtypes of the anonymised user attributes x1..x8 to tell
# numeric fields apart from categorical ones.
anon_fields = [f'x{i}' for i in range(1, 9)]
data_types = train_data[anon_fields].dtypes
print(data_types)
②数值类型字段在标签分组下的箱线图:
# One boxplot per numeric anonymised field, grouped by the binary label,
# to eyeball how each field's distribution differs between classes.
# (Fixed: the loop body was pasted without indentation — a SyntaxError.)
X = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8']
for field in X:
    sns.boxplot(x='target', y=field, data=train_data)
    plt.title(f'Boxplot of {field}')
    plt.show()
③提取小时并绘制每小时下标签分布的变化。
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Stacked bar chart: per-hour share of each label class.
# Hours.csv is assumed to hold one row per label (row 0 / row 1) and one
# column per hour 0..23 — TODO confirm against the file that produced it.
hours = pd.read_csv("Hours.csv")
plt.rcParams["font.sans-serif"] = ['SimHei']  # CJK-capable font for the labels
labels = [str(h) for h in range(24)]
# Replaces 24 hand-written iloc calls per row with a single slice.
target1 = hours.iloc[0, :24].to_numpy()
target0 = hours.iloc[1, :24].to_numpy()
sums = target0 + target1          # per-hour totals, for normalising to shares
width = 0.35
p1 = plt.bar(labels, target1 / sums, width, label='老用户')
p2 = plt.bar(labels, target0 / sums, width,
             bottom=target1 / sums, label='新用户')
plt.xlabel('小时数', fontsize=20)
plt.ylabel('占比', fontsize=20)
plt.title('用户访问应用记录小时分布图', fontsize=20)
plt.grid(axis='y', alpha=0.5, ls='--')
plt.legend(frameon=False, bbox_to_anchor=(1.01, 1))
# Label each bar segment at its centre
plt.bar_label(p1, label_type='center')
plt.bar_label(p2, label_type='center')
plt.show()
④对udmap进行onehot,统计每个key对应的标签均值,绘制直方图。
# Mean label per value of each one-hot udmap key (key1..key9), drawn as a
# bar chart per key.
# (Fixed: the loop body was pasted without indentation — a SyntaxError.)
for i in range(1, 10):
    key_mean_target = train_data.groupby('key' + str(i))['target'].mean()
    plt.bar(key_mean_target.index, key_mean_target)
    plt.show()
二、任务2.2
1、任务内容
编写代码回答下面的问题:
-
在上面模型中哪一个模型的macro F1效果最好,为什么这个模型效果最好?
-
使用树模型训练,然后对特征重要性进行可视化。
-
再加入3个模型训练,对比模型精度。
2、具体实践
# Import libraries
import pandas as pd
import numpy as np
# Load the competition's train/test splits (paths are relative to the notebook)
train_data = pd.read_csv('用户新增预测挑战赛公开数据/train.csv')
test_data = pd.read_csv('用户新增预测挑战赛公开数据/test.csv')
# Expand the raw udmap string into a fixed-width key vector (manual one-hot).
def udmap_onethot(d):
    """Parse a ``udmap`` cell into a length-9 vector of key1..key9 values.

    ``d`` is either the literal string ``'unknown'`` (→ all zeros) or a
    dict literal such as ``"{'key1': 3}"``; slot ``i-1`` receives the value
    of ``'key<i>'`` when present, 0 otherwise.
    """
    import ast  # local import keeps the snippet self-contained
    v = np.zeros(9)
    if d == 'unknown':
        return v
    # ast.literal_eval instead of eval(): only parses literals, so a
    # malicious/garbled udmap cell cannot execute arbitrary code.
    d = ast.literal_eval(d)
    for i in range(1, 10):
        if 'key' + str(i) in d:
            v[i-1] = d['key' + str(i)]
    return v
# Expand udmap for both splits and give the columns stable names key1..key9.
train_udmap_df = pd.DataFrame(np.vstack(train_data['udmap'].apply(udmap_onethot)))
test_udmap_df = pd.DataFrame(np.vstack(test_data['udmap'].apply(udmap_onethot)))
train_udmap_df.columns = ['key' + str(i) for i in range(1, 10)]
test_udmap_df.columns = ['key' + str(i) for i in range(1, 10)]
# Flag rows whose udmap is missing (the literal string 'unknown')
train_data['udmap_isunknown'] = (train_data['udmap'] == 'unknown').astype(int)
test_data['udmap_isunknown'] = (test_data['udmap'] == 'unknown').astype(int)
# Concatenate the expanded udmap features onto the original frames
train_data = pd.concat([train_data, train_udmap_df], axis=1)
test_data = pd.concat([test_data, test_udmap_df], axis=1)
# eid frequency encoding — test rows are looked up in TRAIN counts, so
# unseen eids become NaN in the test split.
train_data['eid_freq'] = train_data['eid'].map(train_data['eid'].value_counts())
test_data['eid_freq'] = test_data['eid'].map(train_data['eid'].value_counts())
# eid target-mean encoding.  NOTE(review): computed on the full training set
# without out-of-fold smoothing, so the cross-validation scores below are
# optimistic (target leakage) — confirm this is acceptable for the baseline.
train_data['eid_mean'] = train_data['eid'].map(train_data.groupby('eid')['target'].mean())
test_data['eid_mean'] = test_data['eid'].map(train_data.groupby('eid')['target'].mean())
# Parse the millisecond epoch timestamp and derive the hour of day
train_data['common_ts'] = pd.to_datetime(train_data['common_ts'], unit='ms')
test_data['common_ts'] = pd.to_datetime(test_data['common_ts'], unit='ms')
train_data['common_ts_hour'] = train_data['common_ts'].dt.hour
test_data['common_ts_hour'] = test_data['common_ts'].dt.hour
# Candidate models
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
# Cross-validation helper and evaluation metric
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

# The feature matrix and labels are identical for all four models, so build
# them once instead of repeating the drop() in every stanza.
features = train_data.drop(['udmap', 'common_ts', 'uuid', 'target'], axis=1)
labels = train_data['target']

# Cross-validate each candidate and print its classification report, in the
# same order as the original four copy-pasted stanzas.
for model in (
    SGDClassifier(max_iter=10),
    DecisionTreeClassifier(),
    MultinomialNB(),
    RandomForestClassifier(n_estimators=5),
):
    pred = cross_val_predict(model, features, labels)
    print(classification_report(labels, pred, digits=3))
三、任务2.3
1、任务内容
编写代码回答下面的问题:
-
加入特征之后模型的精度有什么变化?
-
思考并加入3个额外的特征,并观测模型精度的变化
2、具体实践
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score,recall_score,f1_score
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler,SMOTE
from collections import Counter
from sklearn.model_selection import learning_curve
# Reload the raw splits for task 2.3
train_data = pd.read_csv('用户新增预测挑战赛公开数据/train.csv')
test_data = pd.read_csv('用户新增预测挑战赛公开数据/test.csv')
X=train_data.drop(columns='target')
Y=train_data['target']
# Random oversampling of the minority class to balance the labels.
# NOTE(review): resampling BEFORE cross-validation lets duplicated rows land
# in both the train and validation folds, inflating the CV scores below —
# confirm whether this was intentional for the baseline.
ros=RandomOverSampler(random_state=0)
X_over,Y_over=ros.fit_resample(X,Y)
train_data=pd.concat([X_over,Y_over],axis=1)
# Persist the balanced training set for inspection
train_data.to_csv("change1.csv",index=None)
'''
def x7_change(x):#新增
if x==1:
return 1
else:
return 0
train_data['x7_feature']=train_data['x7'].apply(x7_change)
test_data['x7_feature']=train_data['x7'].apply(x7_change)
'''
'''
def udmap_onethot(d):
v = np.zeros(9)
if d == 'unknown':
return v
d = eval(d)
for i in range(1, 10):
if 'key' + str(i) in d:
v[i - 1] = d['key' + str(i)]
return v
train_udmap_df = pd.DataFrame(np.vstack(train_data['udmap'].apply(udmap_onethot)))
test_udmap_df = pd.DataFrame(np.vstack(test_data['udmap'].apply(udmap_onethot)))
train_udmap_df.columns = ['key' + str(i) for i in range(1, 10)]
test_udmap_df.columns = ['key' + str(i) for i in range(1, 10)]
train_data = pd.concat([train_data, train_udmap_df], axis=1)
test_data = pd.concat([test_data, test_udmap_df], axis=1)
'''
from sklearn import preprocessing

# Integer-encode the raw udmap strings.
# BUGFIX: the original fitted a SECOND LabelEncoder on the test split alone,
# so the same udmap string received different integer codes in train and
# test, making the feature meaningless at prediction time.  Fit ONE encoder
# on the union of both splits so the mapping is consistent everywhere.
enc = preprocessing.LabelEncoder()
enc = enc.fit(pd.concat([train_data['udmap'], test_data['udmap']], axis=0))
train_data['udmap_feature'] = enc.transform(train_data['udmap'])
test_data['udmap_feature'] = enc.transform(test_data['udmap'])
X = train_data.drop(['target', 'udmap'], axis=1)
Y = train_data['target']
# --- Disabled experiment: SMOTE oversampling as an alternative balancer.
'''
smote=SMOTE(random_state=0)
X_smote,Y_smote=smote.fit_resample(X,Y)
train_data=pd.concat([X_smote,Y_smote,train_data['udmap']],axis=1)
'''
# Parse the millisecond epoch timestamps
train_data['common_ts'] = pd.to_datetime(train_data['common_ts'], unit='ms')
test_data['common_ts'] = pd.to_datetime(test_data['common_ts'], unit='ms')
# eid frequency encoding — test rows are mapped with TRAIN counts
train_data['eid_freq'] = train_data['eid'].map(train_data['eid'].value_counts())
test_data['eid_freq'] = test_data['eid'].map(train_data['eid'].value_counts())
# eid target-mean encoding.  NOTE(review): computed on the full training set
# (no out-of-fold split), so CV scores downstream are optimistic.
train_data['eid_mean'] = train_data['eid'].map(train_data.groupby('eid')['target'].mean())
test_data['eid_mean'] = test_data['eid'].map(train_data.groupby('eid')['target'].mean())
# Flag rows whose udmap is missing ('unknown')
train_data['udmap_isunknown'] = (train_data['udmap'] == 'unknown').astype(int)
test_data['udmap_isunknown'] = (test_data['udmap'] == 'unknown').astype(int)
# Hour of day derived from the parsed timestamp
train_data['common_ts_hour'] = train_data['common_ts'].dt.hour
test_data['common_ts_hour'] = test_data['common_ts'].dt.hour
def hours_feature(x):
    """Binary daytime flag: 1 for hours 5..16 inclusive, else 0.

    The 5-16 window is presumably chosen from the hourly label-distribution
    plot in task 2.1 — TODO confirm.
    (Fixed: the function body was pasted without indentation — a SyntaxError.)
    """
    return 1 if 5 <= x <= 16 else 0
# Apply the daytime flag to both splits
train_data['hour_feature']=train_data['common_ts_hour'].apply(hours_feature)
test_data['hour_feature']=test_data['common_ts_hour'].apply(hours_feature)
# Day of month from the parsed timestamp
train_data['common_ts_day'] = train_data['common_ts'].dt.day
test_data['common_ts_day'] = test_data['common_ts'].dt.day
# --- Disabled experiment: weighted day-of-month indicator.
# NOTE(review): the disabled code builds the TEST feature from
# train_data['common_ts_day'] — looks like a copy-paste bug; fix before
# re-enabling.
'''
def day_feature(x):#新增
if x==6:
return 1
elif x==7:
return 0.2
else:
return 0
train_data['day_feature']=train_data['common_ts_day'].apply(day_feature)
test_data['day_feature']=train_data['common_ts_day'].apply(day_feature)
'''
# Frequency (count) and target-mean encodings for selected anonymous fields.
# All test-side lookups use statistics computed on the TRAIN split, so
# values unseen in training become NaN in test.  The dict preserves the
# original column-insertion order (x1_freq, x1_mean, x2_freq, ... x8_mean).
# True = add both a _freq and a _mean column; False = _freq only.
encode_spec = {'x1': True, 'x2': True, 'x3': False, 'x4': False,
               'x6': True, 'x7': True, 'x8': True}
for col, with_mean in encode_spec.items():
    counts = train_data[col].value_counts()
    train_data[col + '_freq'] = train_data[col].map(counts)
    test_data[col + '_freq'] = test_data[col].map(counts)
    if with_mean:
        # NOTE(review): full-train target means — optimistic under CV.
        means = train_data.groupby(col)['target'].mean()
        train_data[col + '_mean'] = train_data[col].map(means)
        test_data[col + '_mean'] = test_data[col].map(means)
# Same treatment for the hour of day.
hour_counts = train_data['common_ts_hour'].value_counts()
train_data['hour_freq'] = train_data['common_ts_hour'].map(hour_counts)
# BUGFIX: the original mapped test hour_freq from TEST-split counts,
# inconsistent with every other frequency feature (all use TRAIN counts).
test_data['hour_freq'] = test_data['common_ts_hour'].map(hour_counts)
hour_means = train_data.groupby('common_ts_hour')['target'].mean()
train_data['hour_mean'] = train_data['common_ts_hour'].map(hour_means)
test_data['hour_mean'] = test_data['common_ts_hour'].map(hour_means)
# Persist the engineered training set for inspection
train_data.to_csv("changed.csv", index=None)
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

# Shared feature matrix (drop raw/id/label columns) and labels.
features = train_data.drop(['udmap', 'common_ts', 'uuid', 'target'], axis=1)
labels = train_data['target']

# Cross-validated report for the decision tree on the engineered features.
pred = cross_val_predict(
    DecisionTreeClassifier(),
    features,
    labels
)
print(classification_report(labels, pred, digits=3))

# Refit on the full training set for the submission.
clf = DecisionTreeClassifier()
clf.fit(features, labels)

# NOTE: these are TRAIN-set scores — an unpruned tree memorises its training
# data, so they are optimistic; the cross-validated report above is the
# honest estimate.
pred = clf.predict(features)
precision = precision_score(train_data["target"], pred)
recall = recall_score(train_data["target"], pred)
# BUGFIX: the original did `f1_score = f1_score(...)`, rebinding the imported
# function to a float and breaking any later call to f1_score.
f1 = f1_score(train_data["target"], pred)
print(precision)
print(recall)
print(f1)

# Build and save the submission file.
result_df = pd.DataFrame({
    'uuid': test_data['uuid'],
    'target': clf.predict(test_data.drop(['udmap', 'common_ts', 'uuid'], axis=1))
})
result_df.to_csv('submit.csv', index=None)
发现:1、添加特征后模型的精度增加
2、加入了额外的四个特征:hour_feature、day_feature、hour_freq和hour_mean
模型精度:上升