比赛通用模板
示例数据集
百度AI常规赛反欺诈预测
数据集链接: https://aistudio.baidu.com/aistudio/competition/detail/52
简要处理流程
导入与加载数据集
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
# Load the competition data: labelled training set and unlabelled test set.
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test1.csv')
train
警告抑制
# Silence all warnings (already applied in the import cell above; repeated
# here for the step-by-step walkthrough).
warnings.filterwarnings('ignore')
查看数据字段数量与类型
# Inspect column count, dtypes and non-null counts; train is a DataFrame.
# Fixes: `//` is not a Python comment, and bare `train.info` only references
# the bound method without calling it.
train.info()
查看某类特征数值分布与数量
# Frequency of each value in the 'lan' feature column.
# (Fixed: the original used a C-style `//` comment, a Python syntax error.)
train['lan'].value_counts()
去掉Dataframe首列
# Drop the first column of both frames — presumably an exported row index
# from to_csv; TODO confirm against the raw CSV header.
test = test.iloc[:, 1:]
train = train.iloc[:, 1:]
特征处理
特征选取
# Columns excluded later: ['os', 'osv', 'lan', 'sid']
# (Fixed: `//` comments are a Python syntax error; the closing quote was a
# typographic apostrophe.)
features = train.columns.tolist()
features.remove('label')  # drop the target column from the candidate features
print(features)
查看每个特征的值数量
# Number of distinct values per candidate feature (nunique = count of unique
# elements in the column). Fixed: loop-body indentation and the invalid
# C-style `//` comment.
for feature in features:
    print(feature, train[feature].nunique())
统计对哈希特征用长度编码
# Distribution of fea_hash string lengths — motivates the >16 length cutoff
# used when the feature is constructed below.
train['fea_hash'].map(lambda x: len(str(x))).value_counts()
筛选特征
# High-cardinality / identifier columns excluded from modelling.
remove_list = ['os', 'osv', 'lan', 'sid']
# Build the final column list without mutating `features`: the original
# `col = features` aliased the same list, so every remove() also altered
# `features` itself. (Also fixes the lost loop-body indentation.)
col = [c for c in features if c not in remove_list]
col
# Feature selection: restrict train to the chosen columns.
# copy() so the derived-column assignments below write into an independent
# frame, not a view of `train` (chained-assignment hazard).
features = train[col].copy()
# Length of the raw hash string as an extra feature (computed before the
# truncation below, so it sees the original value).
features['fea_hash_len'] = features['fea_hash'].map(lambda x: len(str(x)))
features['fea1_hash_len'] = features['fea1_hash'].map(lambda x: len(str(x)))
# Hashes longer than 16 characters are mapped to 0 (presumably too large to
# treat as a numeric value — TODO confirm rationale), otherwise cast to int.
features['fea_hash'] = features['fea_hash'].map(lambda x: 0 if len(str(x)) > 16 else int(x))
features['fea1_hash'] = features['fea1_hash'].map(lambda x: 0 if len(str(x)) > 16 else int(x))
用LightGBM进行训练
#train['os'].value_counts()
# Train a LightGBM classifier with default hyper-parameters.
import lightgbm as lgb
model = lgb.LGBMClassifier()
# Fit on all features except timestamp/version (dropped from the input
# matrix only — `features` itself is left unchanged).
model.fit(features.drop(['timestamp', 'version'], axis=1), train['label'])
# NOTE(review): `test_features` is not constructed anywhere in this
# walkthrough section — the full script further down builds it the same way
# as `features`.
result = model.predict(test_features.drop(['timestamp', 'version'], axis=1))
结果
#features['version'].value_counts()
# Build the submission file: one row per sid with the predicted label.
res = pd.DataFrame(test['sid'])
res['label'] = result
res.to_csv('./baseline.csv', index=False)
完整代码
#!/usr/bin/env python
# coding: utf-8
# ## baseline1版本,不参与建模的特征 ['os', 'osv', 'version', 'lan', 'sid']
# ## Score = 86.714
# In[1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
# Load the competition data: labelled training set and unlabelled test set.
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test1.csv')
train
# In[6]:
# Drop the first column of both frames — presumably an exported row index
# from to_csv; TODO confirm against the raw CSV header.
test = test.iloc[:, 1:]
train = train.iloc[:, 1:]
train
# In[15]:
#train.info()
#train['lan'].value_counts()
# Object-dtype (string) columns: lan, os, osv, version, fea_hash — these
# would need numeric encoding (e.g. LabelEncoder) before direct modelling.
object_cols = train.select_dtypes(include='object').columns
# Per-column count of missing values.
temp = train.isna().sum()
# Show only the columns that actually contain missing values: lan, osv.
temp.loc[temp > 0]
# ##### Object类型: lan, os, osv, version, fea_hash
# ##### 有缺失值的字段: lan, osv
# In[18]:
# Columns removed later: ['os', 'osv', 'lan', 'sid']
# (Fixed: typographic apostrophe in the comment; restored the loop-body
# indentation that was lost below.)
features = train.columns.tolist()
features.remove('label')  # drop the target column from the candidate features
print(features)
# In[19]:
# Number of distinct values per candidate feature.
for feature in features:
    print(feature, train[feature].nunique())
# In[32]:
# Thinking: does fea_hash need a feature transformation?
#train['fea_hash'].value_counts()
#train['fea_hash'].describe()
# Distribution of fea_hash string lengths — motivates the >16 length cutoff
# used when the features are constructed below.
train['fea_hash'].map(lambda x: len(str(x))).value_counts()
# In[31]:
#train['fea1_hash'].value_counts()
train['fea1_hash'].map(lambda x: len(str(x))).value_counts()
# In[27]:
# High-cardinality / identifier columns excluded from modelling.
remove_list = ['os', 'osv', 'lan', 'sid']
# Build the final column list without mutating `features`: the original
# `col = features` aliased the same list, so every remove() also altered
# `features` itself. (Also fixes the lost loop-body indentation.)
col = [c for c in features if c not in remove_list]
col
# In[35]:
# Feature selection + derived hash features, applied identically to train
# and test via a shared helper (the original duplicated this code verbatim).
def _build_features(df, cols):
    """Return df[cols] plus hash-derived columns.

    Adds fea_hash_len / fea1_hash_len (length of the raw hash string) and
    replaces hashes longer than 16 characters with 0 (presumably too large
    to treat as a numeric value — TODO confirm rationale), otherwise casts
    the hash to int.
    """
    out = df[cols].copy()  # copy: don't write derived columns into a view of df
    for hash_col in ('fea_hash', 'fea1_hash'):
        # Length feature first — it must see the raw, pre-truncation value.
        out[hash_col + '_len'] = out[hash_col].map(lambda x: len(str(x)))
        out[hash_col] = out[hash_col].map(lambda x: 0 if len(str(x)) > 16 else int(x))
    return out

features = _build_features(train, col)
features

# In[36]:
test_features = _build_features(test, col)
test_features
# In[41]:
#train['os'].value_counts()
# Train a LightGBM classifier with default hyper-parameters.
import lightgbm as lgb
model = lgb.LGBMClassifier()
# Fit on all features except timestamp/version (dropped from the input
# matrix only — `features` itself is left unchanged).
model.fit(features.drop(['timestamp', 'version'], axis=1), train['label'])
result = model.predict(test_features.drop(['timestamp', 'version'], axis=1))
result
# In[45]:
#features['version'].value_counts()
# Submission file: one row per sid with its predicted label.
res = test[['sid']].copy()
res['label'] = result
res.to_csv('./baseline.csv', index=False)
res