添加数据
通过添加第 1、2、3、4、5 类数据,使用 ngram 词向量和 xgboost 分类器,效果大幅提高
def _load_label_file(path):
    """Return the two objects pickled back-to-back in the file at *path*.

    Every data file read by this script stores a labels object first and
    the matching API-sequence list second, so they are returned in that
    order as ``(labels, apis)``.
    """
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file; only load pickles that were produced by this project.
    with open(path, "rb") as f:
        file_labels = pickle.load(f)
        file_apis = pickle.load(f)
    return file_labels, file_apis


# Size remarks in the trailing comments below are the values recorded by the
# original author for their data files; they are not checked at runtime.

# Class 1 data
labels_1, train_apis_1 = _load_label_file("new_data/label_1.pkl")  # list 287
print('label 1 length:', len(train_apis_1))

# Class 2 data
labels_2, train_apis_2 = _load_label_file("new_data/label_2.pkl")  # list 744
print('label 2 length:', len(train_apis_2))

# Class 3 data
labels_3, train_apis_3 = _load_label_file("new_data/label_3.pkl")  # list 598
print('label 3 length:', len(train_apis_3))

# Class 4 data
labels_4, train_apis_4 = _load_label_file("new_data/label_4.pkl")  # ndarray (53,), list 53
print('label 4 length:', len(train_apis_4))

# Class 5 data
labels_5, train_apis_5 = _load_label_file("new_data/label_5.pkl")  # list 3397
print('label 5 length:', len(train_apis_5))

# Full original training set
labels, train_apis = _load_label_file("security_train.csv.pkl")  # ndarray (13887,), list 13887

# Merge: append the extra samples to the training list in class order 1..5.
# The labels are concatenated in the same order below so that sample i and
# label i stay aligned.
for extra_apis in (train_apis_1, train_apis_2, train_apis_3, train_apis_4, train_apis_5):
    train_apis.extend(extra_apis)
print('concat length:', len(train_apis))  # 18966
labels = np.concatenate((labels, labels_1, labels_2, labels_3, labels_4, labels_5))
print('label length:', labels.shape)  # (18966,)
未添加数据前得分:0.471710
添加数据后得分:0.435405
得分越低越好(由 0.471710 降至 0.435405),足以看出,数据对模型效果的影响十分巨大。
拼接词向量
将 ngram 训练出的词向量和 word2vec 词向量拼接起来再用 xgboost 进行分类
# Shape remarks in the trailing comments are the values recorded by the
# original author ("unmerged" = before the extra class data was added,
# "merged" = after); they are not checked at runtime.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by this project.

# Load the word2vec vectors
with open("word2vec_100d_concat.pkl", 'rb') as f:
    train_word2vec = pickle.load(f)  # list, 13887 (unmerged) / 18966 (merged)
    test_word2vec = pickle.load(f)  # list, 12955

# Load the ngram vectors
with open("ngram_vec_concat.pkl", 'rb') as f:
    train_ngram = pickle.load(f)  # csr_matrix (13887, 180858) unmerged / (18966, 205183) merged
    y_train = pickle.load(f)  # (13887,)
    test_ngram = pickle.load(f)  # csr_matrix (12955, 180858)
    test_nums = pickle.load(f)  # list, 12955

train_word2vec = np.array(train_word2vec)  # (18966, 100)
test_word2vec = np.array(test_word2vec)  # (12955, 100) ndarray

# Cast while still sparse. The previous version densified the ngram matrices
# with `.A`, concatenated, and converted straight back to CSR, which needs
# rows * columns * 4 bytes per dense copy (tens of GB at the shapes noted
# above) for no benefit.
train_ngram = train_ngram.astype('float32')
test_ngram = test_ngram.astype('float32')

# Column-wise concatenation [ngram | word2vec], kept sparse throughout.
# hstack upcasts to the common dtype of its blocks, as np.concatenate did.
x_train = sparse.hstack([train_ngram, sparse.csr_matrix(train_word2vec)], format='csr')
x_test = sparse.hstack([test_ngram, sparse.csr_matrix(test_word2vec)], format='csr')

# The dense round-trip dropped every zero entry when rebuilding the CSR
# matrix; remove any explicitly stored zeros so the result matches.
x_train.eliminate_zeros()
x_test.eliminate_zeros()
拼接前得分:0.435405
拼接后得分:0.433548
增加 100 维 word2vec 词向量后,得分由 0.435405 降至 0.433548(得分越低越好),效果略有提升;word2vec 参数尚未调试,增加维数后效果可能会更好。