Data
Training data:
If you want the data, send me a private message; it has not been uploaded to Baidu Cloud yet.
The data only needs to follow the usual classification format, for example:
content | label |
---|---|
code content | java |
code content | python |
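A quick way to sanity-check the training CSV that generate_train_data() (in utils.py below) writes; note that in the code the content column is named 'language', not 'content':

import pandas as pd

# train_data.csv is written by generate_train_data() below; its columns
# are 'language' (the code text), 'label' and 'label_id'.
df = pd.read_csv('data/train_data.csv')
print(df.columns.tolist())
print(df['label'].value_counts())  # rough view of the class balance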
Development approach
Convert each code snippet into a vector, then train a multi-class classifier with xgboost.
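In outline, using the functions defined in utils.py below (a sketch: some_snippet and model stand in for objects built in the main block):

data = generate_train_data(data_path, data_save_path)  # 1. collect and label the raw files
w2v_model = train_w2v(data, w2v_path)                  # 2. learn word vectors on the code
train_data = data_load(data, w2v_model)                # 3. snippet -> one 300-d mean vector
# 4. fit xgboost (multi:softmax, num_class=5) on the vectors (see the main block), then:
result = predict(some_snippet, w2v_model, model)       # 5. classify a new snippet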
Project directory structure
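A sketch of the layout, reconstructed from the paths defined in config.py (an assumption, but every entry below appears in the config):

code_classfy_test/
├── config.py
├── utils.py
├── data/
│   ├── Python/          # .txt files of code, one folder per language
│   ├── Java/
│   ├── HTML/
│   ├── JavaScript/
│   ├── Go/
│   └── train_data.csv   # written by generate_train_data()
└── model/
    ├── w2v              # saved Word2Vec model
    └── xgb_model        # saved xgboost model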
Code
config.py
# -*- coding: UTF-8 -*-
import os

# Project root: the directory this file lives in.
root_path = os.path.dirname(os.path.abspath(__file__))
python_path = os.path.join(root_path, 'data/Python')
java_path = os.path.join(root_path, 'data/Java')
html_path = os.path.join(root_path, 'data/HTML')
js_path = os.path.join(root_path, 'data/JavaScript')
go_path = os.path.join(root_path, 'data/Go')
data_save_path = os.path.join(root_path, 'data/train_data.csv')
w2v_path = os.path.join(root_path, 'model/w2v')
model_path = os.path.join(root_path, 'model')
if not os.path.exists(model_path):
    os.makedirs(model_path)
xgb_model_path = os.path.join(root_path, 'model/xgb_model')

data_path = {
    'python_path': python_path,
    'java_path': java_path,
    'html_path': html_path,
    'js_path': js_path,
    'go_path': go_path
}

params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',  # multi-class classification
    'num_class': 5,            # number of classes, required with multi:softmax
    'gamma': 0.1,              # post-pruning control; larger is more conservative (0.1-0.2 is typical)
    'max_depth': 12,           # tree depth; deeper trees overfit more easily
    'lambda': 2,               # L2 regularization on leaf weights; larger means less overfitting
    'subsample': 0.7,          # row subsampling of the training set
    'colsample_bytree': 0.7,   # column subsampling when building each tree
    'min_child_weight': 3,
    'silent': 1,               # 1 suppresses log output; set to 0 while debugging
                               # (deprecated in xgboost >= 1.0 in favor of 'verbosity')
    'eta': 0.007,              # learning rate
    'seed': 1000,
    'nthread': 4,              # number of CPU threads
}
utils.py
#%%
import numpy as np
import pandas as pd
import os
from config import data_path, data_save_path, w2v_path, params, xgb_model_path
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from time import time
import multiprocessing
import gensim
from gensim.models import Word2Vec
from gensim.models.phrases import Phraser,Phrases
from gensim.models import KeyedVectors
import xgboost as xgb
# Collect the paths of all files with the given extension under a directory.
def get_files_path(file_dir, filetype='.txt'):
    files_path = []
    for root, dirs, files in os.walk(file_dir):
        for file in files:
            if os.path.splitext(file)[1] == filetype:
                files_path.append(os.path.join(root, file))
    return files_path

# Read the contents of every file in the list.
def get_file_content(files_path):
    code = []
    for file_path in files_path:
        # Source files are not guaranteed to be valid UTF-8; skip bad bytes.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            code.append(f.read())
    return code
def generate_train_data(datapath, save_path):
    # Column 'language' holds the raw code text; 'label' holds the language name.
    label_to_key = {'python': 'python_path', 'java': 'java_path',
                    'html': 'html_path', 'js': 'js_path', 'go': 'go_path'}
    frames = []
    for label, path_key in label_to_key.items():
        files_path = get_files_path(datapath[path_key])
        files_content = get_file_content(files_path)
        df = pd.DataFrame(files_content, columns=['language'])
        df['label'] = label
        frames.append(df)
    data = pd.concat(frames, axis=0)
    codetype_mapping = {'python': 0, 'java': 1, 'html': 2, 'js': 3, 'go': 4}
    data['label_id'] = data['label'].map(codetype_mapping)
    data = shuffle(data)
    data.index = range(data.shape[0])
    data.to_csv(save_path, index=False)
    return data
def train_w2v(data, to_file):
    # Whitespace-tokenize each snippet, then learn common bigrams (gensim 3.x API).
    sent = [row.split() for row in data['language']]
    phrases = Phrases(sent, min_count=5, progress_per=10000)
    bigram = Phraser(phrases)
    sentences = bigram[sent]
    cores = multiprocessing.cpu_count()
    w2v_model = Word2Vec(min_count=2,
                         window=2,
                         size=300,
                         sample=6e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=15,
                         workers=cores - 1,
                         iter=7)
    t = time()
    w2v_model.build_vocab(sentences)
    print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))
    t = time()
    # Note: epochs=15 here overrides the iter=7 set in the constructor.
    w2v_model.train(sentences,
                    total_examples=w2v_model.corpus_count,
                    epochs=15,
                    report_delay=1)
    print('Time to train vocab: {} mins'.format(round((time() - t) / 60, 2)))
    w2v_model.save(to_file)
    return w2v_model
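The constructor above uses the gensim 3.x argument names. On gensim >= 4.0 a few of them were renamed; a minimal sketch of the equivalent call:

# gensim >= 4.0 renames: size -> vector_size, iter -> epochs, and
# model.wv.vocab -> model.wv.key_to_index (which wam() below touches).
w2v_model = Word2Vec(min_count=2, window=2, vector_size=300, sample=6e-5,
                     alpha=0.03, min_alpha=0.0007, negative=15,
                     workers=cores - 1, epochs=7)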
# Word-average model: embed a snippet as the mean of its token vectors.
def wam(sentence, w2v_model):
    arr = []
    for s in sentence.split():
        if s not in w2v_model.wv.vocab.keys():
            # Out-of-vocabulary token: fall back to a random vector of
            # shape (300,), same as get_vector's output, so the stack is uniform.
            arr.append(np.random.randn(300))
        else:
            arr.append(w2v_model.wv.get_vector(s))
    if not arr:
        # An empty input would make np.mean return NaN; use a zero vector instead.
        return np.zeros((1, 300))
    return np.mean(np.array(arr), axis=0).reshape(1, -1)

def data_load(data, w2v_model):
    data['language_vec'] = data['language'].apply(
        lambda x: wam(x, w2v_model))
    # wam() always returns shape (1, 300); just drop rows with missing values.
    data = data.dropna()
    return data
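For illustration, embedding a single (made-up) snippet:

vec = wam('def foo(): return 42', w2v_model)  # hypothetical input
print(vec.shape)  # (1, 300)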
def predict(testdata, w2v_model, xgb_model):
    # Embed the input snippet, then wrap it in a one-row DataFrame with
    # the same column names used at training time.
    vec = wam(testdata, w2v_model)
    col_names = []
    for i in range(0, 300):
        col_name = 'col' + str(i)
        col_names.append(col_name)
    testx_df = pd.DataFrame(columns=col_names)
    testx_df.loc[0] = [vec[0][n] for n in range(300)]
    Dm_indata = xgb.DMatrix(testx_df)
    result = xgb_model.predict(Dm_indata)
    return result
if __name__ == '__main__':
    # Training path: uncomment the data-preparation lines below before the
    # first run, so that `data`, `dtrain` and `num_rounds` are defined.
    # data = generate_train_data(data_path, data_save_path)
    # data = pd.read_csv(data_save_path, index_col=False)
    if not os.path.exists(w2v_path):
        w2v_model = train_w2v(data, w2v_path)
    else:
        # train_w2v saves a full Word2Vec model, so load it back with the
        # same class (wam() needs the .wv attribute).
        w2v_model = Word2Vec.load(w2v_path)
    # train_data = data_load(data, w2v_model)
    #
    # trainx = train_data['language_vec']
    # trainy = train_data['label_id']
    # col_names = []
    # for i in range(0, 300):
    #     col_name = 'col' + str(i)
    #     col_names.append(col_name)
    #
    # trainx_df = pd.DataFrame(columns=col_names)
    # for i in range(len(trainx)):
    #     trainx_df.loc[i] = [trainx[i][0][n] for n in range(300)]
    # print(trainx_df)
    #
    # X_train, X_test, y_train, y_test = train_test_split(trainx_df, trainy, test_size=0.2, random_state=100)
    #
    # dtrain = xgb.DMatrix(X_train, y_train)
    # dtest = xgb.DMatrix(X_test, y_test)
    # num_rounds = 500
    if not os.path.exists(xgb_model_path):
        model = xgb.train(params, dtrain, num_rounds)
        model.save_model(xgb_model_path)
    else:
        model = xgb.Booster()
        model.load_model(xgb_model_path)
    # pred = model.predict(dtest)
    # error_rate = np.sum(pred != y_test) / y_test.shape[0]
    # print('Test-set accuracy: {}'.format(1 - error_rate))
inputdata = """
def wam(sentence, w2v_model):
arr = []
for s in sentence.split():
if s not in w2v_model.wv.vocab.keys():
arr.append(np.random.randn(1, 300))
else:
arr.append(w2v_model.wv.get_vector(s))
return np.mean(np.array(arr), axis=0).reshape(1, -1)
def data_load(data, w2v_model):
data['language_vec'] = data['language'].apply(
lambda x: wam(x, w2v_model))
data['language_vec'] = data['language_vec'].apply(
lambda x: x[0][0] if x.shape[1] != 300 else x)
data = data.dropna()
return data
language_vec = data_load(datatrain, w2v_model)
# print(language_vec)
trainx = language_vec['language_vec']
trainy = language_vec['labelnum']
col_names = []
for i in range(0, 300):
col_name = 'col' + str(i)
col_names.append(col_name)
print(col_names)
trainx_df = pd.DataFrame( columns=col_names)
for i in range(len(trainx)):
trainx_df.loc[i] = [trainx[i][0][n] for n in range(300)]
#trainx_df.to_csv('/Users/zhangzc/Desktop/workplace/ask_test/train_vec.csv', index=False)
print(trainx_df)
"""
    codetype_mapping = {'python': 0, 'java': 1, 'html': 2, 'js': 3, 'go': 4}
    result = predict(inputdata, w2v_model, model)
    # Map the predicted class id back to its language name.
    id_to_label = {v: k for k, v in codetype_mapping.items()}
    print(id_to_label[int(result.item())])
Problems encountered during development
The snippet vectors could not be fed into xgboost's DMatrix as a raw array and had to be converted into a DataFrame first. Strictly speaking, xgb.DMatrix does accept a plain 2-D float array; the trouble comes from the object-dtype array you get when (1, 300) and (300,) vectors are mixed (see wam() above). For the conversion itself, see the blog post "python将多维array转换成多列dataframe" or the sketch below.
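A minimal sketch of that conversion, assuming train_data from the main block with 'language_vec' holding (1, 300) vectors:

import numpy as np
import pandas as pd
import xgboost as xgb

# Stack the per-row (1, 300) vectors into one (N, 300) float matrix,
# then name each column before building the DMatrix.
vecs = np.vstack([v.reshape(-1) for v in train_data['language_vec']])
vec_df = pd.DataFrame(vecs, columns=['col' + str(i) for i in range(300)])
dtrain = xgb.DMatrix(vec_df, label=train_data['label_id'])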
Remaining issues
I only have around 400 samples, with severe class imbalance, so accuracy across the five classes is only 56%. I will keep optimizing to improve it; one possible direction is sketched below.
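One common mitigation for the imbalance (not part of the original code, so treat this as a sketch): weight each training sample inversely to its class frequency and pass the weights to the DMatrix:

# Hypothetical: per-sample weights inversely proportional to class frequency.
# Assumes y_train / X_train from the train/test split in the main block.
counts = y_train.value_counts()
weights = y_train.map(lambda c: len(y_train) / (len(counts) * counts[c]))
dtrain = xgb.DMatrix(X_train, label=y_train, weight=weights)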
Summary
The code above runs the prediction path; to train, uncomment the marked lines in the main block. If you have better suggestions or approaches, feel free to message me any time. Thanks 🌹🌹