思路如下:
- 读取数据
- 处理数据的标签(y值)
- 用文本训练word2vec模型
- 对文本进行分词
- 将分词结果映射为word2vec向量
- 建立神经网络模型并训练
- 预测
直接上代码:
import pandas as pd
import numpy as np
import jieba
import re
import multiprocessing
from multiprocessing import Pool
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Dense, Embedding, Activation, merge, Input, Lambda, Reshape,BatchNormalization
from keras.layers import Conv1D, Flatten, Dropout, MaxPool1D, GlobalAveragePooling1D,SeparableConvolution1D
from keras import regularizers
from keras.layers.merge import concatenate
from sklearn.externals import joblib
import heapq
# Load the training set: one tab-separated record per line, stored in a
# DataFrame with the columns 's_no', 'deal_code' and 'text'.
# The `with` block closes the file even if reading raises part-way through.
with open(r'C:\Users\admin\Desktop\game_test\game\apptype_train.dat', encoding='utf-8') as f:
    # Split on at most two tabs so that a tab inside the trailing text field
    # stays in that field instead of producing an extra column, which would
    # make the DataFrame construction below fail.
    # NOTE(review): assumes only the last field can contain tabs -- confirm.
    sentimentlist = [line.strip().split('\t', 2) for line in f]
df_train = pd.DataFrame(sentimentlist, columns=['s_no', 'deal_code', 'text'])
# Reshape the training labels so that every row carries a single code.
# 'deal_code' is split on '|' into exactly two parts (the column assignment
# below requires two resulting columns); each record is then emitted once per
# part, with the code stored in 'deal_code2'.
df_ = df_train['deal_code'].str.split('|', expand=True)
df_.columns = ['deal_code1', 'deal_code2']
df_2 = pd.concat([df_train, df_], axis=1)
# First code of each record, renamed so both halves share one column name.
a = df_2[['s_no', 'deal_code1', 'text']].rename(columns={'deal_code1': 'deal_code2'})
# Second code of each record (missing when 'deal_code' had no '|').
b = df_2[['s_no', 'deal_code2', 'text']]
df_3 = pd.concat([a, b], axis=0)
# Keep only rows that actually have a code. The mask is passed as a plain
# array so it is applied positionally; df_3 keeps the original row labels
# twice after the vertical concat.
df_train_end = df_3[df_3['deal_code2'].notnull().values]
# Load the test set: one tab-separated record per line, stored in a
# DataFrame with the columns 's_no' and 'text'.
# The `with` block closes the file even if reading raises part-way through.
with open(r'C:\Users\admin\Desktop\game_test\game\app_desc.dat', encoding='utf-8') as f:
    # Split on the first tab only so that a tab inside the text field stays
    # in that field instead of producing an extra column, which would make
    # the DataFrame construction below fail.
    # NOTE(review): assumes only the last field can contain tabs -- confirm.
    sentimentlist = [line.strip().split('\t', 1) for line in f]
df_test = pd.DataFrame(sentimentlist, columns=['s_no', 'text'])
print