Code Modularization
Function for generating positive and negative sample pairs in each subspace (preliminary filtering)
import numpy as np
from keras.preprocessing.sequence import pad_sequences

def train_pairs_pos_neg(SubSpace_dict):
    # Collect the paper IDs and their texts in matching order.
    documents = []
    index_list = []
    for key, value in SubSpace_dict.items():
        index_list.append(key)
        documents.append(value)
    SubSpace_train_pairs = []   # candidate positive pairs
    SubSpace_train_pairs_ = []  # candidate negative pairs
    doc_similar = DocumentSimilar(documents)
    for key, value in SubSpace_dict.items():
        a = key
        temp = list(doc_similar.get_similar(value))
        temp_ = temp.copy()
        # The largest similarity is the document with itself, so the
        # second-largest one identifies the most similar *other* paper.
        Second_Num = SecMax(temp_)
        min_num = min(temp)
        Second_Maxnum_Index = temp.index(Second_Num)
        min_num_index = temp.index(min_num)
        b = index_list[Second_Maxnum_Index]  # most similar other paper
        c = index_list[min_num_index]        # least similar paper
        SubSpace_train_pairs.append([a, b, Second_Num])
        SubSpace_train_pairs_.append([a, c, min_num])
    SubSpace_train_pairs = np.array(SubSpace_train_pairs)
    SubSpace_train_pairs_ = np.array(SubSpace_train_pairs_)
    return SubSpace_train_pairs, SubSpace_train_pairs_
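train_pairs_pos_neg depends on the DocumentSimilar class and the SecMax helper from the earlier posts in this series. For readers landing here directly, a minimal sketch of what they are assumed to do, using TF-IDF cosine similarity (the actual implementation in the earlier posts may differ):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

class DocumentSimilar:
    # Assumed interface: built once over the corpus, then queried per document.
    def __init__(self, documents):
        self.vectorizer = TfidfVectorizer()
        self.matrix = self.vectorizer.fit_transform(documents)

    def get_similar(self, document):
        # Similarity of one document against every document in the corpus.
        vec = self.vectorizer.transform([document])
        return cosine_similarity(vec, self.matrix)[0]

def SecMax(values):
    # Second-largest value; the largest similarity is always the
    # document with itself (1.0), which is not a useful training pair.
    values.remove(max(values))
    return max(values)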
Generating the 100 positive sample pairs in each subspace
def Max100_pos(SubSpace_train_pairs_pos):
    # Sort the candidate pairs by their similarity score (last column)
    # and keep the indices of the 100 highest.
    list_ = [each[-1] for each in SubSpace_train_pairs_pos]
    array = np.array(list_)
    Max100 = list(array.argsort()[-100:][::-1])
    Max100_pos = []
    for each in Max100:
        temp_li = SubSpace_train_pairs_pos[int(each)]
        Max100_pos.append([int(temp_li[0]), int(temp_li[1])])
    return Max100_pos
Generating the 100 negative sample pairs in each subspace
def Max100_neg(SubSpace_train_pairs_neg):
    # Sort the candidate pairs by their similarity score (last column)
    # and keep the indices of the 100 lowest.
    list_ = [each[-1] for each in SubSpace_train_pairs_neg]
    array_ = np.array(list_)
    Min100_ = list(array_.argsort()[:100])
    Min100_neg = []
    for each in Min100_:
        temp_li = SubSpace_train_pairs_neg[int(each)]
        Min100_neg.append([int(temp_li[0]), int(temp_li[1])])
    return Min100_neg
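Putting the three functions together, pair generation on one subspace looks like this; the SubSpace_dict below is only a toy example (a real subspace contains far more papers, since trainmodel later assumes 100 pairs of each kind):

# Toy subspace: paper ID -> preprocessed text.
SubSpace_dict = {0: "deep learning for text",
                 1: "text classification with deep learning",
                 2: "graph partitioning algorithms",
                 3: "spectral methods for graphs"}

pairs_pos, pairs_neg = train_pairs_pos_neg(SubSpace_dict)
Max100_pos_list = Max100_pos(pairs_pos)  # up to 100 most similar pairs
Min100_neg_list = Max100_neg(pairs_neg)  # up to 100 least similar pairs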
Function for converting the training samples (sentences) into sequences
def sample2sequence(Max100_pos_list, Min100_neg_list, SubSpace_dict):
    # word_index is the global word -> index mapping built during
    # preprocessing in the earlier posts.
    def to_padded(pair_list, pos):
        # Turn the paper at position `pos` of every pair into a padded
        # sequence of word indices; out-of-vocabulary words are dropped.
        seq_list = []
        for each in pair_list:
            words = SubSpace_dict[each[pos]].split(" ")
            seq_list.append([word_index[w] for w in words if w in word_index])
        return pad_sequences(seq_list, maxlen=150)

    pos_index_pad_array_first = to_padded(Max100_pos_list, 0)
    pos_index_pad_array_second = to_padded(Max100_pos_list, 1)
    neg_index_pad_array_first = to_padded(Min100_neg_list, 0)
    neg_index_pad_array_second = to_padded(Min100_neg_list, 1)
    # Stack positives first, then negatives; the labels built in
    # trainmodel() below rely on this order.
    index_pad_array_first = np.concatenate(
        (pos_index_pad_array_first, neg_index_pad_array_first), axis=0)
    index_pad_array_second = np.concatenate(
        (pos_index_pad_array_second, neg_index_pad_array_second), axis=0)
    return index_pad_array_first, index_pad_array_second
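With 100 positive and 100 negative pairs, sample2sequence returns two aligned (200, 150) arrays: row i of the first array and row i of the second array form one training pair, positives first. For example:

index_pad_array_first, index_pad_array_second = sample2sequence(
    Max100_pos_list, Min100_neg_list, SubSpace_dict)
print(index_pad_array_first.shape)   # (200, 150): 100 positive + 100 negative pairs
print(index_pad_array_second.shape)  # (200, 150)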
Given the IDs of two papers, obtain their sequence representations; this is mainly used in the model-prediction step later on
def test2sequence(firstId, secondId, SubSpace_dict):
    def to_padded(paper_id):
        words = SubSpace_dict[paper_id].split(" ")
        seq = [word_index[w] for w in words if w in word_index]
        return pad_sequences([seq], maxlen=150)

    pad_array_first = to_padded(firstId)
    pad_array_second = to_padded(secondId)
    return pad_array_first, pad_array_second
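For example, to prepare a single pair of papers for prediction (the IDs here are hypothetical):

pad_array_first, pad_array_second = test2sequence(12, 87, SubSpace_dict)
# Each array has shape (1, 150) and can be fed straight into model.predict.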
Model
Notes
The details of the model were already covered in the previous post (8). Since a model will be built on each of the 5 subspaces, the model-building code is wrapped in a class here to reduce duplication.
Code
In addition, the earlier posts did not include the prediction step, so the model-prediction part has been added here as well.
import keras.backend as K
from keras import optimizers
from keras.models import Model, load_model
from keras.layers import Input, Embedding, LSTM, Dense, Dropout, Lambda, concatenate
from keras.callbacks import ModelCheckpoint

class MyModel():
    def __init__(self, batch_size=None, num_epochs=None, word_index=None, subId=None,
                 index_pad_array_first=None, index_pad_array_second=None, threeRules=None):
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.word_index = word_index
        self.subId = subId
        self.index_pad_array_first = index_pad_array_first
        self.index_pad_array_second = index_pad_array_second
        self.threeRules = threeRules
        self.model = None

    def buildmodel(self):
        print('building model...')
        # embedword_matrix is the pre-trained embedding matrix built in
        # the earlier preprocessing posts.
        embedding_layer = Embedding(len(self.word_index) + 1,
                                    256,
                                    weights=[embedword_matrix],
                                    input_length=150, trainable=True)
        sequence_input1 = Input(shape=(150,), name="first_paper")
        sequence_input2 = Input(shape=(150,), name="second_paper")
        sequence_input3 = Input(shape=(3,), name="rule")
        embedded_sequences1 = embedding_layer(sequence_input1)
        embedded_sequences2 = embedding_layer(sequence_input2)
        # Bidirectional LSTM over each paper: a forward and a backward
        # pass, concatenated along the feature axis.
        LSTM_Left1 = LSTM(512, implementation=2, return_sequences=True, go_backwards=False)(embedded_sequences1)
        LSTM_Right1 = LSTM(512, implementation=2, return_sequences=True, go_backwards=True)(embedded_sequences1)
        concat1 = concatenate([LSTM_Left1, LSTM_Right1], axis=-1)
        LSTM_Left2 = LSTM(512, implementation=2, return_sequences=True, go_backwards=False)(embedded_sequences2)
        LSTM_Right2 = LSTM(512, implementation=2, return_sequences=True, go_backwards=True)(embedded_sequences2)
        concat2 = concatenate([LSTM_Left2, LSTM_Right2], axis=-1)
        z1 = Dense(512, activation='tanh')(concat1)
        z2 = Dense(512, activation='tanh')(concat2)
        # Max pooling over the time axis gives a fixed-size vector per paper.
        z1_MaxPool = Lambda(lambda x: K.max(x, axis=1), output_shape=(512,))(z1)
        z2_MaxPool = Lambda(lambda x: K.max(x, axis=1), output_shape=(512,))(z2)
        concat = concatenate([z1_MaxPool, z2_MaxPool], axis=-1)
        model_final = Dense(7, activation='relu')(concat)
        # Append the three hand-crafted rule features before classification.
        model_final = concatenate([model_final, sequence_input3], axis=-1)
        model_final = Dropout(0.5)(model_final)
        model_final = Dense(2, activation='softmax')(model_final)
        self.model = Model(inputs=[sequence_input1, sequence_input2, sequence_input3],
                           outputs=model_final)
        adam = optimizers.Adam(lr=0.0001)
        self.model.compile(loss='binary_crossentropy',
                           optimizer=adam,
                           metrics=['accuracy'])
        print(self.model.summary())

    def trainmodel(self):
        self.buildmodel()
        checkpointer = ModelCheckpoint(filepath="model/" + str(self.subId) + "_model-{epoch:02d}.hdf5", period=1)
        # The first 100 samples are positive pairs and the last 100 are
        # negative pairs, matching the order produced by sample2sequence.
        pos_list = [[1, 0]] * 100
        neg_list = [[0, 1]] * 100
        y_train = np.asarray(pos_list + neg_list).astype('float32')
        self.model.fit([self.index_pad_array_first, self.index_pad_array_second, self.threeRules], y_train,
                       batch_size=self.batch_size, epochs=self.num_epochs, verbose=1,
                       callbacks=[checkpointer])
        self.save_model()

    def predmodel(self, modelname, index_pad_array_first, index_pad_array_second, threeRules):
        self.model = load_model(modelname)
        predlabel = self.model.predict([index_pad_array_first, index_pad_array_second, threeRules],
                                       batch_size=512, verbose=1)
        return predlabel

    def save_model(self):
        self.model.save("model/model" + str(self.subId) + '.h5')
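A sketch of how the class is driven on one subspace. The threeRules argument is assumed to be the (n, 3) array of rule features from the earlier posts (zeros are used as a placeholder below), and the model file name follows the pattern used by save_model:

# Train a model for subspace 0 (batch size and epochs are placeholder values).
threeRules_train = np.zeros((200, 3), dtype='float32')  # placeholder rule features
mymodel = MyModel(batch_size=32, num_epochs=10, word_index=word_index, subId=0,
                  index_pad_array_first=index_pad_array_first,
                  index_pad_array_second=index_pad_array_second,
                  threeRules=threeRules_train)
mymodel.trainmodel()

# Predict the label of one paper pair with the saved model.
pad_first, pad_second = test2sequence(12, 87, SubSpace_dict)
threeRules_test = np.zeros((1, 3), dtype='float32')
pred = mymodel.predmodel("model/model0.h5", pad_first, pad_second, threeRules_test)
print(pred)  # [p_similar, p_dissimilar], matching the training labels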
Summary
This post mainly consolidates the previously scattered code into functions; code that only runs once was not wrapped in a function.
Since the next step is to take a set of input papers and output all similar paper pairs in each subspace, the handling of the test data still needs further work.
The current model also needs further tuning, and it could be combined with more auxiliary information.
Beyond that, what remains is integrating this with the code written by the other students.