前言
参加了贝壳找房的房产问答匹配比赛(比赛链接:https://www.datafountain.cn/competitions/474),于是利用matchzoo库解决房产行业问答匹配比赛问题。
比赛流程
导入第三方库包
import matchzoo as mz
import pandas as pd
import numpy as np
import numpy as np
import tensorflow.keras as K
from matchzoo.preprocessors import BasicPreprocessor
from sklearn.model_selection import train_test_split
import datetime
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import tensorflow as tf
from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam
from random import choice
# from keras_bert import load_trained_model_from_checkpoint, Tokenizer
import re, os
import codecs
from keras.callbacks import Callback
数据预处理
1.读取数据集
# Load the raw competition data.
# The training files are UTF-8 TSVs without headers; the test files are
# GBK-encoded. Column names are supplied directly via `names=`.
train_left = pd.read_csv('./train/train.query.tsv', sep='\t',
                         header=None, names=['id', 'q1'])
train_right = pd.read_csv('./train/train.reply.tsv', sep='\t',
                          header=None, names=['id', 'id_sub', 'q2', 'label'])
df_train = train_left.merge(train_right, how='left')
# Some replies are missing; fall back to a neutral answer.
df_train['q2'] = df_train['q2'].fillna('好的')

test_left = pd.read_csv('./test/test.query.tsv', sep='\t',
                        header=None, names=['id', 'q1'], encoding='gbk')
test_right = pd.read_csv('./test/test.reply.tsv', sep='\t',
                         header=None, names=['id', 'id_sub', 'q2'], encoding='gbk')
df_test = test_left.merge(test_right, how='left')
2.检查训练集和测试集
训练集
测试集
3.将数据集转化成matchzoo的形式
# Build the training/validation frame in the column layout MatchZoo
# expects: (id_left, text_left, id_right, text_right, label).
all_data = pd.DataFrame({
    'id_left': range(len(df_train)),
    'text_left': df_train.q1.values,
    'id_right': range(len(df_train)),
    'text_right': df_train.q2.values,
    'label': df_train.label.values,
})
# Build the test frame the same way; the test set has no label column.
tmp_data = pd.DataFrame({
    'id_left': range(len(df_test)),
    'text_left': df_test.q1.values,
    'id_right': range(len(df_test)),
    'text_right': df_test.q2.values,
})
4.转化成matchzoo需要的格式,构造数据管道
def load_data(df_data):
    """Wrap a pandas DataFrame (in MatchZoo column layout) into a DataPack."""
    return mz.pack(df_data)
# Convert the DataFrames into MatchZoo DataPacks.
train_data = load_data(all_data)
test_data = load_data(tmp_data)

# Use the most basic preprocessor (BasicPreprocessor) and pad/truncate
# both sides of every text pair to a fixed length of 15 tokens.
preprocessor = BasicPreprocessor(15, 15)

# BUG FIX: `train_split` was used below without ever being defined,
# which raises a NameError at runtime. Hold out the last 20% of the
# training pack as a dev set (ratio is a common default -- adjust if the
# original author intended a different split).
train_split = int(len(train_data) * 0.8)
train = train_data[:train_split]
dev = train_data[train_split:]

# fit_transform learns the vocabulary on the training split only.
# It essentially maps characters to ids, so Chinese text needs no
# word segmentation here.
train_pack_processed = preprocessor.fit_transform(train)
dev_pack_processed = preprocessor.transform(dev)
test_pack_processed = preprocessor.transform(test_data)

# Training-data generator: shuffled mini-batches of 32.
train_data_generator = mz.DataGenerator(train_pack_processed,
                                        batch_size=32,
                                        shuffle=True)

# Unpack into (features, labels). The test pack carries no label column,
# so `test_y` is not meaningful.
test_x, test_y = test_pack_processed.unpack()
dev_x, dev_y = dev_pack_processed.unpack()
5.创建目录的源码
def mkdir(path):
    """Create directory *path* (including parents) if it does not exist.

    Leading/trailing whitespace and any trailing path separator are
    stripped before checking. A status message is printed either way.

    Args:
        path: directory path to create.

    Returns:
        True if the directory was created, False if it already existed.
    """
    import os
    # Normalize: drop surrounding whitespace and trailing separators.
    # (The original only stripped '\\'; also strip '/' so POSIX-style
    # paths like 'outputs/result/' are handled.)
    path = path.strip().rstrip("\\/")
    if os.path.exists(path):
        # Directory (or file) already present -- report and skip creation.
        print(path + ' 目录已存在')
        return False
    # makedirs also creates any missing intermediate directories.
    os.makedirs(path)
    print(path + ' 创建成功')
    return True
模型训练
DenseBaseline
### Define the task. MatchZoo supports two task types: Ranking and Classification.
task = mz.tasks.Ranking()
print('='*20)
print(task)

### Create the model and configure its parameters.
# (Use mz.models.list_available() to list all available models.)
modeltype = 'DenseBaseline'  # consistency: later sections use a modeltype variable
model = mz.models.DenseBaseline()
model.params['task'] = task
model.params['mlp_num_units'] = 3
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()

### Train, evaluate, predict.
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))

# Dump test-set predictions to a timestamped CSV under the result dir.
output_csv = pd.DataFrame()
pred = model.predict(test_x)
output_csv["pred"] = list(pred)
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result', modeltype)
logdir1 = os.path.join('outputs', 'model', modeltype)
mkdir(logdir)
# Reuse logdir instead of re-concatenating the path by hand.
output_csv.to_csv(os.path.join(logdir, stamp + '-pred.csv'))
mkdir(logdir1)

### Save the trained model.
model.save(os.path.join(logdir1, stamp))
print('保存成功')
# To load a saved model later:
# loaded_model = mz.load_model(os.path.join(logdir1, stamp))
DenseBaseline的运行结果(之后的模型结果由于篇幅就不展示了)
DRMMTKS
### Define the task. MatchZoo supports two task types: Ranking and Classification.
task = mz.tasks.Ranking()
print('='*20)
print(task)

### Create the model and configure its parameters.
# (Use mz.models.list_available() to list all available models.)
modeltype = 'DRMMTKS'  # consistency: later sections use a modeltype variable
model = mz.models.DRMMTKS()
model.params['embedding_input_dim'] = 10000
model.params['embedding_output_dim'] = 100
model.params['top_k'] = 20
model.params['mlp_num_layers'] = 1
model.params['mlp_num_units'] = 5
model.params['mlp_num_fan_out'] = 1
model.params['mlp_activation_func'] = 'tanh'
model.guess_and_fill_missing_params(verbose=0)
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()

### Train, evaluate, predict.
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))

# Dump test-set predictions to a timestamped CSV under the result dir.
output_csv = pd.DataFrame()
pred = model.predict(test_x)
output_csv["pred"] = list(pred)
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result', modeltype)
logdir1 = os.path.join('outputs', 'model', modeltype)
mkdir(logdir)
output_csv.to_csv(os.path.join(logdir, stamp + '-pred.csv'))
mkdir(logdir1)

### Save the trained model.
model.save(os.path.join(logdir1, stamp))
print('保存成功')
# To load a saved model later:
# loaded_model = mz.load_model(os.path.join(logdir1, stamp))
KNRM
### Define the task. MatchZoo supports two task types: Ranking and Classification.
task = mz.tasks.Ranking()
print('='*20)
print(task)

### Create the model and configure its parameters.
# (Use mz.models.list_available() to list all available models.)
modeltype = 'KNRM'  # consistency: later sections use a modeltype variable
model = mz.models.KNRM()
model.params['embedding_input_dim'] = 10000
model.params['embedding_output_dim'] = 10
model.params['embedding_trainable'] = True
model.params['kernel_num'] = 11
model.params['sigma'] = 0.1
model.params['exact_sigma'] = 0.001
model.guess_and_fill_missing_params(verbose=0)
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()

### Train, evaluate, predict.
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))

# Dump test-set predictions to a timestamped CSV under the result dir.
output_csv = pd.DataFrame()
pred = model.predict(test_x)
output_csv["pred"] = list(pred)
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result', modeltype)
logdir1 = os.path.join('outputs', 'model', modeltype)
mkdir(logdir)
output_csv.to_csv(os.path.join(logdir, stamp + '-pred.csv'))
mkdir(logdir1)

### Save the trained model.
model.save(os.path.join(logdir1, stamp))
print('保存成功')
# To load a saved model later:
# loaded_model = mz.load_model(os.path.join(logdir1, stamp))
MVLSTM
### Define the task. MatchZoo supports two task types: Ranking and Classification.
task = mz.tasks.Ranking()
print('='*20)
print(task)

### Create the model and configure its parameters.
# (Use mz.models.list_available() to list all available models.)
modeltype = 'MVLSTM'
model = mz.models.MVLSTM()
model.params['lstm_units'] = 32
model.params['top_k'] = 50
model.params['mlp_num_layers'] = 2
model.params['mlp_num_units'] = 20
model.params['mlp_num_fan_out'] = 10
model.params['mlp_activation_func'] = 'relu'
model.params['dropout_rate'] = 0.5
model.guess_and_fill_missing_params(verbose=0)
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()

### Train, evaluate, predict.
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))

# Dump test-set predictions to a timestamped CSV under the result dir.
output_csv = pd.DataFrame()
pred = model.predict(test_x)
output_csv["pred"] = list(pred)
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result', modeltype)
logdir1 = os.path.join('outputs', 'model', modeltype)
mkdir(logdir)
output_csv.to_csv(os.path.join(logdir, stamp + '-pred.csv'))
mkdir(logdir1)

### Save the trained model.
model.save(os.path.join(logdir1, stamp))
print('保存成功')
# To load a saved model later:
# loaded_model = mz.load_model(os.path.join(logdir1, stamp))
HBMP
### Define the task. MatchZoo supports two task types: Ranking and Classification.
task = mz.tasks.Ranking()
print('='*20)
print(task)

### Create the model and configure its parameters.
# (Use mz.models.list_available() to list all available models.)
modeltype = 'HBMP'
model = mz.contrib.models.HBMP()
# NOTE: unlike the other sections, this one fills missing params *before*
# setting explicit values -- preserved as-is, since the explicit values
# below overwrite whatever was guessed.
model.guess_and_fill_missing_params(verbose=0)
model.params['embedding_input_dim'] = 200
model.params['embedding_output_dim'] = 100
model.params['embedding_trainable'] = True
model.params['alpha'] = 0.1
model.params['mlp_num_layers'] = 3
model.params['mlp_num_units'] = [10, 10]
model.params['lstm_num_units'] = 5
model.params['dropout_rate'] = 0.1
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()

### Train, evaluate, predict.
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))

# Dump test-set predictions to a timestamped CSV under the result dir.
output_csv = pd.DataFrame()
pred = model.predict(test_x)
output_csv["pred"] = list(pred)
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result', modeltype)
logdir1 = os.path.join('outputs', 'model', modeltype)
mkdir(logdir)
output_csv.to_csv(os.path.join(logdir, stamp + '-pred.csv'))
mkdir(logdir1)

### Save the trained model.
model.save(os.path.join(logdir1, stamp))
print('保存成功')
# To load a saved model later:
# loaded_model = mz.load_model(os.path.join(logdir1, stamp))
ArcI
### Define the task. MatchZoo supports two task types: Ranking and Classification.
task = mz.tasks.Ranking()
print('='*20)
print(task)

### Create the model and configure its parameters.
# (Use mz.models.list_available() to list all available models.)
modeltype = 'ArcI'
model = mz.models.ArcI()
model.params['num_blocks'] = 1
model.params['left_filters'] = [32]
model.params['right_filters'] = [32]
model.params['left_kernel_sizes'] = [3]
model.params['right_kernel_sizes'] = [3]
model.params['left_pool_sizes'] = [2]
model.params['right_pool_sizes'] = [4]
model.params['conv_activation_func'] = 'relu'
model.params['mlp_num_layers'] = 1
model.params['mlp_num_units'] = 64
model.params['mlp_num_fan_out'] = 32
model.params['mlp_activation_func'] = 'relu'
model.params['dropout_rate'] = 0.5
model.guess_and_fill_missing_params(verbose=0)
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()

### Train, evaluate, predict.
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))

# Dump test-set predictions to a timestamped CSV under the result dir.
output_csv = pd.DataFrame()
pred = model.predict(test_x)
output_csv["pred"] = list(pred)
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result', modeltype)
logdir1 = os.path.join('outputs', 'model', modeltype)
mkdir(logdir)
output_csv.to_csv(os.path.join(logdir, stamp + '-pred.csv'))
mkdir(logdir1)

### Save the trained model.
model.save(os.path.join(logdir1, stamp))
print('保存成功')
# To load a saved model later:
# loaded_model = mz.load_model(os.path.join(logdir1, stamp))
ConvKNRM
### Define the task. MatchZoo supports two task types: Ranking and Classification.
task = mz.tasks.Ranking()
print('='*20)
print(task)

### Create the model and configure its parameters.
# (Use mz.models.list_available() to list all available models.)
modeltype = 'ConvKNRM'
model = mz.models.ConvKNRM()
model.params['embedding_input_dim'] = 10000
model.params['embedding_output_dim'] = 300
model.params['embedding_trainable'] = True
model.params['filters'] = 128
model.params['conv_activation_func'] = 'tanh'
model.params['max_ngram'] = 3
model.params['use_crossmatch'] = True
model.params['kernel_num'] = 11
model.params['sigma'] = 0.1
model.params['exact_sigma'] = 0.001
model.guess_and_fill_missing_params(verbose=0)
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()

### Train, evaluate, predict.
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))

# Dump test-set predictions to a timestamped CSV under the result dir.
output_csv = pd.DataFrame()
pred = model.predict(test_x)
output_csv["pred"] = list(pred)
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result', modeltype)
logdir1 = os.path.join('outputs', 'model', modeltype)
mkdir(logdir)
output_csv.to_csv(os.path.join(logdir, stamp + '-pred.csv'))
mkdir(logdir1)

### Save the trained model.
model.save(os.path.join(logdir1, stamp))
print('保存成功')
# To load a saved model later:
# loaded_model = mz.load_model(os.path.join(logdir1, stamp))
DUET
### Define the task. MatchZoo supports two task types: Ranking and Classification.
task = mz.tasks.Ranking()
print('='*20)
print(task)

### Create the model and configure its parameters.
# (Use mz.models.list_available() to list all available models.)
modeltype = 'DUET'
model = mz.models.DUET()
model.params['embedding_input_dim'] = 1000
model.params['embedding_output_dim'] = 300
model.params['lm_filters'] = 32
model.params['lm_hidden_sizes'] = [64, 32]
model.params['dropout_rate'] = 0.5
model.params['dm_filters'] = 32
model.params['dm_kernel_size'] = 3
model.params['dm_d_mpool'] = 4
model.params['dm_hidden_sizes'] = [64, 32]
model.guess_and_fill_missing_params(verbose=0)
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()

### Train, evaluate, predict.
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))

# Dump test-set predictions to a timestamped CSV under the result dir.
output_csv = pd.DataFrame()
pred = model.predict(test_x)
output_csv["pred"] = list(pred)
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result', modeltype)
logdir1 = os.path.join('outputs', 'model', modeltype)
mkdir(logdir)
output_csv.to_csv(os.path.join(logdir, stamp + '-pred.csv'))
mkdir(logdir1)

### Save the trained model.
model.save(os.path.join(logdir1, stamp))
print('保存成功')
# To load a saved model later:
# loaded_model = mz.load_model(os.path.join(logdir1, stamp))
ESIM
### Define the task. MatchZoo supports two task types: Ranking and Classification.
task = mz.tasks.Ranking()
print('='*20)
print(task)

### Create the model and configure its parameters.
# (Use mz.models.list_available() to list all available models.)
modeltype = 'ESIM'
model = mz.contrib.models.ESIM()
model.params['task'] = task
model.params['input_shapes'] = [(15, ), (15, )]
model.params['lstm_dim'] = 300
model.params['mlp_num_units'] = 300
model.params['embedding_input_dim'] = 5000
model.params['embedding_output_dim'] = 10
model.params['embedding_trainable'] = False
model.params['mlp_num_layers'] = 0
model.params['mlp_num_fan_out'] = 300
model.params['mlp_activation_func'] = 'tanh'
model.params['mask_value'] = 0
model.params['dropout_rate'] = 0.5
# BUG FIX: the original used `K.optimizers.Adam(lr=4e-4)`, but `K` is
# rebound to keras.backend by the later `import keras.backend as K`
# (which has no `optimizers` attribute), so that line raises
# AttributeError. Use the Adam class imported from keras.optimizers.
model.params['optimizer'] = Adam(lr=4e-4)
model.guess_and_fill_missing_params()
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()

### Train, evaluate, predict.
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))

# Dump test-set predictions to a timestamped CSV under the result dir.
output_csv = pd.DataFrame()
pred = model.predict(test_x)
output_csv["pred"] = list(pred)
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result', modeltype)
logdir1 = os.path.join('outputs', 'model', modeltype)
mkdir(logdir)
output_csv.to_csv(os.path.join(logdir, stamp + '-pred.csv'))
mkdir(logdir1)

### Save the trained model.
model.save(os.path.join(logdir1, stamp))
print('保存成功')
# To load a saved model later:
# loaded_model = mz.load_model(os.path.join(logdir1, stamp))
MatchLSTM
### Define the task. MatchZoo supports two task types: Ranking and Classification.
task = mz.tasks.Ranking()
print('='*20)
print(task)

### Create the model and configure its parameters.
# (Use mz.models.list_available() to list all available models.)
modeltype = 'MatchLSTM'
model = mz.contrib.models.MatchLSTM()
# Fill missing params first, then overwrite with explicit values
# (order preserved from the original).
model.guess_and_fill_missing_params(verbose=0)
model.params['embedding_input_dim'] = 10000
model.params['embedding_output_dim'] = 100
model.params['embedding_trainable'] = True
model.params['fc_num_units'] = 200
model.params['lstm_num_units'] = 256
model.params['dropout_rate'] = 0.5
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()

### Train, evaluate, predict.
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))

# Dump test-set predictions to a timestamped CSV under the result dir.
output_csv = pd.DataFrame()
pred = model.predict(test_x)
output_csv["pred"] = list(pred)
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result', modeltype)
logdir1 = os.path.join('outputs', 'model', modeltype)
mkdir(logdir)
output_csv.to_csv(os.path.join(logdir, stamp + '-pred.csv'))
mkdir(logdir1)

### Save the trained model.
model.save(os.path.join(logdir1, stamp))
print('保存成功')
# To load a saved model later:
# loaded_model = mz.load_model(os.path.join(logdir1, stamp))
MatchSRNN
### Define the task. MatchZoo supports two task types: Ranking and Classification.
task = mz.tasks.Ranking()
print('='*20)
print(task)

### Create the model and configure its parameters.
# (Use mz.models.list_available() to list all available models.)
modeltype = 'MatchSRNN'
model = mz.contrib.models.MatchSRNN()
model.params['channels'] = 4
model.params['units'] = 10
model.params['dropout_rate'] = 0.0
model.params['direction'] = 'lt'
model.guess_and_fill_missing_params(verbose=0)
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()

### Train, evaluate, predict.
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))

# Dump test-set predictions to a timestamped CSV under the result dir.
output_csv = pd.DataFrame()
pred = model.predict(test_x)
output_csv["pred"] = list(pred)
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result', modeltype)
logdir1 = os.path.join('outputs', 'model', modeltype)
mkdir(logdir)
output_csv.to_csv(os.path.join(logdir, stamp + '-pred.csv'))
mkdir(logdir1)

### Save the trained model.
model.save(os.path.join(logdir1, stamp))
print('保存成功')
# To load a saved model later:
# loaded_model = mz.load_model(os.path.join(logdir1, stamp))
BiMPM
### Define the task. MatchZoo supports two task types: Ranking and Classification.
task = mz.tasks.Ranking()
print('='*20)
print(task)

### Create the model and configure its parameters.
# (Use mz.models.list_available() to list all available models.)
# BiMPM is run entirely with guessed/default parameters.
modeltype = 'BiMPM'
model = mz.contrib.models.BiMPM()
model.guess_and_fill_missing_params(verbose=0)
model.params.update(preprocessor.context)
model.params.completed()
model.build()
model.compile()
model.backend.summary()

### Train, evaluate, predict.
x, y = train_pack_processed.unpack()
test_x, test_y = test_pack_processed.unpack()
model.fit(x, y, batch_size=32, epochs=10)
print(model.evaluate(dev_x, dev_y))

# Dump test-set predictions to a timestamped CSV under the result dir.
output_csv = pd.DataFrame()
pred = model.predict(test_x)
output_csv["pred"] = list(pred)
stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join('outputs', 'result', modeltype)
logdir1 = os.path.join('outputs', 'model', modeltype)
mkdir(logdir)
output_csv.to_csv(os.path.join(logdir, stamp + '-pred.csv'))
mkdir(logdir1)

### Save the trained model.
model.save(os.path.join(logdir1, stamp))
print('保存成功')
# To load a saved model later:
# loaded_model = mz.load_model(os.path.join(logdir1, stamp))