在几个公众号中都看到了对 MatchZoo 的推荐。它是一个通用的文本匹配工具包,主要实现了几种最新的深度学习文本匹配模型。本文记录的是截至撰写本篇博客时,自己研究 MatchZoo 中文支持的笔记。原 GitHub 地址:https://github.com/NTMC-Community/MatchZoo
以github的tutorials为例:
import matchzoo as mz
# Create the ranking task; it is handed to the data loader below.
task = mz.tasks.Ranking()
print(task)

# `qa` is a package newly created under `mz.datasets` that holds the
# Chinese data. Load the train split first, then the test split.
train_raw, test_raw = (
    mz.datasets.qa.load_data(stage=stage, task=task)
    for stage in ('train', 'test')
)

# Preview `left`, `right` and `relation` of the training data, in that order.
for part_name in ('left', 'right', 'relation'):
    print(getattr(train_raw, part_name).head())

# Preview of the full frame; the data format is shown in figure 3 of the post.
print(train_raw.frame().head())
# Load the word2vec word vectors referenced by `EMBED_CPWS`.
embedding_source = mz.datasets.embeddings.EMBED_CPWS
emb = mz.embedding.load_from_file(embedding_source, mode='word2vec')

# Let `mz.auto.prepare` set up the ArcI model together with its
# preprocessor, data generator builder and embedding matrix.
model_class = mz.models.ArcI
model, preprocessor, data_generator_builder, embedding_matrix = (
    mz.auto.prepare(
        task=task,
        model_class=model_class,
        data_pack=train_raw,
        embedding=emb,
    )
)

# Show the tunable model parameters, then adjust one by direct assignment.
print(model.params)
model.params['mlp_num_units'] = 3

# Dump the type and contents of the prepared embedding matrix.
print(
    "embedding_matrix: \n",
    type(embedding_matrix),
    '\n',
    embedding_matrix,
)
# The preprocessing units of the preprocessor can be replaced by assigning
# to `_units` directly. Here: `tokenize_ch.Tokenize` (presumably the Chinese
# tokenizer -- confirm) followed by punctuation removal. The Lowercase unit
# (mz.preprocessors.units.lowercase.Lowercase) is left disabled.
ch_tokenizer = mz.preprocessors.units.tokenize_ch.Tokenize()
punc_remover = mz.preprocessors.units.punc_removal.PuncRemoval()
preprocessor._units = [ch_tokenizer, punc_remover]

# NOTE(review): preprocessor.fit(train_raw) is not re-run after the units
# are swapped -- confirm this is intended.
train_processed, test_processed = (
    preprocessor.transform(raw_pack, verbose=0)
    for raw_pack in (train_raw, test_raw)
)
#