仓库地址:https://github.com/DengBoCong/text-similarity
Overview
Dataset: 中文/English 语料, ☞ 点这里 Paper: 相关论文详解, ☞ 点这里 The implemented methods are as follows:
TF-IDF, BM25, LSH, SIF/uSIF, FastText, RNN Base (Siamese RNN, Stack RNN), CNN Base (Fast Text, Text CNN, Char CNN, VDCNN), Bert Base, Albert, NEZHA, RoBERTa, SimCSE, Poly-Encoder, ColBERT, RE2 (Simple-Effective-Text-Matching)
Usages
1:examples目录下有不同模型对应的 preprocess/train/evaluate 代码,可自行修改
2:如下示例从examples中引入actuator方法,准备好对应的模型配置文件即可执行
3:examples目录下的inference.py为训练好的模型推理代码
TF-IDF
from examples. run_tfidf_sklearn import actuator
actuator( "./corpus/chinese/breeno/train.tsv" , query1= "12 23 4160 276" , query2= "29 23 169 1495" )
from examples. run_tfidf import actuator
actuator( "./corpus/chinese/breeno/train.tsv" , query1= "12 23 4160 276" , query2= "29 23 169 1495" )
from sim. tf_idf import TFIdf
tokens_list = [ "这是 一个 什么 样 的 工具" , "..." ]
query = [ "非常 好用 的 工具" ]
tf_idf = TFIdf( tokens_list, split= " " )
print ( tf_idf. get_score( query, 0 ) )
print ( tf_idf. get_score_list( query, 10 ) )
print ( tf_idf. weight( ) )
BM25
from examples. run_bm25 import actuator
actuator( "./corpus/chinese/breeno/train.tsv" , query1= "12 23 4160 276" , query2= "29 23 169 1495" )
from sim. bm25 import BM25
tokens_list = [ "这是 一个 什么 样 的 工具" , "..." ]
query = [ "非常 好用 的 工具" ]
bm25 = BM25( tokens_list, split= " " )
print ( bm25. get_score( query, 0 ) )
print ( bm25. get_score_list( query, 10 ) )
print ( bm25. weight( ) )
LSH
from sim. lsh import E2LSH
from sim. lsh import MinHash
e2lsh = E2LSH( )
min_hash = MinHash( )
candidates = [ [ 3.6216 , 8.6661 , - 2.8073 , - 0.44699 , 0 ] , . . . ]
query = [ - 2.7769 , - 5.6967 , 5.9179 , 0.37671 , 1 ]
print ( e2lsh. search( candidates, query) )
print ( min_hash. search( candidates, query) )
SIF
sentences = [ [ "token1" , "token2" , "..." ] , . . . ]
vector = [ [ [ 1 , 1 , 1 ] , [ 2 , 2 , 2 ] , [ . . . ] ] , . . . ]
from sim. sif_usif import SIF
from sim. sif_usif import uSIF
sif = SIF( n_components= 5 , component_type= "svd" )
sif. fit( tokens_list= sentences, vector_list= vector)
usif = uSIF( n_components= 5 , n= 1 , component_type= "svd" )
usif. fit( tokens_list= sentences, vector_list= vector)
FastText
from examples. tensorflow. run_fast_text import actuator
actuator( execute_type= "train" , model_type= "bert" , model_dir= "./data/chinese_wwm_L-12_H-768_A-12" )
from examples. pytorch. run_fast_text import actuator
actuator( execute_type= "train" , model_type= "bert" , model_dir= "./data/chinese_wwm_pytorch" )
RNN Base
from examples. tensorflow. run_siamese_rnn import actuator
actuator( "./data/config/siamse_rnn.json" , execute_type= "train" )
from examples. pytorch. run_siamese_rnn import actuator
actuator( "./data/config/siamse_rnn.json" , execute_type= "train" )
CNN Base
from examples. tensorflow. run_cnn_base import actuator
actuator( execute_type= "train" , model_type= "bert" , model_dir= "./data/chinese_wwm_L-12_H-768_A-12" )
from examples. pytorch. run_cnn_base import actuator
actuator( execute_type= "train" , model_type= "bert" , model_dir= "./data/chinese_wwm_pytorch" )
Bert Base
from examples. tensorflow. run_basic_bert import actuator
actuator( model_dir= "./data/chinese_wwm_L-12_H-768_A-12" , execute_type= "train" )
from examples. pytorch. run_basic_bert import actuator
actuator( model_dir= "./data/chinese_wwm_pytorch" , execute_type= "train" )
Albert
from examples. tensorflow. run_albert import actuator
actuator( model_dir= "./data/albert_small_zh_google" , execute_type= "train" )
from examples. pytorch. run_albert import actuator
actuator( model_dir= "./data/albert_chinese_small" , execute_type= "train" )
NEZHA
from examples. tensorflow. run_nezha import actuator
actuator( model_dir= "./data/NEZHA-Base-WWM" , execute_type= "train" )
from examples. pytorch. run_nezha import actuator
actuator( model_dir= "./data/nezha-base-wwm" , execute_type= "train" )
RoBERTa
from examples. tensorflow. run_basic_bert import actuator
actuator( model_dir= "./data/chinese_roberta_L-6_H-384_A-12" , execute_type= "train" )
from examples. pytorch. run_basic_bert import actuator
actuator( model_dir= "./data/chinese-roberta-wwm-ext" , execute_type= "train" )
SimCSE
from examples. tensorflow. run_simcse import actuator
actuator( model_dir= "./data/chinese_wwm_L-12_H-768_A-12" , execute_type= "train" , model_type= "bert" )
from examples. pytorch. run_simcse import actuator
actuator( model_dir= "./data/chinese_wwm_pytorch" , execute_type= "train" , model_type= "bert" )
Poly-Encoder
from examples. tensorflow. run_poly_encoder import actuator
actuator( model_dir= "./data/chinese_wwm_L-12_H-768_A-12" , execute_type= "train" , model_type= "bert" )
from examples. pytorch. run_poly_encoder import actuator
actuator( model_dir= "./data/chinese_wwm_pytorch" , execute_type= "train" , model_type= "bert" )
ColBERT
from examples. tensorflow. run_colbert import actuator
actuator( model_dir= "./data/chinese_wwm_L-12_H-768_A-12" , execute_type= "train" , model_type= "bert" )
from examples. pytorch. run_colbert import actuator
actuator( model_dir= "./data/chinese_wwm_pytorch" , execute_type= "train" , model_type= "bert" )
RE2
from examples. tensorflow. run_re2 import actuator
actuator( "./data/config/re2.json" , execute_type= "train" )
from examples. pytorch. run_re2 import actuator
actuator( "./data/config/re2.json" , execute_type= "train" )