def gen_id2corpus(corpus_file):
    """Load a corpus file into a ``{line_index: text}`` mapping.

    Args:
        corpus_file: Path of a UTF-8 text file with one corpus entry per line.

    Returns:
        A dict mapping each zero-based line number to that line with its
        trailing whitespace (including the newline) stripped.
    """
    with open(corpus_file, "r", encoding="utf-8") as f:
        return {idx: line.rstrip() for idx, line in enumerate(f)}
# Corpus to index, loaded as {line_index: text}.
corpus_file = "./datasets/zwqa/corpus.csv"
id2corpus = gen_id2corpus(corpus_file)
# One single-entry {id: text} dict per corpus row, in file order.
corpus_list = [{key: id2corpus[key]} for key in id2corpus]

# Model (.pdmodel) and parameter (.pdiparams) files handed to paddle.inference.
model_dir = "./output"
model_file = f"{model_dir}/inference.get_pooled_embedding.pdmodel"
params_file = f"{model_dir}/inference.get_pooled_embedding.pdiparams"
import os
import paddle
# Fail early with a clear message if either model artifact is missing.
if not os.path.exists(model_file):
    raise ValueError(f"not find model file path {model_file}")
if not os.path.exists(params_file):
    raise ValueError(f"not find params file path {params_file}")

# Inference configuration built from the model/params file pair.
config = paddle.inference.Config(model_file, params_file)
from paddle import inference
# Requested numeric precision; must be one of the keys of precision_map.
precision = "fp32"
# NOTE(review): presumably the width of the pooled embedding produced by the
# exported model; not used in this section -- confirm against later code.
output_emb_size = 256

# Run on GPU: 100 MB initial memory pool, device id 0.
config.enable_use_gpu(100, 0)

precision_map = {
    "fp16": inference.PrecisionType.Half,
    "fp32": inference.PrecisionType.Float32,
    "int8": inference.PrecisionType.Int8,
}
precision_mode = precision_map[precision]
batch_size = 32

# Optional TensorRT acceleration, intentionally left disabled:
# config.enable_tensorrt_engine(max_batch_size=batch_size,
#                               min_subgraph_size=30,
#                               precision_mode=precision_mode)

# Turn off feed/fetch ops so inputs and outputs are exchanged through the
# tensor handles obtained below.
config.switch_use_feed_fetch_ops(False)
predictor = paddle.inference.create_predictor(config)

# Input names can only be queried once the predictor exists.
input_names = predictor.get_input_names()
input_handles = [predictor.get_input_handle(name) for name in input_names]
# Only the first model output is read.
output_handle = predictor.get_output_handle(predictor.get_output_names()[0])
from paddlenlp.data import Pad, Tuple
from paddlenlp.transformers import AutoTokenizer
# Tokenizer loaded by pretrained-model name; its pad ids drive batch padding.
tokenizer = AutoTokenizer.from_pretrained("ernie-3.0-medium-zh")
def batchify_fn(
    samples,
    fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # input ids
        # Segment ids are padded with the token-type pad value rather than the
        # vocabulary pad id.
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, dtype="int64"),  # segment ids
    ),
):
    """Collate a list of samples into padded batch fields.

    Args:
        samples: Sequence of two-field samples; presumably
            ``(input_ids, token_type_ids)`` pairs -- verify against the
            code that builds them.
        fn: Collator applied to ``samples``. The default is constructed once,
            when the function is defined, and pads each of the two fields
            along axis 0 as int64.

    Returns:
        Whatever ``fn(samples)`` returns.
    """
    return fn(samples)
# Preview of the first three entries. This is the indexed text set: it
# corresponds to the questions in the Q&A pairs and is used to build the
# standard-question semantic library.
corpus_list[:3]
def convert_example(example, tokenizer, max_seq_length=512