# llama
# 模型加载 (model loading):
import torch
import transformers
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer
# Placeholder locations: set these to the base LLaMA checkpoint and to the
# LoRA adapter weights before running.
base_model = 'dictionary_to_llama'
lora_weights = 'dictionary_to_lora'

# The tokenizer is loaded from the same location as the base model.
tokenizer = LlamaTokenizer.from_pretrained(base_model)

# Base model in fp16 (no 8-bit quantisation); device placement is delegated
# to the loader via device_map="auto".
model = LlamaForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=False,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Wrap the base model with the LoRA adapter weights.
model = PeftModel.from_pretrained(
    model,
    lora_weights,
    torch_dtype=torch.float16,
)

# This file only runs generation, so put the wrapped model in eval mode
# explicitly; this disables dropout layers regardless of whether the
# installed PEFT version already does so when loading an adapter.
model.eval()
# 推理1 (inference 1):
def inference(inputs='How to craft a diamond pickaxe in Minecraft?',
              max_new_tokens=512,
              do_sample=True,
              top_p=0.75,
              temperature=0.9,
              repetition_penalty=1,
              eos_token_id=2,
              bos_token_id=1,
              pad_token_id=0
              ):
    """Generate text for a single prompt with the module-level model.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        inputs: Prompt string to tokenize and feed to the model.
        max_new_tokens: Forwarded to ``model.generate``.
        do_sample: Forwarded to ``model.generate``.
        top_p: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
        repetition_penalty: Forwarded to ``model.generate``.
        eos_token_id: Forwarded to ``model.generate``.
        bos_token_id: Forwarded to ``model.generate``.
        pad_token_id: Forwarded to ``model.generate``.

    Returns:
        The first decoded sequence returned by ``model.generate``. It is
        decoded without ``skip_special_tokens``, so special tokens are kept
        in the string.
    """
    encoded = tokenizer(inputs, return_tensors="pt")
    # `device` was never defined in this file; take the target device from
    # the model itself so the inputs land where the model expects them.
    target_device = model.device
    input_ids = encoded.input_ids.to(target_device)
    attention_mask = encoded.attention_mask.to(target_device)

    # Generation needs no gradients; skipping autograd bookkeeping saves memory.
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            # Passed explicitly so generate() does not have to infer the
            # mask from pad_token_id.
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=do_sample,
            repetition_penalty=repetition_penalty,
            eos_token_id=eos_token_id,
            bos_token_id=bos_token_id,
            pad_token_id=pad_token_id
        )
    output = tokenizer.batch_decode(generation_output)[0]
    return output


# Quick check: run the helper with its default prompt and print the result.
print(inference())
# 推理2 (inference 2):
# Add stopping criteria: text sequences that should end generation.
stop_list = ['\nHuman:', '\n```\n']
# add_special_tokens=False keeps the tokenizer from adding special tokens
# (e.g. a leading BOS) to each stop sequence; a sequence that starts with BOS
# can never equal the tail of generated text, so the criterion would never fire.
# NOTE(review): a stop string may still tokenize differently in isolation than
# it does mid-text; verify the ids against real model output.
stop_token_ids = [
    tokenizer(x, add_special_tokens=False)['input_ids'] for x in stop_list
]
# `device` was never defined in this file; place the ids on the model's device
# so they can be compared against the generated ids.
stop_token_ids = [torch.LongTensor(x).to(model.device) for x in stop_token_ids]
# Bare expression kept from the original: echoes the value in a notebook cell.
stop_token_ids
from transformers import StoppingCriteria, StoppingCriteriaList
# Custom stopping criterion built on the module-level `stop_token_ids`.
class StopOnTokens(StoppingCriteria):
    """Stop generation once the sequence ends with any stop-token sequence.

    Reads the module-level ``stop_token_ids`` (an iterable of 1-D id tensors)
    and inspects only the first sequence in the batch (``input_ids[0]``).
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the tail of ``input_ids[0]`` equals a stop sequence.

        Args:
            input_ids: Token ids generated so far; only row 0 is checked.
            scores: Unused; accepted to match the StoppingCriteria call signature.
            **kwargs: Unused.

        Returns:
            True if any stop sequence matches the end of the first sequence,
            otherwise False.
        """
        generated = input_ids[0]
        for stop_ids in stop_token_ids:
            stop_len = len(stop_ids)
            # Skip empty stop sequences and ones longer than the text so far.
            # Without this guard the element-wise comparison either raises on
            # mismatched lengths or broadcasts a single token into a false match.
            if stop_len == 0 or stop_len > generated.shape[-1]:
                continue
            # .to() is a no-op when the devices already agree.
            if torch.equal(generated[-stop_len:], stop_ids.to(generated.device)):
                return True
        return False
# Wrap the custom criterion in the list type the generation API expects.
stopping_criteria = StoppingCriteriaList([StopOnTokens()])

# NOTE(review): samp, temp, rep, top_k, top_p and nb are not defined anywhere
# in the visible source; they must be assigned before this statement runs.
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    do_sample=samp,
    stopping_criteria=stopping_criteria,  # without this model rambles during chat
    temperature=temp,  # 'randomness' of outputs; must be > 0, lower is more deterministic
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=rep,  # without this output begins repeating
    top_k=top_k,
    top_p=top_p,
    num_beams=nb,
    # No `device` argument: the model above is loaded with device_map="auto",
    # so it is already placed on its device(s), and the pipeline must not try
    # to move it. (The original `device=‘cuda’` also used typographic quotes,
    # which is a syntax error.)
)

# Run the pipeline once on a sample prompt.
query = 'How to craft a diamond pickaxe in Minecraft?'
output = generate_text(query)
# phi
# 推理 (inference):
def inference(inputs='How to craft a diamond pickaxe in Minecraft?',
              max_new_tokens=200,
              do_sample=True,
              top_p=0.75,
              temperature=0.01,
              repetition_penalty=1.2,
              eos_token_id=50256,  # EOS id differs from the LLaMA helper (2 there)
              bos_token_id=1,
              pad_token_id=0
              ):
    """Generate text for a single prompt (phi variant of the helper).

    Uses the module-level ``tokenizer`` and ``model``, which are expected to
    be the phi tokenizer and model when this variant is used.

    NOTE(review): ``bos_token_id=1`` and ``pad_token_id=0`` are the same
    values the LLaMA helper uses; only the EOS id was changed. Confirm they
    are valid for the phi tokenizer.

    Args:
        inputs: Prompt string to tokenize and feed to the model.
        max_new_tokens: Forwarded to ``model.generate``.
        do_sample: Forwarded to ``model.generate``.
        top_p: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
        repetition_penalty: Forwarded to ``model.generate``.
        eos_token_id: Forwarded to ``model.generate``.
        bos_token_id: Forwarded to ``model.generate``.
        pad_token_id: Forwarded to ``model.generate``.

    Returns:
        The first decoded sequence with every ``"<|endoftext|>"`` marker
        removed.
    """
    encoded = tokenizer(inputs, return_tensors="pt")
    # `device` was never defined in this file; take the target device from
    # the model itself so the inputs land where the model expects them.
    target_device = model.device
    input_ids = encoded.input_ids.to(target_device)
    attention_mask = encoded.attention_mask.to(target_device)

    # Generation needs no gradients; skipping autograd bookkeeping saves memory.
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=input_ids,
            # Passed explicitly so generate() does not have to infer the
            # mask from pad_token_id.
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=do_sample,
            repetition_penalty=repetition_penalty,
            eos_token_id=eos_token_id,
            bos_token_id=bos_token_id,
            pad_token_id=pad_token_id
        )
    output = tokenizer.batch_decode(generation_output)[0]
    # Strip the end-of-text marker left in the decoded string.
    return output.replace("<|endoftext|>","")