LLM部署

llama

模型加载:


import torch
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer

# Placeholder locations: replace these with the path (or hub id) of the base
# LLaMA checkpoint and of the LoRA adapter weights before running.
base_model = 'dictionary_to_llama'
lora_weights = 'dictionary_to_lora'

tokenizer = LlamaTokenizer.from_pretrained(base_model)

# Load the base model in half precision without 8-bit quantization;
# device placement is delegated to the library via device_map="auto".
model = LlamaForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=False,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Wrap the base model with the LoRA adapter weights.
model = PeftModel.from_pretrained(
    model,
    lora_weights,
    torch_dtype=torch.float16,
)

# This script only runs inference: switch to eval mode so that dropout
# layers (including any dropout configured on the adapter) are disabled.
model.eval()

推理1:

def inference(inputs='How to craft a diamond pickaxe in Minecraft?',
              max_new_tokens=512,
              do_sample=True,
              top_p=0.75,
              temperature=0.9,
              repetition_penalty=1,
              eos_token_id=2,
              bos_token_id=1,
              pad_token_id=0):
    """Generate text for ``inputs`` with the module-level ``model``/``tokenizer``.

    Args:
        inputs: Prompt text passed to the tokenizer.
        max_new_tokens: Forwarded to ``model.generate``.
        do_sample: Forwarded to ``model.generate``.
        top_p: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
        repetition_penalty: Forwarded to ``model.generate``.
        eos_token_id: Forwarded to ``model.generate``.
        bos_token_id: Forwarded to ``model.generate``.
        pad_token_id: Forwarded to ``model.generate``.

    Returns:
        The first sequence returned by ``model.generate``, decoded with
        ``tokenizer.batch_decode`` (special tokens are not stripped).
    """
    # No `device` name is defined in this script, so move the prompt ids to
    # the device the model itself reports instead of an undefined global.
    input_ids = tokenizer(inputs, return_tensors="pt").input_ids.to(model.device)

    generation_output = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=do_sample,
        repetition_penalty=repetition_penalty,
        eos_token_id=eos_token_id,
        bos_token_id=bos_token_id,
        pad_token_id=pad_token_id,
    )

    # Only one prompt is submitted, so only the first decoded sequence is returned.
    return tokenizer.batch_decode(generation_output)[0]

# Demo run: generate with every default argument and show the decoded text.
sample_output = inference()
print(sample_output)

推理2:

# Stopping criteria: generation should end once the model emits one of these strings.
stop_list = ['\nHuman:', '\n```\n']

# Encode the stop strings WITHOUT special tokens. With the defaults the
# tokenizer prepends BOS to every encoding, and a stop sequence that starts
# with BOS can never equal the tail of the generated ids.
# NOTE(review): the tokenizer may still emit a leading-space piece for each
# string; print the ids below and confirm they match what the model produces.
# The tensors are placed on model.device because no `device` name is defined
# anywhere in this script.
stop_token_ids = [
    torch.LongTensor(tokenizer(x, add_special_tokens=False)['input_ids']).to(model.device)
    for x in stop_list
]
# Notebook-style display of the encoded ids (a no-op when run as a script).
stop_token_ids

from transformers import StoppingCriteria, StoppingCriteriaList

# define custom stopping criteria object
# Custom stopping-criteria object.
class StopOnTokens(StoppingCriteria):
    """Stop generation when the sequence ends with one of ``stop_token_ids``.

    Reads the module-level ``stop_token_ids`` (a list of 1-D id tensors).
    Only the first sequence of the batch is inspected.
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True if the tail of ``input_ids[0]`` equals any stop sequence."""
        generated = input_ids[0]
        for stop_ids in stop_token_ids:
            stop_len = len(stop_ids)
            # Guard the slice: an empty stop sequence would make `-0` select
            # the whole tensor, and a stop sequence longer than the generated
            # ids would yield mismatched shapes (an error or a silent broadcast).
            if stop_len == 0 or stop_len > len(generated):
                continue
            # torch.equal compares size and contents and returns a plain bool;
            # .to() is a no-op when both tensors already share a device.
            if torch.equal(generated[-stop_len:], stop_ids.to(generated.device)):
                return True
        return False

# `transformers.pipeline` is referenced through the module name, which the
# imports earlier in this file never bind, so import the module here.
import transformers

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

# Sampling hyper-parameters. The original snippet used these names without
# defining them; the values below are example settings only — tune them.
samp = True    # do_sample
temp = 0.9     # temperature
rep = 1.1      # repetition_penalty
top_k = 50
top_p = 0.75
nb = 1         # num_beams

generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    do_sample=samp,
    stopping_criteria=stopping_criteria,  # without this model rambles during chat
    temperature=temp,  # 'randomness' of outputs; must be > 0 when sampling, higher is more random
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=rep,  # without this output begins repeating
    top_k=top_k,
    top_p=top_p,
    num_beams=nb,
    # No `device=` argument: the original line used typographic quotes
    # (a syntax error), and the model above is loaded with device_map="auto",
    # so it is already placed on its device(s) and should not be moved again.
)

query = 'How to craft a diamond pickaxe in Minecraft?'
output = generate_text(query)

phi

推理

def inference(inputs='How to craft a diamond pickaxe in Minecraft?',
              max_new_tokens=200,
              do_sample=True,
              top_p=0.75,
              temperature=0.01,
              repetition_penalty=1.2,
              eos_token_id=50256,  # EOS id differs from the LLaMA version
              bos_token_id=1,
              pad_token_id=0):
    """Generate text for ``inputs`` with the module-level ``model``/``tokenizer``.

    Args:
        inputs: Prompt text passed to the tokenizer.
        max_new_tokens: Forwarded to ``model.generate``.
        do_sample: Forwarded to ``model.generate``.
        top_p: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
        repetition_penalty: Forwarded to ``model.generate``.
        eos_token_id: Forwarded to ``model.generate``.
        bos_token_id: Forwarded to ``model.generate``.
        pad_token_id: Forwarded to ``model.generate``.

    Returns:
        The first decoded sequence with every ``"<|endoftext|>"`` marker removed.
    """
    # NOTE(review): bos_token_id=1 and pad_token_id=0 are the same values used
    # in the LLaMA snippet; confirm they are correct for this tokenizer.

    # No `device` name is defined in this script, so move the prompt ids to
    # the device the model itself reports instead of an undefined global.
    input_ids = tokenizer(inputs, return_tensors="pt").input_ids.to(model.device)

    generation_output = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=do_sample,
        repetition_penalty=repetition_penalty,
        eos_token_id=eos_token_id,
        bos_token_id=bos_token_id,
        pad_token_id=pad_token_id,
    )

    # Only one prompt is submitted, so only the first decoded sequence is used.
    output = tokenizer.batch_decode(generation_output)[0]
    # Strip the end-of-text marker that batch_decode leaves in the string.
    return output.replace("<|endoftext|>", "")
  • 9
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值