LLM部署

Wanderer X

已于 2024-01-20 19:29:55 修改

阅读量725

点赞数 9

分类专栏：代码 | 文章标签：深度学习、人工智能、机器学习

于 2023-12-18 22:22:44 首次发布

本文链接：https://blog.csdn.net/wandererxx/article/details/135073006

版权

代码专栏收录该内容

6 篇文章 0 订阅

订阅专栏

llama

模型加载：


import torch
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer

# Placeholder paths: point these at the LLaMA base checkpoint and at the LoRA
# adapter before running.
base_model = 'dictionary_to_llama'
lora_weights = 'dictionary_to_lora'

tokenizer = LlamaTokenizer.from_pretrained(base_model)

# Load the base model in fp16 with 8-bit loading disabled; device placement
# is delegated to device_map="auto".
model = LlamaForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=False,
    torch_dtype=torch.float16,
    device_map="auto",
)
# Wrap the base model with the LoRA adapter weights.
model = PeftModel.from_pretrained(
    model,
    lora_weights,
    torch_dtype=torch.float16,
)

# The inference snippets in this file move their inputs to `device`, which was
# never defined; take it from the loaded model so inputs and weights agree.
device = model.device

推理1：

def inference(inputs: str = 'How to craft a diamond pickaxe in Minecraft?',
              max_new_tokens: int = 512,
              do_sample: bool = True,
              top_p: float = 0.75,
              temperature: float = 0.9,
              repetition_penalty: float = 1,
              eos_token_id: int = 2,
              bos_token_id: int = 1,
              pad_token_id: int = 0,
              ) -> str:
    """Generate text for ``inputs`` with the module-level ``model``/``tokenizer``.

    Args:
        inputs: Prompt text to tokenize and feed to the model.
        max_new_tokens: Forwarded to ``model.generate``.
        do_sample: Forwarded to ``model.generate``.
        top_p: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
        repetition_penalty: Forwarded to ``model.generate``.
        eos_token_id: Forwarded to ``model.generate``.
        bos_token_id: Forwarded to ``model.generate``.
        pad_token_id: Forwarded to ``model.generate``.

    Returns:
        The first sequence returned by ``model.generate``, decoded with
        ``tokenizer.batch_decode`` (special tokens are not stripped).
    """
    # The original referenced an undefined `device`; send the prompt to the
    # device the model itself reports instead.
    input_ids = tokenizer(inputs, return_tensors="pt").input_ids.to(model.device)
    generation_output = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=do_sample,
        repetition_penalty=repetition_penalty,
        eos_token_id=eos_token_id,
        bos_token_id=bos_token_id,
        pad_token_id=pad_token_id,
    )

    # Only one prompt is submitted, so only the first decoded sequence is used.
    output = tokenizer.batch_decode(generation_output)[0]
    return output


print(inference())

推理2：

# 添加停止准则
# Text sequences that should end generation when they appear at the end of the
# generated output.
stop_list = ['\nHuman:', '\n```\n']
# add_special_tokens=False keeps the tokenizer from prepending special tokens
# (e.g. BOS) to each stop sequence; with them included, the tail comparison in
# the stopping criterion could never match mid-generation.
# NOTE(review): a SentencePiece tokenizer may still emit a leading
# prefix-space token here -- inspect the ids printed below and trim if needed.
stop_token_ids = [
    tokenizer(x, add_special_tokens=False)['input_ids'] for x in stop_list
]

# The original moved these to an undefined `device`; use the model's device.
stop_token_ids = [torch.LongTensor(x).to(model.device) for x in stop_token_ids]
# Bare expression: displays the ids when run as a notebook cell.
stop_token_ids

from transformers import StoppingCriteria, StoppingCriteriaList

# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    """Stopping criterion that fires when the output ends with a stop sequence.

    Reads the module-level ``stop_token_ids`` (a list of 1-D id tensors) and
    compares each against the tail of the first sequence in the batch.
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True if the first sequence ends with any stop sequence.

        Args:
            input_ids: Batch of token ids generated so far; only row 0 is
                inspected.
            scores: Unused.
            **kwargs: Unused.
        """
        sequence = input_ids[0]
        for stop_ids in stop_token_ids:
            stop_len = len(stop_ids)
            # Skip empty stop sequences and ones longer than what has been
            # generated so far: slicing would yield a tensor of a different
            # length and torch.eq would fail on the shape mismatch.
            if stop_len == 0 or sequence.shape[0] < stop_len:
                continue
            tail = sequence[-stop_len:]
            # Compare on the same device as the generated ids.
            if bool(torch.eq(tail, stop_ids.to(tail.device)).all()):
                return True
        return False

# The file only imports individual names *from* transformers, so the module
# itself has to be imported before `transformers.pipeline` can be called.
import transformers

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

# NOTE(review): samp, temp, rep, top_k, top_p and nb are not defined anywhere
# in the visible file; they must be assigned before this call runs.
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    do_sample=samp,
    stopping_criteria=stopping_criteria,  # without this model rambles during chat
    temperature=temp,  # 'randomness' of outputs; higher values are more random
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=rep,  # without this output begins repeating
    top_k=top_k,
    top_p=top_p,
    num_beams=nb,
    # The original used typographic quotes (‘cuda’), which is a SyntaxError.
    device='cuda',
)

query = 'How to craft a diamond pickaxe in Minecraft?'
output = generate_text(query)

phi

推理

def inference(inputs: str = 'How to craft a diamond pickaxe in Minecraft?',
              max_new_tokens: int = 200,
              do_sample: bool = True,
              top_p: float = 0.75,
              temperature: float = 0.01,
              repetition_penalty: float = 1.2,
              eos_token_id: int = 50256,  # eos id differs from the LLaMA variant (2)
              bos_token_id: int = 1,
              pad_token_id: int = 0,
              ) -> str:
    """Generate text for ``inputs`` and strip ``<|endoftext|>`` markers.

    Uses the module-level ``model`` and ``tokenizer``.

    NOTE(review): ``bos_token_id`` and ``pad_token_id`` keep the values used by
    the LLaMA variant of this function -- confirm they are correct for the phi
    tokenizer.

    Args:
        inputs: Prompt text to tokenize and feed to the model.
        max_new_tokens: Forwarded to ``model.generate``.
        do_sample: Forwarded to ``model.generate``.
        top_p: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
        repetition_penalty: Forwarded to ``model.generate``.
        eos_token_id: Forwarded to ``model.generate``.
        bos_token_id: Forwarded to ``model.generate``.
        pad_token_id: Forwarded to ``model.generate``.

    Returns:
        The first decoded sequence with every ``<|endoftext|>`` occurrence
        removed.
    """
    # The original referenced an undefined `device`; send the prompt to the
    # device the model itself reports instead.
    input_ids = tokenizer(inputs, return_tensors="pt").input_ids.to(model.device)
    generation_output = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=do_sample,
        repetition_penalty=repetition_penalty,
        eos_token_id=eos_token_id,
        bos_token_id=bos_token_id,
        pad_token_id=pad_token_id,
    )

    # Only one prompt is submitted, so only the first decoded sequence is used.
    output = tokenizer.batch_decode(generation_output)[0]
    return output.replace("<|endoftext|>", "")