cpu版本 。
from transformers import AutoTokenizer, AutoModel
from huggingface_hub.hf_api import HfFolder
HfFolder.save_token('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')
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "croissantllm/CroissantLLMChat-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
chat = [
{"role": "user", "content": "Que puis-je faire à Marseille en hiver?"},
]
chat_input = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(chat_input, return_tensors="pt", add_special_tokens=True).to(model.device)
tokens = model.generate(**inputs, max_new_tokens=150, do_sample=True, top_p=0.95, top_k=60, temperature=0.3)
print(tokenizer.decode(tokens[0]))