流式输出
流式输出通常指的是以连续流的形式处理数据,而不是一次性处理整个数据集。这种处理方式在数据量很大时非常有用,可以有效地减少内存使用。
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread
# Device to run on; assumes a CUDA GPU is available — TODO confirm / fall back to "cpu".
device = "cuda"
# Local filesystem path of the pretrained Qwen2 0.5B instruct checkpoint.
model_name = "/gemini/pretrain/Qwen2-0.5B-Instruct"
# Load the causal LM and move it onto the chosen device.
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Iterator-style streamer consumed by stream_generate() below:
# skip_prompt drops the echoed input prompt; skip_special_tokens drops
# markers such as end-of-turn tokens from the streamed text.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
# Shared conversation history, mutated in place by stream_generate().
messages = [
{"role": "system", "content": "You are a helpful assistant."},
]
def stream_generate(prompt, model, tokenizer, device):
    """Generate a streamed reply to *prompt* and print it token by token.

    Appends the user prompt — and afterwards the generated reply — to the
    module-level ``messages`` history, so repeated calls carry context.

    Parameters:
    - prompt: the user's input text.
    - model: pretrained causal LM used for generation.
    - tokenizer: tokenizer matching the model.
    - device: device the input tensors are moved to, e.g. "cuda" or "cpu".

    Returns None; the reply is printed to stdout as it is generated.
    """
    # Record the user turn in the shared conversation history.
    messages.append({"role": "user", "content": prompt})
    # Render the full chat history into the model's prompt template.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(device)
    generate_params = dict(
        inputs=model_inputs.input_ids,
        max_new_tokens=512,
        do_sample=True,
        top_k=50,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer
    )
    # Run generation on a background thread so this thread can consume
    # the (blocking) streamer iterator concurrently.
    thread = Thread(target=model.generate, kwargs=generate_params)
    thread.start()
    generated_text = ""
    for new_text in streamer:
        generated_text += new_text
        print(new_text, end='', flush=True)
    print()
    # Bug fix: store the reply under the "assistant" role — it was
    # previously appended as a "user" turn, corrupting the chat template
    # on every subsequent call.
    messages.append({"role": "assistant", "content": generated_text})
# Multi-turn chat loop: keep prompting until the user types "exit".
while (user_input := input("User: ")).lower() != 'exit':
    # Stream the assistant's reply as it is generated.
    print("Assistant: ", end="")
    stream_generate(user_input, model, tokenizer, device)
print("Exiting...")
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread
# Device selection; assumes CUDA is available — TODO confirm / fall back to "cpu".
device = "cuda"
# Local path of the pretrained model checkpoint.
model_name = "/gemini/pretrain/Qwen2-0.5B-Instruct"
# Load the causal LM and its tokenizer.
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fix: without skip_prompt=True the streamer re-emits the entire rendered
# prompt before the reply, so the echoed prompt (and special tokens such
# as end-of-turn markers) would be printed AND stored in the history.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
# Conversation history; the system message sets the assistant's behavior.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    # {"role": "user", "content": prompt}  # user turns are appended dynamically
]
def stream_generate(prompt, model, tokenizer, device):
    """
    Stream-generate a reply to *prompt*, printing it as it arrives.

    Parameters:
    - prompt: the user's input text.
    - model: pretrained causal LM used for generation.
    - tokenizer: tokenizer used for text processing.
    - device: device the model runs on, e.g. "cuda" or "cpu".

    Returns nothing; the generated reply is printed to stdout and
    appended to the module-level conversation history.
    """
    # Add the user's turn to the shared conversation history.
    messages.append({"role": "user", "content": prompt})
    # Render the history through the chat template, then tokenize.
    chat_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    encoded = tokenizer([chat_text], return_tensors="pt").to(device)
    # Generation settings; the encoded batch (input_ids, attention_mask)
    # is spread into the kwargs dict alongside the sampling parameters.
    gen_kwargs = dict(
        encoded,
        max_new_tokens=512,
        do_sample=True,
        top_k=50,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )
    # Generate on a worker thread so this thread can drain the streamer.
    worker = Thread(target=model.generate, kwargs=gen_kwargs)
    worker.start()
    # Print each decoded chunk the moment it is produced.
    reply = ""
    for chunk in streamer:
        reply += chunk
        print(chunk, end='', flush=True)
    print()
    # Record the assistant's reply in the conversation history.
    messages.append({"role": "assistant", "content": reply})
# Multi-turn conversation loop; type "exit" to quit.
while (user_input := input("User: ")).lower() != 'exit':
    # Generate and stream the reply for this turn.
    print("Assistant: ", end="")
    stream_generate(user_input, model, tokenizer, device)
print("Exiting...")